|
|
|
|
@ -107,8 +107,10 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
|
|
|
|
|
return _extract_text_from_plain_text(file_content)
|
|
|
|
|
case "application/pdf":
|
|
|
|
|
return _extract_text_from_pdf(file_content)
|
|
|
|
|
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/msword":
|
|
|
|
|
case "application/msword":
|
|
|
|
|
return _extract_text_from_doc(file_content)
|
|
|
|
|
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
|
|
|
|
return _extract_text_from_docx(file_content)
|
|
|
|
|
case "text/csv":
|
|
|
|
|
return _extract_text_from_csv(file_content)
|
|
|
|
|
case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
|
|
|
|
|
@ -142,8 +144,10 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
|
|
|
|
|
return _extract_text_from_yaml(file_content)
|
|
|
|
|
case ".pdf":
|
|
|
|
|
return _extract_text_from_pdf(file_content)
|
|
|
|
|
case ".doc" | ".docx":
|
|
|
|
|
case ".doc":
|
|
|
|
|
return _extract_text_from_doc(file_content)
|
|
|
|
|
case ".docx":
|
|
|
|
|
return _extract_text_from_docx(file_content)
|
|
|
|
|
case ".csv":
|
|
|
|
|
return _extract_text_from_csv(file_content)
|
|
|
|
|
case ".xls" | ".xlsx":
|
|
|
|
|
@ -203,7 +207,33 @@ def _extract_text_from_pdf(file_content: bytes) -> str:
|
|
|
|
|
|
|
|
|
|
def _extract_text_from_doc(file_content: bytes) -> str:
|
|
|
|
|
"""
|
|
|
|
|
Extract text from a DOC/DOCX file.
|
|
|
|
|
Extract text from a DOC file.
|
|
|
|
|
"""
|
|
|
|
|
from unstructured.partition.api import partition_via_api
|
|
|
|
|
|
|
|
|
|
if not (dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY):
|
|
|
|
|
raise TextExtractionError("UNSTRUCTURED_API_URL and UNSTRUCTURED_API_KEY must be set")
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
|
|
|
|
|
temp_file.write(file_content)
|
|
|
|
|
temp_file.flush()
|
|
|
|
|
with open(temp_file.name, "rb") as file:
|
|
|
|
|
elements = partition_via_api(
|
|
|
|
|
file=file,
|
|
|
|
|
metadata_filename=temp_file.name,
|
|
|
|
|
api_url=dify_config.UNSTRUCTURED_API_URL,
|
|
|
|
|
api_key=dify_config.UNSTRUCTURED_API_KEY,
|
|
|
|
|
)
|
|
|
|
|
os.unlink(temp_file.name)
|
|
|
|
|
return "\n".join([getattr(element, "text", "") for element in elements])
|
|
|
|
|
except Exception as e:
|
|
|
|
|
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_text_from_docx(file_content: bytes) -> str:
|
|
|
|
|
"""
|
|
|
|
|
Extract text from a DOCX file.
|
|
|
|
|
For now support only paragraph and table add more if needed
|
|
|
|
|
"""
|
|
|
|
|
try:
|
|
|
|
|
@ -255,13 +285,13 @@ def _extract_text_from_doc(file_content: bytes) -> str:
|
|
|
|
|
|
|
|
|
|
text.append(markdown_table)
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
|
|
|
|
|
logger.warning(f"Failed to extract table from DOC: {e}")
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
return "\n".join(text)
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e
|
|
|
|
|
raise TextExtractionError(f"Failed to extract text from DOCX: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _download_file_content(file: File) -> bytes:
|
|
|
|
|
@ -329,14 +359,29 @@ def _extract_text_from_excel(file_content: bytes) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_text_from_ppt(file_content: bytes) -> str:
|
|
|
|
|
from unstructured.partition.api import partition_via_api
|
|
|
|
|
from unstructured.partition.ppt import partition_ppt
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
with io.BytesIO(file_content) as file:
|
|
|
|
|
elements = partition_ppt(file=file)
|
|
|
|
|
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
|
|
|
|
|
with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file:
|
|
|
|
|
temp_file.write(file_content)
|
|
|
|
|
temp_file.flush()
|
|
|
|
|
with open(temp_file.name, "rb") as file:
|
|
|
|
|
elements = partition_via_api(
|
|
|
|
|
file=file,
|
|
|
|
|
metadata_filename=temp_file.name,
|
|
|
|
|
api_url=dify_config.UNSTRUCTURED_API_URL,
|
|
|
|
|
api_key=dify_config.UNSTRUCTURED_API_KEY,
|
|
|
|
|
)
|
|
|
|
|
os.unlink(temp_file.name)
|
|
|
|
|
else:
|
|
|
|
|
with io.BytesIO(file_content) as file:
|
|
|
|
|
elements = partition_ppt(file=file)
|
|
|
|
|
return "\n".join([getattr(element, "text", "") for element in elements])
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
raise TextExtractionError(f"Failed to extract text from PPT: {str(e)}") from e
|
|
|
|
|
raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_text_from_pptx(file_content: bytes) -> str:
|
|
|
|
|
|