|
|
|
|
@ -18,6 +18,7 @@ from core.rag.extractor.markdown_extractor import MarkdownExtractor
|
|
|
|
|
from core.rag.extractor.notion_extractor import NotionExtractor
|
|
|
|
|
from core.rag.extractor.pdf_extractor import PdfExtractor
|
|
|
|
|
from core.rag.extractor.text_extractor import TextExtractor
|
|
|
|
|
from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
|
|
|
|
|
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
|
|
|
|
|
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
|
|
|
|
|
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
|
|
|
|
|
@ -104,7 +105,7 @@ class ExtractProcessor:
|
|
|
|
|
etl_type = dify_config.ETL_TYPE
|
|
|
|
|
extractor: Optional[BaseExtractor] = None
|
|
|
|
|
if etl_type == "Unstructured":
|
|
|
|
|
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
|
|
|
|
|
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL or ""
|
|
|
|
|
unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
|
|
|
|
|
|
|
|
|
|
if file_extension in {".xlsx", ".xls"}:
|
|
|
|
|
@ -121,6 +122,8 @@ class ExtractProcessor:
|
|
|
|
|
extractor = HtmlExtractor(file_path)
|
|
|
|
|
elif file_extension == ".docx":
|
|
|
|
|
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
|
|
|
|
|
elif file_extension == ".doc":
|
|
|
|
|
extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
|
|
|
|
elif file_extension == ".csv":
|
|
|
|
|
extractor = CSVExtractor(file_path, autodetect_encoding=True)
|
|
|
|
|
elif file_extension == ".msg":
|
|
|
|
|
|