From 1590697a583f27489aed4224b52fffd5b0c87526 Mon Sep 17 00:00:00 2001 From: "liuchangsheng@wisdomidata.com" Date: Wed, 18 Jun 2025 19:19:10 +0800 Subject: [PATCH] =?UTF-8?q?=20=E3=80=90Dify=E3=80=91=20=E5=85=A8=E6=96=87?= =?UTF-8?q?=E6=A3=80=E7=B4=A2=E7=9A=84=E5=88=86=E7=89=87-=E9=80=9A?= =?UTF-8?q?=E8=BF=87=E7=9F=A5=E8=AF=86=E5=BA=93=E7=9A=84=E5=90=8D=E7=A7=B0?= =?UTF-8?q?=E5=81=9A=E5=88=A4=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/core/indexing_runner.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 6b7237030f..d44f656062 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -69,9 +69,11 @@ class IndexingRunner: # extract text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict()) # 对读取的结果做处理 - text_docs = self.handle_text_docs(dataset_document,text_docs) - - print("text_docs len:",len(text_docs)) + text_docs = self.handle_text_docs( + dataset=dataset, + dataset_document=dataset_document, + text_docs=text_docs + ) # transform documents = self._transform( index_processor, dataset, text_docs, dataset_document.doc_language, processing_rule.to_dict() @@ -102,9 +104,9 @@ class IndexingRunner: dataset_document.stopped_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) db.session.commit() - def handle_text_docs(self, dataset_document: DatasetDocument,text_docs: list[Document]): + def handle_text_docs(self, dataset : Dataset, dataset_document: DatasetDocument,text_docs: list[Document]): # if dataset_document.doc_metadata and dataset_document.doc_metadata["file_id"] is not None: - if dataset_document.doc_metadata is None: + if dataset.name == "FULL_TEXT_SEARCH_KNOWLEDGE": # 全文检索处理文档页面是,需要完全按照统一token处理,所以要保证所有页面合并到一起处理 # 添加标题到文本第一行 contents:list[str] = [f"【{dataset_document.name}】-"]