From 6db2d12965cd55e530a9a0675e16a9507e47ad66 Mon Sep 17 00:00:00 2001 From: "liuchangsheng@wisdomidata.com" Date: Mon, 16 Jun 2025 11:35:10 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90Dify=E3=80=91=20=E6=9F=A5=E8=AF=A2?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=E5=85=B3=E9=94=AE=E8=AF=8D=E5=8C=B9=E9=85=8D?= =?UTF-8?q?=E6=9F=A5=E8=AF=A2=20-=20=E5=AE=8C=E5=96=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/extensions/utils/search_tool.py | 6 +-- api/services/ext/dataset_ext_service.py | 59 ++++++++++++++----------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/api/extensions/utils/search_tool.py b/api/extensions/utils/search_tool.py index 0ed73b67c0..a1ef7f404d 100644 --- a/api/extensions/utils/search_tool.py +++ b/api/extensions/utils/search_tool.py @@ -45,7 +45,6 @@ def get_text_max_score(search_texts: list[str],search_index: int, pos_map,root_l def get_text_index_score(text_indexs: list[TextIndex],search_texts: list[str]): deduct_points = 0 - search_text_count = len("".join(search_texts)) text_count = 0 for idx,text_index in enumerate(text_indexs): text_count += len(text_index.text_text) @@ -60,6 +59,7 @@ def get_text_index_score(text_indexs: list[TextIndex],search_texts: list[str]): deduct_points += t_score if deduct_points > 50: return 0 + search_text_count = len("".join(search_texts)) deduct_points += (search_text_count - text_count) * 3 return 100 - deduct_points @@ -75,11 +75,11 @@ def get_full_search_text_max_score(search_texts: list[str], target_text: str) -> # groups:list[list[TextIndex]] = [] max_score = -100000 - max_index_list:list[TextIndex] + max_index_list:list[TextIndex]=[] for text_index_s in itertools.product(*text_index_groups): text_index_list:list[TextIndex] = list(text_index_s) score_ = get_text_index_score(text_indexs=text_index_list,search_texts=search_texts) - if score_ < 50: + if score_ < 80: continue if score_ > max_score: max_score = score_ diff --git a/api/services/ext/dataset_ext_service.py b/api/services/ext/dataset_ext_service.py index a322e1345a..9161f2e2e9 100644 --- a/api/services/ext/dataset_ext_service.py +++ b/api/services/ext/dataset_ext_service.py @@ -234,6 +234,7 @@ class DocumentExtService: # 精准查询的向量片段 # fetch_segments = DocumentExtService.get_full_search_segments(dataset_ids=dataset_ids,query_text=query_text) keywords = DocumentExtService.get_keywords(query_text=query_text) + print(keywords.__dict__) segments_rows, document_rows = DocumentExtService.get_keyword_search_segments( dataset_ids=dataset_ids, keywords=keywords, @@ -312,22 +313,26 @@ class DocumentExtService: # 提取关键词,默认 topK=30,withWeight=True main_keywords_texts__ = jieba.analyse.extract_tags(query_text, topK=200, withWeight=False) - main_keywords_texts = [] - for text in keyword_texts: - if text in main_keywords_texts__: - main_keywords_texts.append(text) - - keyword_len = len(main_keywords_texts) + keyword_len = len(main_keywords_texts__) main_keywords_len = 0 + # import pdb; pdb.set_trace() # 提取80% if keyword_len > 2: main_keywords_len = int(keyword_len * 0.8) else: main_keywords_len = keyword_len - main_keywords_len = len(main_keywords_texts) if main_keywords_len > len(main_keywords_texts) else main_keywords_len + main_keywords_len = keyword_len if main_keywords_len > keyword_len else main_keywords_len # 得出最关键的分词 - search_keywords_texts = main_keywords_texts[:main_keywords_len + 1] + search_keywords_texts__ = main_keywords_texts__[:main_keywords_len] + + main_keywords_texts = [] + search_keywords_texts = [] + for text in keyword_texts: + if text in main_keywords_texts__: + main_keywords_texts.append(text) + if text in search_keywords_texts__: + search_keywords_texts.append(text) search_sql = ' & '.join(search_keywords_texts) # 按照最关键的分词查询 @@ -382,27 +387,29 @@ class DocumentExtService: segment_datas = [] for document in document_rows: score, s_list = get_full_search_text_max_score(search_texts=keywords.main_texts,target_text=document.document_name) - segment_data = { - "document_id" : str(document.document_id), - "title": document.document_name, - "content": document.segment_content, - "doc_metadata": document.doc_metadata, - "query": query_text, - "score": score, - } - segment_datas.append(segment_data) + if score > 80: + segment_data = { + "document_id" : str(document.document_id), + "title": document.document_name, + "content": document.segment_content, + "doc_metadata": document.doc_metadata, + "query": query_text, + "score": score, + } + segment_datas.append(segment_data) for segment in segments_rows: score,s_list = get_full_search_text_max_score(search_texts=keywords.main_texts,target_text=segment.segment_content) - segment_data = { - "document_id" : str(segment.document_id), - "title": segment.document_name, - "content": segment.segment_content, - "doc_metadata": segment.doc_metadata, - "query": query_text, - "score": score, - } - segment_datas.append(segment_data) + if score > 80: + segment_data = { + "document_id" : str(segment.document_id), + "title": segment.document_name, + "content": segment.segment_content, + "doc_metadata": segment.doc_metadata, + "query": query_text, + "score": score, + } + segment_datas.append(segment_data) grouped = defaultdict(list)