From eb7ecd96e4280fb6ccf8347c6be7b7727013b392 Mon Sep 17 00:00:00 2001 From: "liuchangsheng@wisdomidata.com" Date: Thu, 19 Jun 2025 15:34:59 +0800 Subject: [PATCH] =?UTF-8?q?c=E3=80=90Dify=E3=80=91=20=E5=85=A8=E6=96=87?= =?UTF-8?q?=E6=A3=80=E7=B4=A2=E7=9A=84=E5=88=86=E7=89=87-=E9=80=9A?= =?UTF-8?q?=E8=BF=87=E7=9F=A5=E8=AF=86=E5=BA=93=E7=9A=84=E5=90=8D=E7=A7=B0?= =?UTF-8?q?=E5=81=9A=E5=88=A4=E6=96=AD-=20=E6=A0=B9=E6=8D=AE=E6=9F=A5?= =?UTF-8?q?=E8=AF=A2=E7=9A=84=E9=95=BF=E7=9F=AD=EF=BC=8C=E9=80=82=E9=87=8F?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=88=86=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/configs/ext/full_text_dataset_config.yml | 6 +- .../knowledge_retrieval_node.py | 6 +- api/extensions/utils/search_tool.py | 67 +++++++++++++++---- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/api/configs/ext/full_text_dataset_config.yml b/api/configs/ext/full_text_dataset_config.yml index 6598dd700c..f5f90736ef 100644 --- a/api/configs/ext/full_text_dataset_config.yml +++ b/api/configs/ext/full_text_dataset_config.yml @@ -15,7 +15,7 @@ process_rule: enabled: false segmentation: separator: "&&&&&" - max_tokens: 500 + max_tokens: 512 chunk_overlap: 50 mode: custom doc_form: text_model @@ -30,9 +30,9 @@ retrieval_model: weights: weight_type: customized keyword_setting: - keyword_weight: 0.7 + keyword_weight: 0.3 vector_setting: - vector_weight: 0.3 + vector_weight: 0.7 embedding_model_name: text-embedding-v3 embedding_provider_name: langgenius/tongyi/tongyi top_k: 10 diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index 8b706f54c5..3cafe924e7 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -44,7 +44,7 @@ from extensions.ext_redis import redis_client from libs.json_in_md_parser import parse_and_check_json_markdown from models.dataset import Dataset, DatasetMetadata, Document, RateLimitLog from services.feature_service import FeatureService - +from extensions.utils.search_tool import set_full_search_score from .entities import KnowledgeRetrievalNodeData, ModelConfig from .exc import ( InvalidModelTypeError, @@ -115,6 +115,10 @@ class KnowledgeRetrievalNode(LLMNode): # retrieve knowledge try: results = self._fetch_dataset_retriever(node_data=node_data, query=query) + + # 扩展处理分值 + set_full_search_score(query=query,doc_list=results) + outputs = {"result": results} return NodeRunResult( status=WorkflowNodeExecutionStatus.SUCCEEDED, inputs=variables, process_data=None, outputs=outputs diff --git a/api/extensions/utils/search_tool.py b/api/extensions/utils/search_tool.py index 3bb2c502c6..4957ff7856 100644 --- a/api/extensions/utils/search_tool.py +++ b/api/extensions/utils/search_tool.py @@ -5,6 +5,8 @@ import re import jieba import jieba.analyse import json +from typing import Any, Optional, cast +import math class Keywords: def __init__(self, texts, main_texts, search_texts, search_sql): @@ -122,20 +124,23 @@ def get_full_search_text_max_score(search_texts: list[str], target_text: str) -> # print("".join(texts)) return (max_score,max_index_list) +def get_main_keywords_texts(query_text: str) -> list[str]: + # 判断关键词的长度 + jieba.analyse.set_stop_words("extensions/utils/stopwords.txt") + # jieba.analyse.set_idf_path("extensions/utils/idfwords.txt") + # 提取关键词,默认 topK=30,withWeight=True + main_keywords_texts__ = jieba.analyse.extract_tags(query_text, topK=200, withWeight=False) + + return main_keywords_texts__ + def get_keywords(query_text: str) -> Keywords: # 分词器分词关键词 keyword_texts = list(jieba.cut(query_text)) keyword_texts_for_search = list(jieba.cut_for_search(query_text)) - print("keyword_texts:",keyword_texts) print("keyword_texts_for_search:",keyword_texts_for_search) - # import pdb; pdb.set_trace() - # 判断关键词的长度 - jieba.analyse.set_stop_words("extensions/utils/stopwords.txt") - # def get_text(): - # return text - # 提取关键词,默认 topK=30,withWeight=True - main_keywords_texts__ = jieba.analyse.extract_tags(query_text, topK=200, withWeight=False) + main_keywords_texts__ = get_main_keywords_texts(query_text=query_text) + print("main_keywords_texts__:",main_keywords_texts__) keyword_len = len(main_keywords_texts__) main_keywords_len = 0 # import pdb; pdb.set_trace() @@ -171,11 +176,14 @@ def get_keywords(query_text: str) -> Keywords: def get_search_keywords_texts_sql(search_keywords_texts:list[str]): texts = [] + query_sql_list = [] for text in search_keywords_texts: # 将元素才拆成可查询用的分词 texts_for_search:list[str] = list(jieba.cut_for_search(text)) + query_sql_list.append(" | ".join(texts_for_search)) min_texts:list[str] = get_min_search_keywords_texts(texts=texts_for_search) texts.extend(min_texts) + query_sql = " & ".join(query_sql_list) # import pdb; pdb.set_trace() texts_len = len(texts) sql = "" @@ -196,7 +204,8 @@ def get_search_keywords_texts_sql(search_keywords_texts:list[str]): sql_texts.append(f"({text} | {text}{texts[idx + 1]} | {texts[idx-1]}{text} & ({texts[idx + 1]} | {texts[idx + 1]}{texts[idx + 2]}))") sql = " & ".join(sql_texts) print(sql) - return sql + + return f"{sql} | {query_sql}" def get_min_search_keywords_texts(texts:list[str]): # import pdb; pdb.set_trace() @@ -210,7 +219,41 @@ def get_min_search_keywords_texts(texts:list[str]): min_texts.append(text) return min_texts +# 扩展处理分值(全文检索的方法需要处理分值) +def set_full_search_score(query:str,doc_list:list[dict[str, Any]]): + import pdb; pdb.set_trace() + # 根据查询条件的长短 + main_keywords_texts = get_main_keywords_texts(query_text=query) + + all_texts = [] + for main_keywords_text in main_keywords_texts: + keyword_texts_for_search = list(jieba.cut_for_search(main_keywords_text)) + all_texts.extend(keyword_texts_for_search) + + sum_lens = len(all_texts) + sum_lens = 2 if sum_lens == 1 else sum_lens + plus_score = score(sum_lens) + print("plus_score",plus_score) + if doc_list: + for doc in doc_list: + metadata = doc["metadata"] + if metadata: + dataset_name = metadata["dataset_name"] + doc_score = metadata["score"] + if dataset_name == "FULL_TEXT_SEARCH_KNOWLEDGE" and doc_score: + doc_score += plus_score + doc["metadata"]["score"] = doc_score + print("new score:",doc["metadata"]["score"]) + for doc in doc_list: + if doc["metadata"] and doc["metadata"]["score"]: + print("new score:",doc["metadata"]["score"]) + +def score(value): + return round(20 * math.exp(-0.4 * value), 2) / 100 + if __name__ == "__main__": - search_texts=["湖人","阵容"] - score, max_index_list =get_full_search_text_max_score(search_texts=search_texts, source="所以,**严格讲,詹姆斯在湖人确实拥有超级巨星(戴维斯),但不像热火三巨头那样多核并立。**更多时候,他还是湖人阵容的绝对核心和领袖。") - print(score, len(max_index_list)) + print(score(1)) + # get_keywords("分类码") + # search_texts=["湖人","阵容"] + # score, max_index_list =get_full_search_text_max_score(search_texts=search_texts, source="所以,**严格讲,詹姆斯在湖人确实拥有超级巨星(戴维斯),但不像热火三巨头那样多核并立。**更多时候,他还是湖人阵容的绝对核心和领袖。") + # print(score, len(max_index_list))