diff --git a/api/configs/ext/full_text_dataset_config.yml b/api/configs/ext/full_text_dataset_config.yml index 6598dd700c..f5f90736ef 100644 --- a/api/configs/ext/full_text_dataset_config.yml +++ b/api/configs/ext/full_text_dataset_config.yml @@ -15,7 +15,7 @@ process_rule: enabled: false segmentation: separator: "&&&&&" - max_tokens: 500 + max_tokens: 512 chunk_overlap: 50 mode: custom doc_form: text_model @@ -30,9 +30,9 @@ retrieval_model: weights: weight_type: customized keyword_setting: - keyword_weight: 0.7 + keyword_weight: 0.3 vector_setting: - vector_weight: 0.3 + vector_weight: 0.7 embedding_model_name: text-embedding-v3 embedding_provider_name: langgenius/tongyi/tongyi top_k: 10 diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index 8b706f54c5..3cafe924e7 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -44,7 +44,7 @@ from extensions.ext_redis import redis_client from libs.json_in_md_parser import parse_and_check_json_markdown from models.dataset import Dataset, DatasetMetadata, Document, RateLimitLog from services.feature_service import FeatureService - +from extensions.utils.search_tool import set_full_search_score from .entities import KnowledgeRetrievalNodeData, ModelConfig from .exc import ( InvalidModelTypeError, @@ -115,6 +115,10 @@ class KnowledgeRetrievalNode(LLMNode): # retrieve knowledge try: results = self._fetch_dataset_retriever(node_data=node_data, query=query) + + # 扩展处理分值 + set_full_search_score(query=query,doc_list=results) + outputs = {"result": results} return NodeRunResult( status=WorkflowNodeExecutionStatus.SUCCEEDED, inputs=variables, process_data=None, outputs=outputs diff --git a/api/extensions/utils/search_tool.py b/api/extensions/utils/search_tool.py index 3bb2c502c6..4957ff7856 100644 --- 
def get_main_keywords_texts(query_text: str) -> list[str]:
    """Extract the main keywords of ``query_text`` with jieba's tag extractor.

    The stop-word list ``extensions/utils/stopwords.txt`` is registered with
    ``jieba.analyse`` on every call, before extraction runs.
    NOTE(review): the path is relative, so it presumably resolves against the
    process working directory -- confirm against the deployment layout.

    Args:
        query_text: Raw query string to analyse.

    Returns:
        The tags produced by ``extract_tags`` with ``topK=200`` and
        ``withWeight=False`` (i.e. at most 200 keywords, no weights).
    """
    jieba.analyse.set_stop_words("extensions/utils/stopwords.txt")
    return jieba.analyse.extract_tags(query_text, topK=200, withWeight=False)
# Dataset name that marks a retrieved document as coming from full-text search.
_FULL_TEXT_DATASET_NAME = "FULL_TEXT_SEARCH_KNOWLEDGE"


def set_full_search_score(query: str, doc_list: list[dict[str, Any]]) -> None:
    """Add a query-length-dependent bonus to full-text search result scores.

    Only documents whose ``metadata["dataset_name"]`` equals
    ``"FULL_TEXT_SEARCH_KNOWLEDGE"`` and whose ``metadata["score"]`` is truthy
    are touched; their score is increased in place by ``score(term_count)``.
    ``term_count`` is the total number of search tokens obtained by running
    ``jieba.cut_for_search`` over each main keyword of ``query``; a count of
    exactly 1 is treated as 2.

    Documents without metadata, or whose metadata lacks ``dataset_name`` or
    ``score``, are skipped instead of raising ``KeyError``.

    Args:
        query: The user query the documents were retrieved for.
        doc_list: Retrieved documents; each may carry a ``"metadata"`` dict.
            ``None`` or an empty list is accepted and is a no-op.

    Returns:
        None. ``doc_list`` entries are mutated in place.
    """
    # Function-scope import keeps this block self-contained; the module's
    # import section is not modified by this change.
    import logging

    logger = logging.getLogger(__name__)

    # Collect the metadata dicts that qualify for the bonus first, so the
    # query is only tokenised when at least one score will actually change.
    eligible: list[dict[str, Any]] = []
    for doc in doc_list or []:
        metadata = doc.get("metadata")
        if not metadata:
            continue
        is_full_text = metadata.get("dataset_name") == _FULL_TEXT_DATASET_NAME
        if is_full_text and metadata.get("score"):
            eligible.append(metadata)
    if not eligible:
        return

    term_count = sum(
        len(list(jieba.cut_for_search(keyword)))
        for keyword in get_main_keywords_texts(query_text=query)
    )
    # A single-term query gets the same bonus as a two-term one.
    if term_count == 1:
        term_count = 2
    plus_score = score(term_count)
    logger.debug("plus_score %s", plus_score)

    for metadata in eligible:
        metadata["score"] += plus_score
        logger.debug("new score: %s", metadata["score"])


def score(value: float) -> float:
    """Return the score bonus for ``value`` search terms.

    Computes ``round(20 * e ** (-0.4 * value), 2) / 100``, so the bonus
    shrinks as ``value`` grows (``score(0) == 0.2``).
    """
    return round(20 * math.exp(-0.4 * value), 2) / 100


if __name__ == "__main__":
    print(score(1))