【Dify】 查询改为关键词匹配查询 - 完善

pull/22121/head
liuchangsheng@wisdomidata.com 11 months ago
parent f7bec15478
commit 6db2d12965

@ -45,7 +45,6 @@ def get_text_max_score(search_texts: list[str],search_index: int, pos_map,root_l
def get_text_index_score(text_indexs: list[TextIndex],search_texts: list[str]):
deduct_points = 0
search_text_count = len("".join(search_texts))
text_count = 0
for idx,text_index in enumerate(text_indexs):
text_count += len(text_index.text_text)
@ -60,6 +59,7 @@ def get_text_index_score(text_indexs: list[TextIndex],search_texts: list[str]):
deduct_points += t_score
if deduct_points > 50:
return 0
search_text_count = len("".join(search_texts))
deduct_points += (search_text_count - text_count) * 3
return 100 - deduct_points
@ -75,11 +75,11 @@ def get_full_search_text_max_score(search_texts: list[str], target_text: str) ->
# groups:list[list[TextIndex]] = []
max_score = -100000
max_index_list:list[TextIndex]
max_index_list:list[TextIndex]=[]
for text_index_s in itertools.product(*text_index_groups):
text_index_list:list[TextIndex] = list(text_index_s)
score_ = get_text_index_score(text_indexs=text_index_list,search_texts=search_texts)
if score_ < 50:
if score_ < 80:
continue
if score_ > max_score:
max_score = score_

@ -234,6 +234,7 @@ class DocumentExtService:
# 精准查询的向量片段
# fetch_segments = DocumentExtService.get_full_search_segments(dataset_ids=dataset_ids,query_text=query_text)
keywords = DocumentExtService.get_keywords(query_text=query_text)
print(keywords.__dict__)
segments_rows, document_rows = DocumentExtService.get_keyword_search_segments(
dataset_ids=dataset_ids,
keywords=keywords,
@ -312,22 +313,26 @@ class DocumentExtService:
# Extract keywords. jieba's extract_tags defaults are topK=20, withWeight=False; this call asks for up to 200 tags without weights.
main_keywords_texts__ = jieba.analyse.extract_tags(query_text, topK=200, withWeight=False)
main_keywords_texts = []
for text in keyword_texts:
if text in main_keywords_texts__:
main_keywords_texts.append(text)
keyword_len = len(main_keywords_texts)
keyword_len = len(main_keywords_texts__)
main_keywords_len = 0
# import pdb; pdb.set_trace()
# 提取80%
if keyword_len > 2:
main_keywords_len = int(keyword_len * 0.8)
else:
main_keywords_len = keyword_len
main_keywords_len = len(main_keywords_texts) if main_keywords_len > len(main_keywords_texts) else main_keywords_len
main_keywords_len = keyword_len if main_keywords_len > keyword_len else main_keywords_len
# 得出最关键的分词
search_keywords_texts = main_keywords_texts[:main_keywords_len + 1]
search_keywords_texts__ = main_keywords_texts__[:main_keywords_len]
main_keywords_texts = []
search_keywords_texts = []
for text in keyword_texts:
if text in main_keywords_texts__:
main_keywords_texts.append(text)
if text in search_keywords_texts__:
search_keywords_texts.append(text)
search_sql = ' & '.join(search_keywords_texts)
# 按照最关键的分词查询
@ -382,6 +387,7 @@ class DocumentExtService:
segment_datas = []
for document in document_rows:
score, s_list = get_full_search_text_max_score(search_texts=keywords.main_texts,target_text=document.document_name)
if score > 80:
segment_data = {
"document_id" : str(document.document_id),
"title": document.document_name,
@ -394,6 +400,7 @@ class DocumentExtService:
for segment in segments_rows:
score,s_list = get_full_search_text_max_score(search_texts=keywords.main_texts,target_text=segment.segment_content)
if score > 80:
segment_data = {
"document_id" : str(segment.document_id),
"title": segment.document_name,

Loading…
Cancel
Save