# Imports used by this section and by the rest of the module.
import difflib
import itertools
import json
import logging
import re
from collections import Counter, defaultdict
from functools import lru_cache
from pathlib import Path

logger = logging.getLogger(__name__)

# The stop-word list lives next to this module.  Resolving it from __file__
# (instead of a path relative to the current working directory) keeps keyword
# extraction working regardless of where the process was started.
_STOPWORDS_PATH = Path(__file__).resolve().with_name("stopwords.txt")

# Characters that to_tsquery() interprets as syntax (operators, grouping,
# weights/prefix markers, quoting) plus whitespace.  They are stripped from
# tokens before the tokens are interpolated into a query string.
_TSQUERY_UNSAFE = re.compile(r"[\s&|!()<>:*'\\]+")

# Queries with at most this many extracted tags keep all of them for searching.
_FULL_KEEP_LIMIT = 4
# Longer queries keep only this leading fraction of the ranked tags.
_KEEP_RATIO = 0.8


class Keywords:
    """Result of analysing a query string for keyword search.

    Attributes are plain values so that ``vars(keywords)`` and
    :meth:`to_dict` expose the same four fields.
    """

    def __init__(
        self,
        texts: list[str],
        main_texts: list[str],
        search_texts: list[str],
        search_sql: str,
    ) -> None:
        self.texts = texts
        self.main_texts = main_texts
        self.search_texts = search_texts
        self.search_sql = search_sql

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(texts={self.texts!r}, "
            f"main_texts={self.main_texts!r}, "
            f"search_texts={self.search_texts!r}, "
            f"search_sql={self.search_sql!r})"
        )

    def to_dict(self) -> dict:
        """Return the four fields as a dictionary."""
        return {
            "texts": self.texts,
            "main_texts": self.main_texts,
            "search_texts": self.search_texts,
            "search_sql": self.search_sql,
        }


@lru_cache(maxsize=1)
def _load_stop_words() -> None:
    """Register the bundled stop-word list with jieba once per process.

    A failed load raises and is not cached, so the next call retries.
    """
    import jieba.analyse

    jieba.analyse.set_stop_words(str(_STOPWORDS_PATH))


def get_keywords(query_text: str) -> Keywords:
    """Tokenise *query_text* and derive the keywords used for searching.

    Args:
        query_text: The raw user query.

    Returns:
        A :class:`Keywords` whose ``texts`` and ``main_texts`` are the query
        tokens that jieba also extracted as tags (in query order), whose
        ``search_texts`` is the subset of those tokens belonging to the
        leading share of the extracted tags, and whose ``search_sql`` is the
        query string built from ``search_texts``.
    """
    # Imported lazily so the pure helpers in this module stay importable
    # even where the tokenizer is not installed.
    import jieba
    import jieba.analyse

    _load_stop_words()

    tokens = list(jieba.cut(query_text))
    # Up to 200 tags without weights; jieba documents the result as ranked
    # by descending weight, which is what makes the prefix cut below useful.
    ranked_tags = jieba.analyse.extract_tags(query_text, topK=200, withWeight=False)
    logger.debug("query tokens: %s", tokens)
    logger.debug("extracted tags: %s", ranked_tags)

    # Short queries search with every tag; longer ones drop the tail.
    keep = len(ranked_tags)
    if keep > _FULL_KEEP_LIMIT:
        keep = int(keep * _KEEP_RATIO)

    main_tags = set(ranked_tags)
    search_tags = set(ranked_tags[:keep])

    # Walk the tokens (not the tags) so results follow the query's word order.
    main_keywords_texts = [token for token in tokens if token in main_tags]
    search_keywords_texts = [token for token in tokens if token in search_tags]

    search_sql = get_search_keywords_texts_sql(
        search_keywords_texts=search_keywords_texts
    )
    return Keywords(
        texts=main_keywords_texts,
        main_texts=main_keywords_texts,
        search_texts=search_keywords_texts,
        search_sql=search_sql,
    )


def build_tsquery(texts: list[str]) -> str:
    """Build a ``to_tsquery`` expression from an ordered list of fragments.

    Each fragment may match on its own or fused with a neighbouring fragment,
    so content that was indexed with a coarser segmentation is still found.
    In tsquery syntax ``&`` binds tighter than ``|``; the generated strings
    rely on that, e.g. ``a & b | ab`` means ``(a & b) | ab``.

    Args:
        texts: Fragments in query order.  Characters that are tsquery syntax
            are removed from each fragment; fragments that end up empty are
            dropped.

    Returns:
        The expression, or ``""`` when no usable fragment remains.
    """
    terms = [clean for clean in (_TSQUERY_UNSAFE.sub("", t) for t in texts) if clean]
    count = len(terms)

    if count == 0:
        return ""
    if count == 1:
        return terms[0]
    if count == 2:
        first, second = terms
        return f"{first} & {second} | {first}{second}"

    clauses: list[str] = []
    last = count - 1
    for idx, term in enumerate(terms):
        if idx == 0:
            clauses.append(f"({term} | {term}{terms[1]})")
        elif idx == last:
            clauses.append(f"({term} | {terms[idx - 1]}{term})")
        elif idx == last - 1:
            nxt = terms[idx + 1]
            clauses.append(
                f"({term} | {term}{nxt} | {terms[idx - 1]}{term} & {nxt})"
            )
        else:
            nxt = terms[idx + 1]
            clauses.append(
                f"({term} | {term}{nxt} | {terms[idx - 1]}{term}"
                f" & ({nxt} | {nxt}{terms[idx + 2]}))"
            )
    return " & ".join(clauses)


def get_search_keywords_texts_sql(search_keywords_texts: list[str]) -> str:
    """Return the ``to_tsquery`` string for the given search keywords.

    Every keyword is split with jieba's search-mode segmentation, reduced to
    its minimal fragments, and the fragments of all keywords are combined by
    :func:`build_tsquery`.
    """
    import jieba

    fragments: list[str] = []
    for keyword in search_keywords_texts:
        pieces = list(jieba.cut_for_search(keyword))
        fragments.extend(get_min_search_keywords_texts(texts=pieces))

    sql = build_tsquery(fragments)
    logger.debug("tsquery: %s", sql)
    return sql


def get_min_search_keywords_texts(texts: list[str]) -> list[str]:
    """Keep only the fragments that contain no other, different fragment.

    For example ``["北京", "大学", "北京大学"]`` becomes ``["北京", "大学"]``
    because the third entry contains the first two.  Order and duplicates of
    the surviving fragments are preserved.  Empty fragments are ignored:
    the empty string is a substring of everything and would otherwise
    eliminate every other fragment.
    """
    return [
        text
        for text in texts
        if text
        and not any(other and other != text and other in text for other in texts)
    ]
source="所以,**严格讲,詹姆斯在湖人确实拥有超级巨星(戴维斯),但不像热火三巨头那样多核并立。**更多时候,他还是湖人阵容的绝对核心和领袖。") diff --git a/api/extensions/utils/stopwords.txt b/api/extensions/utils/stopwords.txt new file mode 100644 index 0000000000..191fb7265f --- /dev/null +++ b/api/extensions/utils/stopwords.txt @@ -0,0 +1,264 @@ +的 +了 +和 +是 +我 +也 +就 +都 +而 +及 +与 +着 +或 +一个 +没有 +我们 +你们 +他们 +她们 +它们 +自己 +这 +那 +这些 +那些 +它 +被 +在 +对于 +因为 +所以 +如果 +然后 +而且 +并且 +并 +但是 +不过 +不是 +而是 +还有 +还 +已 +已经 +正在 +非常 +很 +较 +更 +最 +吧 +啊 +呀 +嘛 +呢 +么 +吗 +哦 +恩 +呃 +咯 +啊呀 +啥 +哈 +啊哈 +啦 +咱 +什么 +多少 +几 +多 +你 +我 +他 +她 +它 +咱们 +此 +其 +某 +某个 +某些 +每 +每个 +各 +个 +等 +等于 +以及 +其中 +从而 +因此 +除此之外 +据此 +比如 +例如 +比如说 +譬如 +比如说的 +说 +要 +来 +去 +把 +被 +给 +使 +令 +让 +令得 +所 +之 +之所以 +以 +以便 +以免 +以至 +以致 +以内 +以来 +之后 +之前 +之后 +期间 +前后 +上下 +以上 +以下 +左右 +当时 +当年 +当下 +眼下 +马上 +立刻 +即将 +刚刚 +刚才 +后来 +曾经 +仍然 +依然 +一直 +一直到 +尚且 +甚至 +最终 +总是 +总共 +其实 +本来 +原来 +明显 +确实 +大概 +大约 +差不多 +也许 +可能 +估计 +基本 +尤其 +尽管 +虽然 +然而 +然而却 +不过 +但是 +还是 +但是呢 +毕竟 +同时 +并不是 +并非 +并无 +未必 +尚未 +不如 +不然 +以外 +之外 +其中 +而后 +而今 +而后 +除此 +除此之外 +否则 +万一 +万万 +若 +若是 +假如 +假设 +要是 +若非 +非得 +无非 +何况 +况且 +再说 +试问 +只要 +除非 +只有 +宁愿 +宁可 +宁肯 +不如 +不妨 +不必 +务必 +务须 +尚需 +无需 +都 +谁 +啥 +哪 +哪儿 +哪里 +怎样 +怎么 +怎么样 +何时 +几时 +多少 +几多 +多么 +啥也 +哪怕 +纵然 +即使 +假使 +便 +则 +即 +乃 +虽 +虽说 +且 +以致于 +为止 +为此 +为的是 +不管 +无论 +任凭 +凡是 +凡 +既然 +既 +既已 +既然如此 +说实话 +说到底 +也就是说 +一句话 +总之 +总的来说 +总而言之 +总的说来 +换句话说 +话说 diff --git a/api/services/ext/dataset_ext_service.py b/api/services/ext/dataset_ext_service.py index 9161f2e2e9..6bd760bede 100644 --- a/api/services/ext/dataset_ext_service.py +++ b/api/services/ext/dataset_ext_service.py @@ -25,23 +25,10 @@ from sqlalchemy.engine import Row import jieba import jieba.analyse import difflib -from extensions.utils.search_tool import get_full_search_text_max_score +from extensions.utils.search_tool import get_full_search_text_max_score,Keywords,get_keywords import json -class Keywords: - def __init__(self, texts, main_texts, search_texts, search_sql): - self.texts = texts - self.main_texts = main_texts - self.search_texts=search_texts - self.search_sql=search_sql - 
- def to_dict(self): - return { - "texts": self.texts, - "main_texts": self.main_texts, - "search_texts": self.search_texts, - "search_sql": self.search_sql, - } + class DatasetExtService: resource_type = "dataset" @@ -233,7 +220,7 @@ class DocumentExtService: dataset_ids = [dataset.id for dataset in datasets] # 精准查询的向量片段 # fetch_segments = DocumentExtService.get_full_search_segments(dataset_ids=dataset_ids,query_text=query_text) - keywords = DocumentExtService.get_keywords(query_text=query_text) + keywords = get_keywords(query_text=query_text) print(keywords.__dict__) segments_rows, document_rows = DocumentExtService.get_keyword_search_segments( dataset_ids=dataset_ids, @@ -303,46 +290,6 @@ class DocumentExtService: fetch_segments.append(segment_list[1]) return fetch_segments - def get_keywords(query_text: str) -> Keywords: - # 分词器分词关键词 - keyword_texts = list(jieba.cut(query_text)) - # 判断关键词的长度 - jieba.analyse.set_stop_words("services/ext/stopwords.txt") - # def get_text(): - # return text - # 提取关键词,默认 topK=30,withWeight=True - main_keywords_texts__ = jieba.analyse.extract_tags(query_text, topK=200, withWeight=False) - - keyword_len = len(main_keywords_texts__) - main_keywords_len = 0 - # import pdb; pdb.set_trace() - # 提取80% - if keyword_len > 2: - main_keywords_len = int(keyword_len * 0.8) - else: - main_keywords_len = keyword_len - - main_keywords_len = keyword_len if main_keywords_len > keyword_len else main_keywords_len - # 得出最关键的分词 - search_keywords_texts__ = main_keywords_texts__[:main_keywords_len] - - main_keywords_texts = [] - search_keywords_texts = [] - for text in keyword_texts: - if text in main_keywords_texts__: - main_keywords_texts.append(text) - if text in search_keywords_texts__: - search_keywords_texts.append(text) - - search_sql = ' & '.join(search_keywords_texts) - # 按照最关键的分词查询 - keywords = Keywords( - texts=main_keywords_texts, - main_texts=main_keywords_texts, - search_texts=search_keywords_texts, - search_sql=search_sql - ) - return 
keywords def get_keyword_search_segments(dataset_ids: list[str], keywords: Keywords @@ -351,10 +298,10 @@ class DocumentExtService: sql = text(f""" SELECT s.id segment_id, s.document_id, s.content segment_content, d.name document_name,d.doc_metadata FROM document_segments s - left join documents d on d.id = s.document_id - WHERE to_tsvector('chinese', s.content) @@ to_tsquery(:keywords) and d.dataset_id::text = ANY(:dataset_ids) + LEFT JOIN documents d ON d.id = s.document_id + WHERE to_tsvector('chinese', s.content) @@ to_tsquery(:keywords) AND d.dataset_id::text = ANY(:dataset_ids) """) - + print(sql,keywords.search_sql,dataset_ids[0]) segments_rows = db.session.execute(sql, {"keywords": keywords.search_sql, "dataset_ids" : dataset_ids}).fetchall() sql = text(""" @@ -383,7 +330,7 @@ class DocumentExtService: query_text : str, segments_rows: list[Row], document_rows: list[Row]) -> list[dict]: - + # import pdb; pdb.set_trace() segment_datas = [] for document in document_rows: score, s_list = get_full_search_text_max_score(search_texts=keywords.main_texts,target_text=document.document_name)