feat/datasource
jyong 11 months ago
parent 7b7f8ef51d
commit 4130c50643

@ -28,9 +28,11 @@ class Jieba(BaseKeyword):
with redis_client.lock(lock_name, timeout=600): with redis_client.lock(lock_name, timeout=600):
keyword_table_handler = JiebaKeywordTableHandler() keyword_table_handler = JiebaKeywordTableHandler()
keyword_table = self._get_dataset_keyword_table() keyword_table = self._get_dataset_keyword_table()
keyword_number = self.dataset.keyword_number if self.dataset.keyword_number else self._config.max_keywords_per_chunk
for text in texts: for text in texts:
keywords = keyword_table_handler.extract_keywords( keywords = keyword_table_handler.extract_keywords(
text.page_content, self._config.max_keywords_per_chunk text.page_content, keyword_number
) )
if text.metadata is not None: if text.metadata is not None:
self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords)) self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
@ -49,17 +51,18 @@ class Jieba(BaseKeyword):
keyword_table = self._get_dataset_keyword_table() keyword_table = self._get_dataset_keyword_table()
keywords_list = kwargs.get("keywords_list") keywords_list = kwargs.get("keywords_list")
keyword_number = self.dataset.keyword_number if self.dataset.keyword_number else self._config.max_keywords_per_chunk
for i in range(len(texts)): for i in range(len(texts)):
text = texts[i] text = texts[i]
if keywords_list: if keywords_list:
keywords = keywords_list[i] keywords = keywords_list[i]
if not keywords: if not keywords:
keywords = keyword_table_handler.extract_keywords( keywords = keyword_table_handler.extract_keywords(
text.page_content, self._config.max_keywords_per_chunk text.page_content, keyword_number
) )
else: else:
keywords = keyword_table_handler.extract_keywords( keywords = keyword_table_handler.extract_keywords(
text.page_content, self._config.max_keywords_per_chunk text.page_content, keyword_number
) )
if text.metadata is not None: if text.metadata is not None:
self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords)) self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
@ -239,7 +242,9 @@ class Jieba(BaseKeyword):
keyword_table or {}, segment.index_node_id, pre_segment_data["keywords"] keyword_table or {}, segment.index_node_id, pre_segment_data["keywords"]
) )
else: else:
keywords = keyword_table_handler.extract_keywords(segment.content, self._config.max_keywords_per_chunk) keyword_number = self.dataset.keyword_number if self.dataset.keyword_number else self._config.max_keywords_per_chunk
keywords = keyword_table_handler.extract_keywords(segment.content, keyword_number)
segment.keywords = list(keywords) segment.keywords = list(keywords)
keyword_table = self._add_text_to_keyword_table( keyword_table = self._add_text_to_keyword_table(
keyword_table or {}, segment.index_node_id, list(keywords) keyword_table or {}, segment.index_node_id, list(keywords)

Loading…
Cancel
Save