|
|
|
|
@ -1,38 +1,41 @@
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
from typing import Any, Optional
|
|
|
|
|
from collections import defaultdict
|
|
|
|
|
from typing import Any, Optional
|
|
|
|
|
|
|
|
|
|
from core.rag.datasource.keyword.keyword_base import BaseKeyword
|
|
|
|
|
from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
|
|
|
|
|
from core.rag.datasource.keyword.mecab.config import MeCabConfig
|
|
|
|
|
from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
|
|
|
|
|
from core.rag.models.document import Document
|
|
|
|
|
from extensions.ext_database import db
|
|
|
|
|
from extensions.ext_redis import redis_client
|
|
|
|
|
from extensions.ext_storage import storage
|
|
|
|
|
from models.dataset import Dataset, DatasetKeywordTable, DocumentSegment
|
|
|
|
|
|
|
|
|
|
from models.dataset import Dataset, DocumentSegment
|
|
|
|
|
|
|
|
|
|
# Module-level logger namespaced to this module, per standard logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class KeywordProcessorError(Exception):
    """Base error for keyword processing.

    Root of the module's exception hierarchy; callers may catch this to
    handle any keyword-processing failure. The ``pass`` statement after the
    docstring was redundant (a docstring is a sufficient class body) and has
    been removed.
    """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class KeywordExtractionError(KeywordProcessorError):
    """Error during keyword extraction.

    Raised when the MeCab handler fails to extract keywords from text.
    Subclasses the module's base error so callers can catch either the
    specific or the general exception. Redundant ``pass`` removed — the
    docstring alone is a valid class body.
    """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class KeywordStorageError(KeywordProcessorError):
    """Error during storage operations.

    Raised when persisting or loading the keyword table (database or blob
    storage) fails. Subclasses the module's base error so callers can catch
    either the specific or the general exception. Redundant ``pass``
    removed — the docstring alone is a valid class body.
    """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SetEncoder(json.JSONEncoder):
    """JSON encoder that handles sets.

    The keyword table maps keywords to ``set`` objects of segment ids, which
    the stock :class:`json.JSONEncoder` cannot serialize; this encoder emits
    them as lists.
    """

    def default(self, obj):
        """Return a JSON-serializable form of *obj*.

        Sets become lists; anything else is delegated to the base class,
        which raises ``TypeError``. Without that delegation an implicit
        ``None`` return would silently encode unsupported objects as
        ``null`` — the explicit fallback preserves the standard, loud
        failure mode.
        """
        if isinstance(obj, set):
            return list(obj)
        return super().default(obj)
|
|
|
|
|
@ -52,8 +55,7 @@ class MeCab(BaseKeyword):
|
|
|
|
|
"""Initialize MeCab handler with configuration."""
|
|
|
|
|
try:
|
|
|
|
|
self._keyword_handler = MeCabKeywordTableHandler(
|
|
|
|
|
dictionary_path=self._config.dictionary_path,
|
|
|
|
|
user_dictionary_path=self._config.user_dictionary_path
|
|
|
|
|
dictionary_path=self._config.dictionary_path, user_dictionary_path=self._config.user_dictionary_path
|
|
|
|
|
)
|
|
|
|
|
if self._config.pos_weights:
|
|
|
|
|
self._keyword_handler.pos_weights = self._config.pos_weights
|
|
|
|
|
@ -70,19 +72,12 @@ class MeCab(BaseKeyword):
|
|
|
|
|
|
|
|
|
|
for text in texts:
|
|
|
|
|
keywords = self._keyword_handler.extract_keywords(
|
|
|
|
|
text.page_content,
|
|
|
|
|
self._config.max_keywords_per_chunk
|
|
|
|
|
text.page_content, self._config.max_keywords_per_chunk
|
|
|
|
|
)
|
|
|
|
|
if text.metadata is not None:
|
|
|
|
|
self._update_segment_keywords(
|
|
|
|
|
self.dataset.id,
|
|
|
|
|
text.metadata["doc_id"],
|
|
|
|
|
list(keywords)
|
|
|
|
|
)
|
|
|
|
|
self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
|
|
|
|
|
keyword_table = self._add_text_to_keyword_table(
|
|
|
|
|
keyword_table or {},
|
|
|
|
|
text.metadata["doc_id"],
|
|
|
|
|
list(keywords)
|
|
|
|
|
keyword_table or {}, text.metadata["doc_id"], list(keywords)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
self._save_dataset_keyword_table(keyword_table)
|
|
|
|
|
@ -100,25 +95,17 @@ class MeCab(BaseKeyword):
|
|
|
|
|
keywords = keywords_list[i]
|
|
|
|
|
if not keywords:
|
|
|
|
|
keywords = self._keyword_handler.extract_keywords(
|
|
|
|
|
text.page_content,
|
|
|
|
|
self._config.max_keywords_per_chunk
|
|
|
|
|
text.page_content, self._config.max_keywords_per_chunk
|
|
|
|
|
)
|
|
|
|
|
else:
|
|
|
|
|
keywords = self._keyword_handler.extract_keywords(
|
|
|
|
|
text.page_content,
|
|
|
|
|
self._config.max_keywords_per_chunk
|
|
|
|
|
text.page_content, self._config.max_keywords_per_chunk
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
if text.metadata is not None:
|
|
|
|
|
self._update_segment_keywords(
|
|
|
|
|
self.dataset.id,
|
|
|
|
|
text.metadata["doc_id"],
|
|
|
|
|
list(keywords)
|
|
|
|
|
)
|
|
|
|
|
self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
|
|
|
|
|
keyword_table = self._add_text_to_keyword_table(
|
|
|
|
|
keyword_table or {},
|
|
|
|
|
text.metadata["doc_id"],
|
|
|
|
|
list(keywords)
|
|
|
|
|
keyword_table or {}, text.metadata["doc_id"], list(keywords)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
self._save_dataset_keyword_table(keyword_table)
|
|
|
|
|
@ -156,20 +143,13 @@ class MeCab(BaseKeyword):
|
|
|
|
|
keyword_table = self._get_dataset_keyword_table()
|
|
|
|
|
k = kwargs.get("top_k", 4)
|
|
|
|
|
|
|
|
|
|
sorted_chunk_indices = self._retrieve_ids_by_query(
|
|
|
|
|
keyword_table or {},
|
|
|
|
|
query,
|
|
|
|
|
k
|
|
|
|
|
)
|
|
|
|
|
sorted_chunk_indices = self._retrieve_ids_by_query(keyword_table or {}, query, k)
|
|
|
|
|
|
|
|
|
|
documents = []
|
|
|
|
|
for chunk_index in sorted_chunk_indices:
|
|
|
|
|
segment = (
|
|
|
|
|
db.session.query(DocumentSegment)
|
|
|
|
|
.filter(
|
|
|
|
|
DocumentSegment.dataset_id == self.dataset.id,
|
|
|
|
|
DocumentSegment.index_node_id == chunk_index
|
|
|
|
|
)
|
|
|
|
|
.filter(DocumentSegment.dataset_id == self.dataset.id, DocumentSegment.index_node_id == chunk_index)
|
|
|
|
|
.first()
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@ -201,11 +181,7 @@ class MeCab(BaseKeyword):
|
|
|
|
|
"""Save keyword table to storage."""
|
|
|
|
|
table_dict = {
|
|
|
|
|
"__type__": "keyword_table",
|
|
|
|
|
"__data__": {
|
|
|
|
|
"index_id": self.dataset.id,
|
|
|
|
|
"summary": None,
|
|
|
|
|
"table": keyword_table
|
|
|
|
|
}
|
|
|
|
|
"__data__": {"index_id": self.dataset.id, "summary": None, "table": keyword_table},
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
dataset_keyword_table = self.dataset.dataset_keyword_table
|
|
|
|
|
@ -218,10 +194,7 @@ class MeCab(BaseKeyword):
|
|
|
|
|
file_key = f"keyword_files/{self.dataset.tenant_id}/{self.dataset.id}.txt"
|
|
|
|
|
if storage.exists(file_key):
|
|
|
|
|
storage.delete(file_key)
|
|
|
|
|
storage.save(
|
|
|
|
|
file_key,
|
|
|
|
|
json.dumps(table_dict, cls=SetEncoder).encode("utf-8")
|
|
|
|
|
)
|
|
|
|
|
storage.save(file_key, json.dumps(table_dict, cls=SetEncoder).encode("utf-8"))
|
|
|
|
|
|
|
|
|
|
def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict:
|
|
|
|
|
"""Add text keywords to table."""
|
|
|
|
|
@ -253,20 +226,13 @@ class MeCab(BaseKeyword):
|
|
|
|
|
|
|
|
|
|
# Score documents based on matching keywords
|
|
|
|
|
chunk_indices_count = defaultdict(int)
|
|
|
|
|
keywords_list = [
|
|
|
|
|
keyword for keyword in keywords
|
|
|
|
|
if keyword in set(keyword_table.keys())
|
|
|
|
|
]
|
|
|
|
|
keywords_list = [keyword for keyword in keywords if keyword in set(keyword_table.keys())]
|
|
|
|
|
|
|
|
|
|
for keyword in keywords_list:
|
|
|
|
|
for node_id in keyword_table[keyword]:
|
|
|
|
|
chunk_indices_count[node_id] += 1
|
|
|
|
|
|
|
|
|
|
sorted_chunk_indices = sorted(
|
|
|
|
|
chunk_indices_count.keys(),
|
|
|
|
|
key=lambda x: chunk_indices_count[x],
|
|
|
|
|
reverse=True
|
|
|
|
|
)
|
|
|
|
|
sorted_chunk_indices = sorted(chunk_indices_count.keys(), key=lambda x: chunk_indices_count[x], reverse=True)
|
|
|
|
|
|
|
|
|
|
return sorted_chunk_indices[:k]
|
|
|
|
|
|
|
|
|
|
@ -274,10 +240,7 @@ class MeCab(BaseKeyword):
|
|
|
|
|
"""Update segment keywords in database."""
|
|
|
|
|
document_segment = (
|
|
|
|
|
db.session.query(DocumentSegment)
|
|
|
|
|
.filter(
|
|
|
|
|
DocumentSegment.dataset_id == dataset_id,
|
|
|
|
|
DocumentSegment.index_node_id == node_id
|
|
|
|
|
)
|
|
|
|
|
.filter(DocumentSegment.dataset_id == dataset_id, DocumentSegment.index_node_id == node_id)
|
|
|
|
|
.first()
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|