pull/12311/head
Dr. Kiji 1 year ago
parent 81c5953fa5
commit 77030d7581

@ -22,9 +22,11 @@ class Keyword:
match keyword_type:
case KeyWordType.JIEBA:
from core.rag.datasource.keyword.jieba.jieba import Jieba
return Jieba
case KeyWordType.MECAB:
from core.rag.datasource.keyword.mecab.mecab import MeCab
return MeCab
case _:
raise ValueError(f"Keyword store {keyword_type} is not supported.")

@ -1,7 +1,9 @@
from pydantic import BaseModel
class MeCabConfig(BaseModel):
"""Configuration for MeCab keyword processor."""
max_keywords_per_chunk: int = 10
min_keyword_length: int = 2
score_threshold: float = 0.3
@ -12,8 +14,8 @@ class MeCabConfig(BaseModel):
dictionary_path: str = "" # Optional custom dictionary path
user_dictionary_path: str = "" # Optional user dictionary path
pos_weights: dict = {
'名詞': 1.0, # Nouns
'動詞': 0.8, # Verbs
'形容詞': 0.6, # Adjectives
'副詞': 0.4, # Adverbs
"名詞": 1.0, # Nouns
"動詞": 0.8, # Verbs
"形容詞": 0.6, # Adjectives
"副詞": 0.4, # Adverbs
}

@ -1,38 +1,41 @@
import json
import logging
from typing import Any, Optional
from collections import defaultdict
from typing import Any, Optional
from core.rag.datasource.keyword.keyword_base import BaseKeyword
from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
from core.rag.datasource.keyword.mecab.config import MeCabConfig
from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
from core.rag.models.document import Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from models.dataset import Dataset, DatasetKeywordTable, DocumentSegment
from models.dataset import Dataset, DocumentSegment
logger = logging.getLogger(__name__)
class KeywordProcessorError(Exception):
    """Root of the keyword-processing error hierarchy."""
class KeywordExtractionError(KeywordProcessorError):
    """Raised when extracting keywords from a text chunk fails."""
class KeywordStorageError(KeywordProcessorError):
    """Raised when persisting or loading keyword-table data fails."""
class SetEncoder(json.JSONEncoder):
"""JSON encoder that handles sets."""
def default(self, obj):
if isinstance(obj, set):
return list(obj)
@ -52,8 +55,7 @@ class MeCab(BaseKeyword):
"""Initialize MeCab handler with configuration."""
try:
self._keyword_handler = MeCabKeywordTableHandler(
dictionary_path=self._config.dictionary_path,
user_dictionary_path=self._config.user_dictionary_path
dictionary_path=self._config.dictionary_path, user_dictionary_path=self._config.user_dictionary_path
)
if self._config.pos_weights:
self._keyword_handler.pos_weights = self._config.pos_weights
@ -70,19 +72,12 @@ class MeCab(BaseKeyword):
for text in texts:
keywords = self._keyword_handler.extract_keywords(
text.page_content,
self._config.max_keywords_per_chunk
text.page_content, self._config.max_keywords_per_chunk
)
if text.metadata is not None:
self._update_segment_keywords(
self.dataset.id,
text.metadata["doc_id"],
list(keywords)
)
self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
keyword_table = self._add_text_to_keyword_table(
keyword_table or {},
text.metadata["doc_id"],
list(keywords)
keyword_table or {}, text.metadata["doc_id"], list(keywords)
)
self._save_dataset_keyword_table(keyword_table)
@ -100,25 +95,17 @@ class MeCab(BaseKeyword):
keywords = keywords_list[i]
if not keywords:
keywords = self._keyword_handler.extract_keywords(
text.page_content,
self._config.max_keywords_per_chunk
text.page_content, self._config.max_keywords_per_chunk
)
else:
keywords = self._keyword_handler.extract_keywords(
text.page_content,
self._config.max_keywords_per_chunk
text.page_content, self._config.max_keywords_per_chunk
)
if text.metadata is not None:
self._update_segment_keywords(
self.dataset.id,
text.metadata["doc_id"],
list(keywords)
)
self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
keyword_table = self._add_text_to_keyword_table(
keyword_table or {},
text.metadata["doc_id"],
list(keywords)
keyword_table or {}, text.metadata["doc_id"], list(keywords)
)
self._save_dataset_keyword_table(keyword_table)
@ -156,20 +143,13 @@ class MeCab(BaseKeyword):
keyword_table = self._get_dataset_keyword_table()
k = kwargs.get("top_k", 4)
sorted_chunk_indices = self._retrieve_ids_by_query(
keyword_table or {},
query,
k
)
sorted_chunk_indices = self._retrieve_ids_by_query(keyword_table or {}, query, k)
documents = []
for chunk_index in sorted_chunk_indices:
segment = (
db.session.query(DocumentSegment)
.filter(
DocumentSegment.dataset_id == self.dataset.id,
DocumentSegment.index_node_id == chunk_index
)
.filter(DocumentSegment.dataset_id == self.dataset.id, DocumentSegment.index_node_id == chunk_index)
.first()
)
@ -201,11 +181,7 @@ class MeCab(BaseKeyword):
"""Save keyword table to storage."""
table_dict = {
"__type__": "keyword_table",
"__data__": {
"index_id": self.dataset.id,
"summary": None,
"table": keyword_table
}
"__data__": {"index_id": self.dataset.id, "summary": None, "table": keyword_table},
}
dataset_keyword_table = self.dataset.dataset_keyword_table
@ -218,10 +194,7 @@ class MeCab(BaseKeyword):
file_key = f"keyword_files/{self.dataset.tenant_id}/{self.dataset.id}.txt"
if storage.exists(file_key):
storage.delete(file_key)
storage.save(
file_key,
json.dumps(table_dict, cls=SetEncoder).encode("utf-8")
)
storage.save(file_key, json.dumps(table_dict, cls=SetEncoder).encode("utf-8"))
def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict:
"""Add text keywords to table."""
@ -253,20 +226,13 @@ class MeCab(BaseKeyword):
# Score documents based on matching keywords
chunk_indices_count = defaultdict(int)
keywords_list = [
keyword for keyword in keywords
if keyword in set(keyword_table.keys())
]
keywords_list = [keyword for keyword in keywords if keyword in set(keyword_table.keys())]
for keyword in keywords_list:
for node_id in keyword_table[keyword]:
chunk_indices_count[node_id] += 1
sorted_chunk_indices = sorted(
chunk_indices_count.keys(),
key=lambda x: chunk_indices_count[x],
reverse=True
)
sorted_chunk_indices = sorted(chunk_indices_count.keys(), key=lambda x: chunk_indices_count[x], reverse=True)
return sorted_chunk_indices[:k]
@ -274,10 +240,7 @@ class MeCab(BaseKeyword):
"""Update segment keywords in database."""
document_segment = (
db.session.query(DocumentSegment)
.filter(
DocumentSegment.dataset_id == dataset_id,
DocumentSegment.index_node_id == node_id
)
.filter(DocumentSegment.dataset_id == dataset_id, DocumentSegment.index_node_id == node_id)
.first()
)

@ -1,10 +1,11 @@
import re
from collections import defaultdict
from typing import Optional, Set
import MeCab
from collections import defaultdict
from core.rag.datasource.keyword.mecab.stopwords import STOPWORDS
class MeCabKeywordTableHandler:
"""Japanese keyword extraction using MeCab morphological analyzer."""
@ -24,19 +25,19 @@ class MeCabKeywordTableHandler:
mecab_args.append(f"-u {user_dictionary_path}")
self.tagger = MeCab.Tagger(" ".join(mecab_args))
self.tagger.parse('') # Force initialization to catch dictionary errors
self.tagger.parse("") # Force initialization to catch dictionary errors
except RuntimeError as e:
raise RuntimeError(f"Failed to initialize MeCab: {str(e)}")
# POS weights for scoring
self.pos_weights = {
'名詞': 1.0, # Nouns
'動詞': 0.8, # Verbs
'形容詞': 0.6, # Adjectives
'副詞': 0.4, # Adverbs
'連体詞': 0.3, # Adnominal adjectives
'感動詞': 0.2, # Interjections
"名詞": 1.0, # Nouns
"動詞": 0.8, # Verbs
"形容詞": 0.6, # Adjectives
"副詞": 0.4, # Adverbs
"連体詞": 0.3, # Adnominal adjectives
"感動詞": 0.2, # Interjections
}
self.min_score = 0.3
@ -55,23 +56,23 @@ class MeCabKeywordTableHandler:
try:
# Parse text with MeCab
self.tagger.parse('') # Clear tagger state
self.tagger.parse("") # Clear tagger state
node = self.tagger.parseToNode(text)
# Calculate term frequencies and scores
term_scores = defaultdict(float)
while node:
features = node.feature.split(',')
features = node.feature.split(",")
if len(features) > 0:
pos = features[0] # Part of speech
pos_subtype = features[1] if len(features) > 1 else ''
pos_subtype = features[1] if len(features) > 1 else ""
base_form = features[6] if len(features) > 6 else node.surface
# Score the term based on its POS
if pos in self.pos_weights and base_form not in STOPWORDS:
score = self.pos_weights[pos]
# Boost proper nouns and technical terms
if pos == '名詞' and pos_subtype in ['固有名詞', '専門用語']:
if pos == "名詞" and pos_subtype in ["固有名詞", "専門用語"]:
score *= 1.5
if len(base_form) > 1: # Filter out single characters
term_scores[base_form] += score
@ -79,17 +80,10 @@ class MeCabKeywordTableHandler:
node = node.next
# Get top scoring terms
sorted_terms = sorted(
term_scores.items(),
key=lambda x: x[1],
reverse=True
)
sorted_terms = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)
# Filter by minimum score and take top N
keywords = {
term for term, score in sorted_terms
if score >= self.min_score
}
keywords = {term for term, score in sorted_terms if score >= self.min_score}
if max_keywords_per_chunk:
keywords = set(list(keywords)[:max_keywords_per_chunk])
@ -117,7 +111,7 @@ class MeCabKeywordTableHandler:
compound_readings = [] # For handling different forms of the same compound
while node:
features = node.feature.split(',')
features = node.feature.split(",")
if len(features) > 6:
base_form = features[6]
reading = features[7] if len(features) > 7 else None
@ -132,12 +126,12 @@ class MeCabKeywordTableHandler:
else:
if len(compound) > 1:
# Add the compound term
compound_term = ''.join(compound)
compound_term = "".join(compound)
if len(compound_term) > 1:
results.add(compound_term)
# If readings are available, add normalized form
if compound_readings:
normalized_term = ''.join(compound_readings)
normalized_term = "".join(compound_readings)
if normalized_term != compound_term:
results.add(normalized_term)
compound = []

@ -1,36 +1,190 @@
# NOTE(review): this set is diff residue — both the pre-change compressed lines
# and the post-change one-entry-per-line lines are present, so many entries are
# duplicated. Duplicates are harmless in a set literal, but the old lines should
# be dropped when the diff is resolved.
# NOTE(review): every SINGLE-character Japanese entry was corrupted to "" by an
# encoding/extraction error (multi-character entries survived intact). The ""
# entries below must be restored from the original stopword list — presumably
# the standard Japanese particles, kanji numerals and CJK punctuation, matching
# the surrounding section comments. TODO: confirm against the source PR.
STOPWORDS = {
# Japanese particles and basic stopwords
"", "", "", "", "", "", "", "", "から", "より", "まで", "によって",
"あそこ", "あっ", "あの", "あのかた", "あの人", "あり", "あります", "ある", "あれ",
"", "いう", "います", "いる", "", "うち", "", "", "および", "おり", "おります",
"", "かつて", "から", "", "", "ここ", "こちら", "こと", "この", "これ", "これら",
"", "さらに", "", "しかし", "する", "", "", "せる", "そこ", "そして", "その",
"その他", "その後", "それ", "それぞれ", "それで", "", "ただし", "たち", "ため", "たり",
"", "だっ", "だれ", "", "", "", "でき", "できる", "です", "では", "でも", "",
"という", "といった", "とき", "ところ", "として", "とともに", "とも", "と共に", "どこ",
"どの", "", "ない", "なお", "なかっ", "ながら", "なく", "なっ", "など", "なに", "なら",
"なり", "なる", "なん", "", "において", "における", "について", "にて", "によって", "により",
"による", "に対して", "に対する", "に関する", "", "ので", "のみ", "", "", "", "ほか",
"ほとんど", "ほど", "ます", "また", "または", "まで", "", "もの", "ものの", "", "よう",
"より", "", "られ", "られる", "", "れる", "", "", "", "及び", "", "彼女",
"我々", "特に", "", "私達", "貴方", "貴方方",
# NOTE(review): the "" entries in this expanded section were single-character
# words (they fall at the alphabetical positions of い, う, え, お, か, …) —
# restore from the original list; verify against the PR's raw file.
"",
"",
"",
"",
"",
"",
"",
"",
"から",
"より",
"まで",
"によって",
"あそこ",
"あっ",
"あの",
"あのかた",
"あの人",
"あり",
"あります",
"ある",
"あれ",
"",
"いう",
"います",
"いる",
"",
"うち",
"",
"",
"および",
"おり",
"おります",
"",
"かつて",
"",
"ここ",
"こちら",
"こと",
"この",
"これ",
"これら",
"",
"さらに",
"",
"しかし",
"する",
"",
"",
"せる",
"そこ",
"そして",
"その",
"その他",
"その後",
"それ",
"それぞれ",
"それで",
"",
"ただし",
"たち",
"ため",
"たり",
"",
"だっ",
"だれ",
"",
"",
"でき",
"できる",
"です",
"では",
"でも",
"という",
"といった",
"とき",
"ところ",
"として",
"とともに",
"とも",
"と共に",
"どこ",
"どの",
"",
"ない",
"なお",
"なかっ",
"ながら",
"なく",
"なっ",
"など",
"なに",
"なら",
"なり",
"なる",
"なん",
"において",
"における",
"について",
"にて",
"により",
"による",
"に対して",
"に対する",
"に関する",
"ので",
"のみ",
"",
"ほか",
"ほとんど",
"ほど",
"ます",
"また",
"または",
"",
"もの",
"ものの",
"",
"よう",
"",
"られ",
"られる",
"",
"れる",
"",
"",
"及び",
"",
"彼女",
"我々",
"特に",
"",
"私達",
"貴方",
"貴方方",
# Japanese auxiliary verbs
"です", "ます", "でした", "ました", "である", "", "", "だった",
"でした",
"ました",
"である",
"だった",
# Japanese pronouns
"これ", "それ", "あれ", "この", "その", "あの", "ここ", "そこ", "あそこ",
# Japanese common words
"いる", "ある", "なる", "する", "できる", "おる", "いく", "くる",
"おる",
"いく",
"くる",
# Numbers
# NOTE(review): the ten "" entries here were presumably the kanji numerals
# 一〜十 (the ASCII digits "1"-"0" below survived) — confirm and restore.
"", "", "", "", "", "", "", "", "", "",
"1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"0",
# Punctuation
# NOTE(review): all ten punctuation entries were lost to the same corruption —
# presumably CJK punctuation (、。「」 etc.); confirm and restore.
"", "", "", "", "", "", "", "", "", "",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
# Common English stopwords (for mixed text)
"the", "is", "at", "which", "on", "in", "and", "or", "a", "an",
"the",
"is",
"at",
"which",
"on",
"in",
"and",
"or",
"a",
"an",
}

Loading…
Cancel
Save