pull/12311/head
Dr. Kiji 1 year ago
parent 81c5953fa5
commit 77030d7581

@ -22,9 +22,11 @@ class Keyword:
match keyword_type: match keyword_type:
case KeyWordType.JIEBA: case KeyWordType.JIEBA:
from core.rag.datasource.keyword.jieba.jieba import Jieba from core.rag.datasource.keyword.jieba.jieba import Jieba
return Jieba return Jieba
case KeyWordType.MECAB: case KeyWordType.MECAB:
from core.rag.datasource.keyword.mecab.mecab import MeCab from core.rag.datasource.keyword.mecab.mecab import MeCab
return MeCab return MeCab
case _: case _:
raise ValueError(f"Keyword store {keyword_type} is not supported.") raise ValueError(f"Keyword store {keyword_type} is not supported.")

@ -1,7 +1,9 @@
from pydantic import BaseModel from pydantic import BaseModel
class MeCabConfig(BaseModel): class MeCabConfig(BaseModel):
"""Configuration for MeCab keyword processor.""" """Configuration for MeCab keyword processor."""
max_keywords_per_chunk: int = 10 max_keywords_per_chunk: int = 10
min_keyword_length: int = 2 min_keyword_length: int = 2
score_threshold: float = 0.3 score_threshold: float = 0.3
@ -12,8 +14,8 @@ class MeCabConfig(BaseModel):
dictionary_path: str = "" # Optional custom dictionary path dictionary_path: str = "" # Optional custom dictionary path
user_dictionary_path: str = "" # Optional user dictionary path user_dictionary_path: str = "" # Optional user dictionary path
pos_weights: dict = { pos_weights: dict = {
'名詞': 1.0, # Nouns "名詞": 1.0, # Nouns
'動詞': 0.8, # Verbs "動詞": 0.8, # Verbs
'形容詞': 0.6, # Adjectives "形容詞": 0.6, # Adjectives
'副詞': 0.4, # Adverbs "副詞": 0.4, # Adverbs
} }

@ -1,38 +1,41 @@
import json import json
import logging import logging
from typing import Any, Optional
from collections import defaultdict from collections import defaultdict
from typing import Any, Optional
from core.rag.datasource.keyword.keyword_base import BaseKeyword from core.rag.datasource.keyword.keyword_base import BaseKeyword
from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
from core.rag.datasource.keyword.mecab.config import MeCabConfig from core.rag.datasource.keyword.mecab.config import MeCabConfig
from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
from core.rag.models.document import Document from core.rag.models.document import Document
from extensions.ext_database import db from extensions.ext_database import db
from extensions.ext_redis import redis_client from extensions.ext_redis import redis_client
from extensions.ext_storage import storage from extensions.ext_storage import storage
from models.dataset import Dataset, DatasetKeywordTable, DocumentSegment from models.dataset import Dataset, DocumentSegment
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class KeywordProcessorError(Exception): class KeywordProcessorError(Exception):
"""Base error for keyword processing.""" """Base error for keyword processing."""
pass pass
class KeywordExtractionError(KeywordProcessorError): class KeywordExtractionError(KeywordProcessorError):
"""Error during keyword extraction.""" """Error during keyword extraction."""
pass pass
class KeywordStorageError(KeywordProcessorError): class KeywordStorageError(KeywordProcessorError):
"""Error during storage operations.""" """Error during storage operations."""
pass pass
class SetEncoder(json.JSONEncoder): class SetEncoder(json.JSONEncoder):
"""JSON encoder that handles sets.""" """JSON encoder that handles sets."""
def default(self, obj): def default(self, obj):
if isinstance(obj, set): if isinstance(obj, set):
return list(obj) return list(obj)
@ -52,8 +55,7 @@ class MeCab(BaseKeyword):
"""Initialize MeCab handler with configuration.""" """Initialize MeCab handler with configuration."""
try: try:
self._keyword_handler = MeCabKeywordTableHandler( self._keyword_handler = MeCabKeywordTableHandler(
dictionary_path=self._config.dictionary_path, dictionary_path=self._config.dictionary_path, user_dictionary_path=self._config.user_dictionary_path
user_dictionary_path=self._config.user_dictionary_path
) )
if self._config.pos_weights: if self._config.pos_weights:
self._keyword_handler.pos_weights = self._config.pos_weights self._keyword_handler.pos_weights = self._config.pos_weights
@ -70,19 +72,12 @@ class MeCab(BaseKeyword):
for text in texts: for text in texts:
keywords = self._keyword_handler.extract_keywords( keywords = self._keyword_handler.extract_keywords(
text.page_content, text.page_content, self._config.max_keywords_per_chunk
self._config.max_keywords_per_chunk
) )
if text.metadata is not None: if text.metadata is not None:
self._update_segment_keywords( self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
self.dataset.id,
text.metadata["doc_id"],
list(keywords)
)
keyword_table = self._add_text_to_keyword_table( keyword_table = self._add_text_to_keyword_table(
keyword_table or {}, keyword_table or {}, text.metadata["doc_id"], list(keywords)
text.metadata["doc_id"],
list(keywords)
) )
self._save_dataset_keyword_table(keyword_table) self._save_dataset_keyword_table(keyword_table)
@ -100,25 +95,17 @@ class MeCab(BaseKeyword):
keywords = keywords_list[i] keywords = keywords_list[i]
if not keywords: if not keywords:
keywords = self._keyword_handler.extract_keywords( keywords = self._keyword_handler.extract_keywords(
text.page_content, text.page_content, self._config.max_keywords_per_chunk
self._config.max_keywords_per_chunk
) )
else: else:
keywords = self._keyword_handler.extract_keywords( keywords = self._keyword_handler.extract_keywords(
text.page_content, text.page_content, self._config.max_keywords_per_chunk
self._config.max_keywords_per_chunk
) )
if text.metadata is not None: if text.metadata is not None:
self._update_segment_keywords( self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
self.dataset.id,
text.metadata["doc_id"],
list(keywords)
)
keyword_table = self._add_text_to_keyword_table( keyword_table = self._add_text_to_keyword_table(
keyword_table or {}, keyword_table or {}, text.metadata["doc_id"], list(keywords)
text.metadata["doc_id"],
list(keywords)
) )
self._save_dataset_keyword_table(keyword_table) self._save_dataset_keyword_table(keyword_table)
@ -156,20 +143,13 @@ class MeCab(BaseKeyword):
keyword_table = self._get_dataset_keyword_table() keyword_table = self._get_dataset_keyword_table()
k = kwargs.get("top_k", 4) k = kwargs.get("top_k", 4)
sorted_chunk_indices = self._retrieve_ids_by_query( sorted_chunk_indices = self._retrieve_ids_by_query(keyword_table or {}, query, k)
keyword_table or {},
query,
k
)
documents = [] documents = []
for chunk_index in sorted_chunk_indices: for chunk_index in sorted_chunk_indices:
segment = ( segment = (
db.session.query(DocumentSegment) db.session.query(DocumentSegment)
.filter( .filter(DocumentSegment.dataset_id == self.dataset.id, DocumentSegment.index_node_id == chunk_index)
DocumentSegment.dataset_id == self.dataset.id,
DocumentSegment.index_node_id == chunk_index
)
.first() .first()
) )
@ -201,11 +181,7 @@ class MeCab(BaseKeyword):
"""Save keyword table to storage.""" """Save keyword table to storage."""
table_dict = { table_dict = {
"__type__": "keyword_table", "__type__": "keyword_table",
"__data__": { "__data__": {"index_id": self.dataset.id, "summary": None, "table": keyword_table},
"index_id": self.dataset.id,
"summary": None,
"table": keyword_table
}
} }
dataset_keyword_table = self.dataset.dataset_keyword_table dataset_keyword_table = self.dataset.dataset_keyword_table
@ -218,10 +194,7 @@ class MeCab(BaseKeyword):
file_key = f"keyword_files/{self.dataset.tenant_id}/{self.dataset.id}.txt" file_key = f"keyword_files/{self.dataset.tenant_id}/{self.dataset.id}.txt"
if storage.exists(file_key): if storage.exists(file_key):
storage.delete(file_key) storage.delete(file_key)
storage.save( storage.save(file_key, json.dumps(table_dict, cls=SetEncoder).encode("utf-8"))
file_key,
json.dumps(table_dict, cls=SetEncoder).encode("utf-8")
)
def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict: def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict:
"""Add text keywords to table.""" """Add text keywords to table."""
@ -253,20 +226,13 @@ class MeCab(BaseKeyword):
# Score documents based on matching keywords # Score documents based on matching keywords
chunk_indices_count = defaultdict(int) chunk_indices_count = defaultdict(int)
keywords_list = [ keywords_list = [keyword for keyword in keywords if keyword in set(keyword_table.keys())]
keyword for keyword in keywords
if keyword in set(keyword_table.keys())
]
for keyword in keywords_list: for keyword in keywords_list:
for node_id in keyword_table[keyword]: for node_id in keyword_table[keyword]:
chunk_indices_count[node_id] += 1 chunk_indices_count[node_id] += 1
sorted_chunk_indices = sorted( sorted_chunk_indices = sorted(chunk_indices_count.keys(), key=lambda x: chunk_indices_count[x], reverse=True)
chunk_indices_count.keys(),
key=lambda x: chunk_indices_count[x],
reverse=True
)
return sorted_chunk_indices[:k] return sorted_chunk_indices[:k]
@ -274,10 +240,7 @@ class MeCab(BaseKeyword):
"""Update segment keywords in database.""" """Update segment keywords in database."""
document_segment = ( document_segment = (
db.session.query(DocumentSegment) db.session.query(DocumentSegment)
.filter( .filter(DocumentSegment.dataset_id == dataset_id, DocumentSegment.index_node_id == node_id)
DocumentSegment.dataset_id == dataset_id,
DocumentSegment.index_node_id == node_id
)
.first() .first()
) )

@ -1,10 +1,11 @@
import re from collections import defaultdict
from typing import Optional, Set from typing import Optional, Set
import MeCab import MeCab
from collections import defaultdict
from core.rag.datasource.keyword.mecab.stopwords import STOPWORDS from core.rag.datasource.keyword.mecab.stopwords import STOPWORDS
class MeCabKeywordTableHandler: class MeCabKeywordTableHandler:
"""Japanese keyword extraction using MeCab morphological analyzer.""" """Japanese keyword extraction using MeCab morphological analyzer."""
@ -24,19 +25,19 @@ class MeCabKeywordTableHandler:
mecab_args.append(f"-u {user_dictionary_path}") mecab_args.append(f"-u {user_dictionary_path}")
self.tagger = MeCab.Tagger(" ".join(mecab_args)) self.tagger = MeCab.Tagger(" ".join(mecab_args))
self.tagger.parse('') # Force initialization to catch dictionary errors self.tagger.parse("") # Force initialization to catch dictionary errors
except RuntimeError as e: except RuntimeError as e:
raise RuntimeError(f"Failed to initialize MeCab: {str(e)}") raise RuntimeError(f"Failed to initialize MeCab: {str(e)}")
# POS weights for scoring # POS weights for scoring
self.pos_weights = { self.pos_weights = {
'名詞': 1.0, # Nouns "名詞": 1.0, # Nouns
'動詞': 0.8, # Verbs "動詞": 0.8, # Verbs
'形容詞': 0.6, # Adjectives "形容詞": 0.6, # Adjectives
'副詞': 0.4, # Adverbs "副詞": 0.4, # Adverbs
'連体詞': 0.3, # Adnominal adjectives "連体詞": 0.3, # Adnominal adjectives
'感動詞': 0.2, # Interjections "感動詞": 0.2, # Interjections
} }
self.min_score = 0.3 self.min_score = 0.3
@ -55,23 +56,23 @@ class MeCabKeywordTableHandler:
try: try:
# Parse text with MeCab # Parse text with MeCab
self.tagger.parse('') # Clear tagger state self.tagger.parse("") # Clear tagger state
node = self.tagger.parseToNode(text) node = self.tagger.parseToNode(text)
# Calculate term frequencies and scores # Calculate term frequencies and scores
term_scores = defaultdict(float) term_scores = defaultdict(float)
while node: while node:
features = node.feature.split(',') features = node.feature.split(",")
if len(features) > 0: if len(features) > 0:
pos = features[0] # Part of speech pos = features[0] # Part of speech
pos_subtype = features[1] if len(features) > 1 else '' pos_subtype = features[1] if len(features) > 1 else ""
base_form = features[6] if len(features) > 6 else node.surface base_form = features[6] if len(features) > 6 else node.surface
# Score the term based on its POS # Score the term based on its POS
if pos in self.pos_weights and base_form not in STOPWORDS: if pos in self.pos_weights and base_form not in STOPWORDS:
score = self.pos_weights[pos] score = self.pos_weights[pos]
# Boost proper nouns and technical terms # Boost proper nouns and technical terms
if pos == '名詞' and pos_subtype in ['固有名詞', '専門用語']: if pos == "名詞" and pos_subtype in ["固有名詞", "専門用語"]:
score *= 1.5 score *= 1.5
if len(base_form) > 1: # Filter out single characters if len(base_form) > 1: # Filter out single characters
term_scores[base_form] += score term_scores[base_form] += score
@ -79,17 +80,10 @@ class MeCabKeywordTableHandler:
node = node.next node = node.next
# Get top scoring terms # Get top scoring terms
sorted_terms = sorted( sorted_terms = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)
term_scores.items(),
key=lambda x: x[1],
reverse=True
)
# Filter by minimum score and take top N # Filter by minimum score and take top N
keywords = { keywords = {term for term, score in sorted_terms if score >= self.min_score}
term for term, score in sorted_terms
if score >= self.min_score
}
if max_keywords_per_chunk: if max_keywords_per_chunk:
keywords = set(list(keywords)[:max_keywords_per_chunk]) keywords = set(list(keywords)[:max_keywords_per_chunk])
@ -117,7 +111,7 @@ class MeCabKeywordTableHandler:
compound_readings = [] # For handling different forms of the same compound compound_readings = [] # For handling different forms of the same compound
while node: while node:
features = node.feature.split(',') features = node.feature.split(",")
if len(features) > 6: if len(features) > 6:
base_form = features[6] base_form = features[6]
reading = features[7] if len(features) > 7 else None reading = features[7] if len(features) > 7 else None
@ -132,12 +126,12 @@ class MeCabKeywordTableHandler:
else: else:
if len(compound) > 1: if len(compound) > 1:
# Add the compound term # Add the compound term
compound_term = ''.join(compound) compound_term = "".join(compound)
if len(compound_term) > 1: if len(compound_term) > 1:
results.add(compound_term) results.add(compound_term)
# If readings are available, add normalized form # If readings are available, add normalized form
if compound_readings: if compound_readings:
normalized_term = ''.join(compound_readings) normalized_term = "".join(compound_readings)
if normalized_term != compound_term: if normalized_term != compound_term:
results.add(normalized_term) results.add(normalized_term)
compound = [] compound = []

@ -1,36 +1,190 @@
STOPWORDS = { STOPWORDS = {
# Japanese particles and basic stopwords # Japanese particles and basic stopwords
"", "", "", "", "", "", "", "", "から", "より", "まで", "によって", "",
"あそこ", "あっ", "あの", "あのかた", "あの人", "あり", "あります", "ある", "あれ", "",
"", "いう", "います", "いる", "", "うち", "", "", "および", "おり", "おります", "",
"", "かつて", "から", "", "", "ここ", "こちら", "こと", "この", "これ", "これら", "",
"", "さらに", "", "しかし", "する", "", "", "せる", "そこ", "そして", "その", "",
"その他", "その後", "それ", "それぞれ", "それで", "", "ただし", "たち", "ため", "たり", "",
"", "だっ", "だれ", "", "", "", "でき", "できる", "です", "では", "でも", "", "",
"という", "といった", "とき", "ところ", "として", "とともに", "とも", "と共に", "どこ", "",
"どの", "", "ない", "なお", "なかっ", "ながら", "なく", "なっ", "など", "なに", "なら", "から",
"なり", "なる", "なん", "", "において", "における", "について", "にて", "によって", "により", "より",
"による", "に対して", "に対する", "に関する", "", "ので", "のみ", "", "", "", "ほか", "まで",
"ほとんど", "ほど", "ます", "また", "または", "まで", "", "もの", "ものの", "", "よう", "によって",
"より", "", "られ", "られる", "", "れる", "", "", "", "及び", "", "彼女", "あそこ",
"我々", "特に", "", "私達", "貴方", "貴方方", "あっ",
"あの",
"あのかた",
"あの人",
"あり",
"あります",
"ある",
"あれ",
"",
"いう",
"います",
"いる",
"",
"うち",
"",
"",
"および",
"おり",
"おります",
"",
"かつて",
"",
"ここ",
"こちら",
"こと",
"この",
"これ",
"これら",
"",
"さらに",
"",
"しかし",
"する",
"",
"",
"せる",
"そこ",
"そして",
"その",
"その他",
"その後",
"それ",
"それぞれ",
"それで",
"",
"ただし",
"たち",
"ため",
"たり",
"",
"だっ",
"だれ",
"",
"",
"でき",
"できる",
"です",
"では",
"でも",
"という",
"といった",
"とき",
"ところ",
"として",
"とともに",
"とも",
"と共に",
"どこ",
"どの",
"",
"ない",
"なお",
"なかっ",
"ながら",
"なく",
"なっ",
"など",
"なに",
"なら",
"なり",
"なる",
"なん",
"において",
"における",
"について",
"にて",
"により",
"による",
"に対して",
"に対する",
"に関する",
"ので",
"のみ",
"",
"ほか",
"ほとんど",
"ほど",
"ます",
"また",
"または",
"",
"もの",
"ものの",
"",
"よう",
"",
"られ",
"られる",
"",
"れる",
"",
"",
"及び",
"",
"彼女",
"我々",
"特に",
"",
"私達",
"貴方",
"貴方方",
# Japanese auxiliary verbs # Japanese auxiliary verbs
"です", "ます", "でした", "ました", "である", "", "", "だった", "でした",
"ました",
"である",
"だった",
# Japanese pronouns # Japanese pronouns
"これ", "それ", "あれ", "この", "その", "あの", "ここ", "そこ", "あそこ",
# Japanese common words # Japanese common words
"いる", "ある", "なる", "する", "できる", "おる", "いく", "くる", "おる",
"いく",
"くる",
# Numbers # Numbers
"", "", "", "", "", "", "", "", "", "", "",
"1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "",
"",
"",
"",
"",
"",
"",
"",
"",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"0",
# Punctuation # Punctuation
"", "", "", "", "", "", "", "", "", "", "",
"",
"",
"",
"",
"",
"",
"",
"",
"",
# Common English stopwords (for mixed text) # Common English stopwords (for mixed text)
"the", "is", "at", "which", "on", "in", "and", "or", "a", "an", "the",
"is",
"at",
"which",
"on",
"in",
"and",
"or",
"a",
"an",
} }

Loading…
Cancel
Save