pull/12311/head
Dr. Kiji 1 year ago
parent 81c5953fa5
commit 77030d7581

@ -22,9 +22,11 @@ class Keyword:
match keyword_type: match keyword_type:
case KeyWordType.JIEBA: case KeyWordType.JIEBA:
from core.rag.datasource.keyword.jieba.jieba import Jieba from core.rag.datasource.keyword.jieba.jieba import Jieba
return Jieba return Jieba
case KeyWordType.MECAB: case KeyWordType.MECAB:
from core.rag.datasource.keyword.mecab.mecab import MeCab from core.rag.datasource.keyword.mecab.mecab import MeCab
return MeCab return MeCab
case _: case _:
raise ValueError(f"Keyword store {keyword_type} is not supported.") raise ValueError(f"Keyword store {keyword_type} is not supported.")

@ -1,19 +1,21 @@
from pydantic import BaseModel from pydantic import BaseModel
class MeCabConfig(BaseModel):
    """Configuration for the MeCab keyword processor.

    The MeCab keyword store builds one instance with default values and uses
    it to construct its ``MeCabKeywordTableHandler``.
    """

    # Passed to ``extract_keywords`` as the per-chunk keyword limit.
    max_keywords_per_chunk: int = 10
    # NOTE(review): the next four settings are declared here but are not read
    # by the keyword store code visible alongside this class - confirm they
    # are consumed somewhere before relying on them.
    min_keyword_length: int = 2
    score_threshold: float = 0.3
    storage_type: str = "database"
    cache_timeout: int = 3600

    # MeCab specific settings
    dictionary_path: str = ""  # Optional custom dictionary path
    user_dictionary_path: str = ""  # Optional user dictionary path

    # Part-of-speech tag -> score weight. When non-empty, this mapping replaces
    # the handler's built-in weights wholesale, so tags missing here are not
    # scored at all. Typed as ``dict[str, float]`` so pydantic rejects
    # malformed weights at construction time instead of during scoring.
    pos_weights: dict[str, float] = {
        "名詞": 1.0,  # Nouns
        "動詞": 0.8,  # Verbs
        "形容詞": 0.6,  # Adjectives
        "副詞": 0.4,  # Adverbs
    }

@ -1,38 +1,41 @@
import json import json
import logging import logging
from typing import Any, Optional
from collections import defaultdict from collections import defaultdict
from typing import Any, Optional
from core.rag.datasource.keyword.keyword_base import BaseKeyword from core.rag.datasource.keyword.keyword_base import BaseKeyword
from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
from core.rag.datasource.keyword.mecab.config import MeCabConfig from core.rag.datasource.keyword.mecab.config import MeCabConfig
from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
from core.rag.models.document import Document from core.rag.models.document import Document
from extensions.ext_database import db from extensions.ext_database import db
from extensions.ext_redis import redis_client from extensions.ext_redis import redis_client
from extensions.ext_storage import storage from extensions.ext_storage import storage
from models.dataset import Dataset, DatasetKeywordTable, DocumentSegment from models.dataset import Dataset, DocumentSegment
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class KeywordProcessorError(Exception):
    """Root of the keyword-processing exception hierarchy."""
class KeywordExtractionError(KeywordProcessorError):
    """Raised when extracting keywords from text fails."""
class KeywordStorageError(KeywordProcessorError):
    """Raised when reading or writing the keyword table fails."""
class SetEncoder(json.JSONEncoder): class SetEncoder(json.JSONEncoder):
"""JSON encoder that handles sets.""" """JSON encoder that handles sets."""
def default(self, obj): def default(self, obj):
if isinstance(obj, set): if isinstance(obj, set):
return list(obj) return list(obj)
@ -41,19 +44,18 @@ class SetEncoder(json.JSONEncoder):
class MeCab(BaseKeyword): class MeCab(BaseKeyword):
"""Japanese keyword processor using MeCab morphological analyzer.""" """Japanese keyword processor using MeCab morphological analyzer."""
def __init__(self, dataset: Dataset) -> None:
    """Bind the processor to ``dataset`` and build its MeCab handler.

    Args:
        dataset: Dataset whose keyword table this instance manages.
    """
    super().__init__(dataset)
    # Start with no handler; _init_handler() assigns it.
    self._keyword_handler = None
    self._config = MeCabConfig()
    self._init_handler()
def _init_handler(self): def _init_handler(self):
"""Initialize MeCab handler with configuration.""" """Initialize MeCab handler with configuration."""
try: try:
self._keyword_handler = MeCabKeywordTableHandler( self._keyword_handler = MeCabKeywordTableHandler(
dictionary_path=self._config.dictionary_path, dictionary_path=self._config.dictionary_path, user_dictionary_path=self._config.user_dictionary_path
user_dictionary_path=self._config.user_dictionary_path
) )
if self._config.pos_weights: if self._config.pos_weights:
self._keyword_handler.pos_weights = self._config.pos_weights self._keyword_handler.pos_weights = self._config.pos_weights
@ -61,75 +63,60 @@ class MeCab(BaseKeyword):
except Exception as e: except Exception as e:
logger.error(f"Failed to initialize MeCab handler: {str(e)}") logger.error(f"Failed to initialize MeCab handler: {str(e)}")
raise KeywordProcessorError(f"MeCab initialization failed: {str(e)}") raise KeywordProcessorError(f"MeCab initialization failed: {str(e)}")
def create(self, texts: list[Document], **kwargs) -> BaseKeyword:
    """Build the keyword index for ``texts``.

    Keywords are extracted for every document; documents carrying metadata
    have their segment row updated and their ``doc_id`` added to the keyword
    table, which is saved once after the loop. The whole operation runs under
    the per-dataset indexing lock.

    Args:
        texts: Documents to index.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        This instance.
    """
    with redis_client.lock(f"keyword_indexing_lock_{self.dataset.id}", timeout=600):
        table = self._get_dataset_keyword_table()
        limit = self._config.max_keywords_per_chunk
        for doc in texts:
            extracted = self._keyword_handler.extract_keywords(doc.page_content, limit)
            if doc.metadata is None:
                # Without metadata there is no doc_id to index under.
                continue
            node_id = doc.metadata["doc_id"]
            self._update_segment_keywords(self.dataset.id, node_id, list(extracted))
            table = self._add_text_to_keyword_table(table or {}, node_id, list(extracted))
        self._save_dataset_keyword_table(table)
        return self
def add_texts(self, texts: list[Document], **kwargs):
    """Add new texts to the existing keyword index.

    Args:
        texts: Documents to index.
        **kwargs: May contain ``keywords_list``, a sequence aligned with
            ``texts`` that supplies pre-computed keywords per document.
            Missing, empty, or absent entries fall back to MeCab extraction.
    """
    lock_name = f"keyword_indexing_lock_{self.dataset.id}"
    with redis_client.lock(lock_name, timeout=600):
        keyword_table = self._get_dataset_keyword_table()
        keywords_list = kwargs.get("keywords_list") or []

        for i, text in enumerate(texts):
            # A keywords_list shorter than texts must not abort the batch
            # half-way (segments already committed, table not yet saved),
            # so out-of-range positions are treated like empty entries.
            keywords = keywords_list[i] if i < len(keywords_list) else None
            if not keywords:
                keywords = self._keyword_handler.extract_keywords(
                    text.page_content, self._config.max_keywords_per_chunk
                )

            if text.metadata is not None:
                doc_id = text.metadata["doc_id"]
                self._update_segment_keywords(self.dataset.id, doc_id, list(keywords))
                keyword_table = self._add_text_to_keyword_table(keyword_table or {}, doc_id, list(keywords))

        self._save_dataset_keyword_table(keyword_table)
def text_exists(self, id: str) -> bool:
    """Check whether ``id`` is referenced anywhere in the keyword table.

    Args:
        id: Node/document id to look for.

    Returns:
        True if any keyword's id collection contains ``id``; False when the
        table is missing, empty, or does not reference the id.
    """
    keyword_table = self._get_dataset_keyword_table()
    if not keyword_table:
        return False
    # Short-circuit per keyword instead of materialising the union of every
    # id collection; this also works when the stored values are lists.
    return any(id in node_ids for node_ids in keyword_table.values())
def delete_by_ids(self, ids: list[str]) -> None: def delete_by_ids(self, ids: list[str]) -> None:
"""Delete texts by IDs.""" """Delete texts by IDs."""
lock_name = f"keyword_indexing_lock_{self.dataset.id}" lock_name = f"keyword_indexing_lock_{self.dataset.id}"
@ -138,7 +125,7 @@ class MeCab(BaseKeyword):
if keyword_table is not None: if keyword_table is not None:
keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids) keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
self._save_dataset_keyword_table(keyword_table) self._save_dataset_keyword_table(keyword_table)
def delete(self) -> None: def delete(self) -> None:
"""Delete entire index.""" """Delete entire index."""
lock_name = f"keyword_indexing_lock_{self.dataset.id}" lock_name = f"keyword_indexing_lock_{self.dataset.id}"
@ -150,29 +137,22 @@ class MeCab(BaseKeyword):
if dataset_keyword_table.data_source_type != "database": if dataset_keyword_table.data_source_type != "database":
file_key = f"keyword_files/{self.dataset.tenant_id}/{self.dataset.id}.txt" file_key = f"keyword_files/{self.dataset.tenant_id}/{self.dataset.id}.txt"
storage.delete(file_key) storage.delete(file_key)
def search(self, query: str, **kwargs: Any) -> list[Document]: def search(self, query: str, **kwargs: Any) -> list[Document]:
"""Search documents using keywords.""" """Search documents using keywords."""
keyword_table = self._get_dataset_keyword_table() keyword_table = self._get_dataset_keyword_table()
k = kwargs.get("top_k", 4) k = kwargs.get("top_k", 4)
sorted_chunk_indices = self._retrieve_ids_by_query( sorted_chunk_indices = self._retrieve_ids_by_query(keyword_table or {}, query, k)
keyword_table or {},
query,
k
)
documents = [] documents = []
for chunk_index in sorted_chunk_indices: for chunk_index in sorted_chunk_indices:
segment = ( segment = (
db.session.query(DocumentSegment) db.session.query(DocumentSegment)
.filter( .filter(DocumentSegment.dataset_id == self.dataset.id, DocumentSegment.index_node_id == chunk_index)
DocumentSegment.dataset_id == self.dataset.id,
DocumentSegment.index_node_id == chunk_index
)
.first() .first()
) )
if segment: if segment:
documents.append( documents.append(
Document( Document(
@ -185,9 +165,9 @@ class MeCab(BaseKeyword):
}, },
) )
) )
return documents return documents
def _get_dataset_keyword_table(self) -> Optional[dict]: def _get_dataset_keyword_table(self) -> Optional[dict]:
"""Get keyword table from storage.""" """Get keyword table from storage."""
dataset_keyword_table = self.dataset.dataset_keyword_table dataset_keyword_table = self.dataset.dataset_keyword_table
@ -196,21 +176,17 @@ class MeCab(BaseKeyword):
if keyword_table_dict: if keyword_table_dict:
return dict(keyword_table_dict["__data__"]["table"]) return dict(keyword_table_dict["__data__"]["table"])
return {} return {}
def _save_dataset_keyword_table(self, keyword_table): def _save_dataset_keyword_table(self, keyword_table):
"""Save keyword table to storage.""" """Save keyword table to storage."""
table_dict = { table_dict = {
"__type__": "keyword_table", "__type__": "keyword_table",
"__data__": { "__data__": {"index_id": self.dataset.id, "summary": None, "table": keyword_table},
"index_id": self.dataset.id,
"summary": None,
"table": keyword_table
}
} }
dataset_keyword_table = self.dataset.dataset_keyword_table dataset_keyword_table = self.dataset.dataset_keyword_table
data_source_type = dataset_keyword_table.data_source_type data_source_type = dataset_keyword_table.data_source_type
if data_source_type == "database": if data_source_type == "database":
dataset_keyword_table.keyword_table = json.dumps(table_dict, cls=SetEncoder) dataset_keyword_table.keyword_table = json.dumps(table_dict, cls=SetEncoder)
db.session.commit() db.session.commit()
@ -218,11 +194,8 @@ class MeCab(BaseKeyword):
file_key = f"keyword_files/{self.dataset.tenant_id}/{self.dataset.id}.txt" file_key = f"keyword_files/{self.dataset.tenant_id}/{self.dataset.id}.txt"
if storage.exists(file_key): if storage.exists(file_key):
storage.delete(file_key) storage.delete(file_key)
storage.save( storage.save(file_key, json.dumps(table_dict, cls=SetEncoder).encode("utf-8"))
file_key,
json.dumps(table_dict, cls=SetEncoder).encode("utf-8")
)
def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict: def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict:
"""Add text keywords to table.""" """Add text keywords to table."""
for keyword in keywords: for keyword in keywords:
@ -230,58 +203,48 @@ class MeCab(BaseKeyword):
keyword_table[keyword] = set() keyword_table[keyword] = set()
keyword_table[keyword].add(id) keyword_table[keyword].add(id)
return keyword_table return keyword_table
def _delete_ids_from_keyword_table(self, keyword_table: dict, ids: list[str]) -> dict: def _delete_ids_from_keyword_table(self, keyword_table: dict, ids: list[str]) -> dict:
"""Delete IDs from keyword table.""" """Delete IDs from keyword table."""
node_idxs_to_delete = set(ids) node_idxs_to_delete = set(ids)
keywords_to_delete = set() keywords_to_delete = set()
for keyword, node_idxs in keyword_table.items(): for keyword, node_idxs in keyword_table.items():
if node_idxs_to_delete.intersection(node_idxs): if node_idxs_to_delete.intersection(node_idxs):
keyword_table[keyword] = node_idxs.difference(node_idxs_to_delete) keyword_table[keyword] = node_idxs.difference(node_idxs_to_delete)
if not keyword_table[keyword]: if not keyword_table[keyword]:
keywords_to_delete.add(keyword) keywords_to_delete.add(keyword)
for keyword in keywords_to_delete: for keyword in keywords_to_delete:
del keyword_table[keyword] del keyword_table[keyword]
return keyword_table return keyword_table
def _retrieve_ids_by_query(self, keyword_table: dict, query: str, k: int = 4): def _retrieve_ids_by_query(self, keyword_table: dict, query: str, k: int = 4):
"""Retrieve document IDs by query.""" """Retrieve document IDs by query."""
keywords = self._keyword_handler.extract_keywords(query) keywords = self._keyword_handler.extract_keywords(query)
# Score documents based on matching keywords # Score documents based on matching keywords
chunk_indices_count = defaultdict(int) chunk_indices_count = defaultdict(int)
keywords_list = [ keywords_list = [keyword for keyword in keywords if keyword in set(keyword_table.keys())]
keyword for keyword in keywords
if keyword in set(keyword_table.keys())
]
for keyword in keywords_list: for keyword in keywords_list:
for node_id in keyword_table[keyword]: for node_id in keyword_table[keyword]:
chunk_indices_count[node_id] += 1 chunk_indices_count[node_id] += 1
sorted_chunk_indices = sorted( sorted_chunk_indices = sorted(chunk_indices_count.keys(), key=lambda x: chunk_indices_count[x], reverse=True)
chunk_indices_count.keys(),
key=lambda x: chunk_indices_count[x],
reverse=True
)
return sorted_chunk_indices[:k] return sorted_chunk_indices[:k]
def _update_segment_keywords(self, dataset_id: str, node_id: str, keywords: list[str]):
    """Store ``keywords`` on the matching document segment, if one exists.

    Args:
        dataset_id: Dataset the segment belongs to.
        node_id: Value matched against ``DocumentSegment.index_node_id``.
        keywords: Keywords to store on the segment.
    """
    segment_query = db.session.query(DocumentSegment).filter(
        DocumentSegment.dataset_id == dataset_id, DocumentSegment.index_node_id == node_id
    )
    segment = segment_query.first()
    if not segment:
        # No matching row: nothing to update and nothing to commit.
        return
    segment.keywords = keywords
    db.session.add(segment)
    db.session.commit()

@ -1,16 +1,17 @@
import re from collections import defaultdict
from typing import Optional, Set from typing import Optional, Set
import MeCab import MeCab
from collections import defaultdict
from core.rag.datasource.keyword.mecab.stopwords import STOPWORDS from core.rag.datasource.keyword.mecab.stopwords import STOPWORDS
class MeCabKeywordTableHandler: class MeCabKeywordTableHandler:
"""Japanese keyword extraction using MeCab morphological analyzer.""" """Japanese keyword extraction using MeCab morphological analyzer."""
def __init__(self, dictionary_path: str = "", user_dictionary_path: str = ""): def __init__(self, dictionary_path: str = "", user_dictionary_path: str = ""):
"""Initialize MeCab tokenizer. """Initialize MeCab tokenizer.
Args: Args:
dictionary_path: Path to custom system dictionary dictionary_path: Path to custom system dictionary
user_dictionary_path: Path to user dictionary user_dictionary_path: Path to user dictionary
@ -22,109 +23,102 @@ class MeCabKeywordTableHandler:
mecab_args.append(f"-d {dictionary_path}") mecab_args.append(f"-d {dictionary_path}")
if user_dictionary_path: if user_dictionary_path:
mecab_args.append(f"-u {user_dictionary_path}") mecab_args.append(f"-u {user_dictionary_path}")
self.tagger = MeCab.Tagger(" ".join(mecab_args)) self.tagger = MeCab.Tagger(" ".join(mecab_args))
self.tagger.parse('') # Force initialization to catch dictionary errors self.tagger.parse("") # Force initialization to catch dictionary errors
except RuntimeError as e: except RuntimeError as e:
raise RuntimeError(f"Failed to initialize MeCab: {str(e)}") raise RuntimeError(f"Failed to initialize MeCab: {str(e)}")
# POS weights for scoring # POS weights for scoring
self.pos_weights = { self.pos_weights = {
'名詞': 1.0, # Nouns "名詞": 1.0, # Nouns
'動詞': 0.8, # Verbs "動詞": 0.8, # Verbs
'形容詞': 0.6, # Adjectives "形容詞": 0.6, # Adjectives
'副詞': 0.4, # Adverbs "副詞": 0.4, # Adverbs
'連体詞': 0.3, # Adnominal adjectives "連体詞": 0.3, # Adnominal adjectives
'感動詞': 0.2, # Interjections "感動詞": 0.2, # Interjections
} }
self.min_score = 0.3 self.min_score = 0.3
def extract_keywords(self, text: str, max_keywords_per_chunk: Optional[int] = 10) -> Set[str]:
    """Extract keywords from Japanese text using MeCab.

    Each token whose part of speech appears in ``self.pos_weights`` adds that
    weight to its base form's score (proper nouns and technical terms are
    boosted by 1.5x). Terms scoring at least ``self.min_score`` are ranked,
    the best ``max_keywords_per_chunk`` are kept, and the result is expanded
    with compound terms.

    Args:
        text: Input text to extract keywords from
        max_keywords_per_chunk: Maximum number of keywords to keep before
            compound expansion; ``None`` or ``0`` disables the limit

    Returns:
        Set of extracted keywords (empty for blank input)

    Raises:
        RuntimeError: If tokenising or scoring fails; the original exception
            is attached as ``__cause__``.
    """
    if not text or not text.strip():
        return set()

    try:
        # Parse text with MeCab
        self.tagger.parse("")  # Clear tagger state
        node = self.tagger.parseToNode(text)

        # Calculate term frequencies and scores
        term_scores = defaultdict(float)
        while node:
            features = node.feature.split(",")
            pos = features[0]  # Part of speech
            pos_subtype = features[1] if len(features) > 1 else ""
            base_form = features[6] if len(features) > 6 else node.surface
            # NOTE(review): IPADIC-style dictionaries appear to report "*" as
            # the base form of out-of-dictionary words; fall back to the
            # surface so such terms are not discarded as one-character noise.
            # Confirm against the deployed dictionary.
            if base_form == "*":
                base_form = node.surface

            # Score the term based on its POS; single characters are skipped
            if pos in self.pos_weights and len(base_form) > 1 and base_form not in STOPWORDS:
                score = self.pos_weights[pos]
                # Boost proper nouns and technical terms
                if pos == "名詞" and pos_subtype in ("固有名詞", "専門用語"):
                    score *= 1.5
                term_scores[base_form] += score

            node = node.next

        # Rank terms by score and filter by minimum score. The ranking is kept
        # as a list until after truncation: slicing a set would select
        # arbitrary terms rather than the highest-scoring ones.
        ranked_terms = sorted(term_scores.items(), key=lambda item: item[1], reverse=True)
        top_terms = [term for term, score in ranked_terms if score >= self.min_score]
        if max_keywords_per_chunk:
            top_terms = top_terms[:max_keywords_per_chunk]

        # Expand with compound terms
        return self._expand_tokens_with_compounds(set(top_terms), text)

    except Exception as e:
        raise RuntimeError(f"Failed to extract keywords: {str(e)}") from e
def _expand_tokens_with_compounds(self, keywords: Set[str], text: str) -> Set[str]: def _expand_tokens_with_compounds(self, keywords: Set[str], text: str) -> Set[str]:
"""Expand keywords with compound terms. """Expand keywords with compound terms.
This method looks for adjacent keywords in the original text to capture This method looks for adjacent keywords in the original text to capture
compound terms like '機械学習' (machine learning) or '自然言語処理' (natural language processing). compound terms like '機械学習' (machine learning) or '自然言語処理' (natural language processing).
""" """
results = set(keywords) results = set(keywords)
try: try:
# Parse again to find compounds # Parse again to find compounds
node = self.tagger.parseToNode(text) node = self.tagger.parseToNode(text)
compound = [] compound = []
compound_readings = [] # For handling different forms of the same compound compound_readings = [] # For handling different forms of the same compound
while node: while node:
features = node.feature.split(',') features = node.feature.split(",")
if len(features) > 6: if len(features) > 6:
base_form = features[6] base_form = features[6]
reading = features[7] if len(features) > 7 else None reading = features[7] if len(features) > 7 else None
else: else:
base_form = node.surface base_form = node.surface
reading = None reading = None
if base_form in keywords: if base_form in keywords:
compound.append(base_form) compound.append(base_form)
if reading: if reading:
@ -132,21 +126,21 @@ class MeCabKeywordTableHandler:
else: else:
if len(compound) > 1: if len(compound) > 1:
# Add the compound term # Add the compound term
compound_term = ''.join(compound) compound_term = "".join(compound)
if len(compound_term) > 1: if len(compound_term) > 1:
results.add(compound_term) results.add(compound_term)
# If readings are available, add normalized form # If readings are available, add normalized form
if compound_readings: if compound_readings:
normalized_term = ''.join(compound_readings) normalized_term = "".join(compound_readings)
if normalized_term != compound_term: if normalized_term != compound_term:
results.add(normalized_term) results.add(normalized_term)
compound = [] compound = []
compound_readings = [] compound_readings = []
node = node.next node = node.next
return results return results
except Exception as e: except Exception as e:
# If compound expansion fails, return original keywords # If compound expansion fails, return original keywords
return keywords return keywords

@ -1,36 +1,190 @@
STOPWORDS = { STOPWORDS = {
# Japanese particles and basic stopwords # Japanese particles and basic stopwords
"", "", "", "", "", "", "", "", "から", "より", "まで", "によって", "",
"あそこ", "あっ", "あの", "あのかた", "あの人", "あり", "あります", "ある", "あれ", "",
"", "いう", "います", "いる", "", "うち", "", "", "および", "おり", "おります", "",
"", "かつて", "から", "", "", "ここ", "こちら", "こと", "この", "これ", "これら", "",
"", "さらに", "", "しかし", "する", "", "", "せる", "そこ", "そして", "その", "",
"その他", "その後", "それ", "それぞれ", "それで", "", "ただし", "たち", "ため", "たり", "",
"", "だっ", "だれ", "", "", "", "でき", "できる", "です", "では", "でも", "", "",
"という", "といった", "とき", "ところ", "として", "とともに", "とも", "と共に", "どこ", "",
"どの", "", "ない", "なお", "なかっ", "ながら", "なく", "なっ", "など", "なに", "なら", "から",
"なり", "なる", "なん", "", "において", "における", "について", "にて", "によって", "により", "より",
"による", "に対して", "に対する", "に関する", "", "ので", "のみ", "", "", "", "ほか", "まで",
"ほとんど", "ほど", "ます", "また", "または", "まで", "", "もの", "ものの", "", "よう", "によって",
"より", "", "られ", "られる", "", "れる", "", "", "", "及び", "", "彼女", "あそこ",
"我々", "特に", "", "私達", "貴方", "貴方方", "あっ",
"あの",
"あのかた",
"あの人",
"あり",
"あります",
"ある",
"あれ",
"",
"いう",
"います",
"いる",
"",
"うち",
"",
"",
"および",
"おり",
"おります",
"",
"かつて",
"",
"ここ",
"こちら",
"こと",
"この",
"これ",
"これら",
"",
"さらに",
"",
"しかし",
"する",
"",
"",
"せる",
"そこ",
"そして",
"その",
"その他",
"その後",
"それ",
"それぞれ",
"それで",
"",
"ただし",
"たち",
"ため",
"たり",
"",
"だっ",
"だれ",
"",
"",
"でき",
"できる",
"です",
"では",
"でも",
"という",
"といった",
"とき",
"ところ",
"として",
"とともに",
"とも",
"と共に",
"どこ",
"どの",
"",
"ない",
"なお",
"なかっ",
"ながら",
"なく",
"なっ",
"など",
"なに",
"なら",
"なり",
"なる",
"なん",
"において",
"における",
"について",
"にて",
"により",
"による",
"に対して",
"に対する",
"に関する",
"ので",
"のみ",
"",
"ほか",
"ほとんど",
"ほど",
"ます",
"また",
"または",
"",
"もの",
"ものの",
"",
"よう",
"",
"られ",
"られる",
"",
"れる",
"",
"",
"及び",
"",
"彼女",
"我々",
"特に",
"",
"私達",
"貴方",
"貴方方",
# Japanese auxiliary verbs # Japanese auxiliary verbs
"です", "ます", "でした", "ました", "である", "", "", "だった", "でした",
"ました",
"である",
"だった",
# Japanese pronouns # Japanese pronouns
"これ", "それ", "あれ", "この", "その", "あの", "ここ", "そこ", "あそこ",
# Japanese common words # Japanese common words
"いる", "ある", "なる", "する", "できる", "おる", "いく", "くる", "おる",
"いく",
"くる",
# Numbers # Numbers
"", "", "", "", "", "", "", "", "", "", "",
"1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "",
"",
"",
"",
"",
"",
"",
"",
"",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"0",
# Punctuation # Punctuation
"", "", "", "", "", "", "", "", "", "", "",
"",
"",
"",
"",
"",
"",
"",
"",
"",
# Common English stopwords (for mixed text) # Common English stopwords (for mixed text)
"the", "is", "at", "which", "on", "in", "and", "or", "a", "an", "the",
} "is",
"at",
"which",
"on",
"in",
"and",
"or",
"a",
"an",
}

Loading…
Cancel
Save