linting

1 year ago · 77030d7581
parent 81c5953fa5
commit 77030d7581
5 changed files with 294 additions and 179 deletions
--- a/api/core/rag/datasource/keyword/keyword_factory.py
+++ b/api/core/rag/datasource/keyword/keyword_factory.py
@@ -22,9 +22,11 @@ class Keyword:
        match keyword_type:
            case KeyWordType.JIEBA:
                from core.rag.datasource.keyword.jieba.jieba import Jieba
                return Jieba
            case KeyWordType.MECAB:
                from core.rag.datasource.keyword.mecab.mecab import MeCab
                return MeCab
            case _:
                raise ValueError(f"Keyword store {keyword_type} is not supported.")
--- a/api/core/rag/datasource/keyword/mecab/config.py
+++ b/api/core/rag/datasource/keyword/mecab/config.py
@@ -1,7 +1,9 @@
 from pydantic import BaseModel
 class MeCabConfig(BaseModel):
    """Configuration for MeCab keyword processor."""
    max_keywords_per_chunk: int = 10
    min_keyword_length: int = 2
    score_threshold: float = 0.3
@@ -12,8 +14,8 @@ class MeCabConfig(BaseModel):
    dictionary_path: str = ""  # Optional custom dictionary path
    user_dictionary_path: str = ""  # Optional user dictionary path
    pos_weights: dict = {
-        '名詞': 1.0,      # Nouns
+        "名詞": 1.0,  # Nouns
-        '動詞': 0.8,      # Verbs
+        "動詞": 0.8,  # Verbs
-        '形容詞': 0.6,    # Adjectives
+        "形容詞": 0.6,  # Adjectives
-        '副詞': 0.4,      # Adverbs
+        "副詞": 0.4,  # Adverbs
    }
--- a/api/core/rag/datasource/keyword/mecab/mecab.py
+++ b/api/core/rag/datasource/keyword/mecab/mecab.py
@@ -1,38 +1,41 @@
 import json
 import logging
-from typing import Any, Optional
 from collections import defaultdict
+from typing import Any, Optional
 from core.rag.datasource.keyword.keyword_base import BaseKeyword
-from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
 from core.rag.datasource.keyword.mecab.config import MeCabConfig
+from core.rag.datasource.keyword.mecab.mecab_keyword_table_handler import MeCabKeywordTableHandler
 from core.rag.models.document import Document
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from extensions.ext_storage import storage
-from models.dataset import Dataset, DatasetKeywordTable, DocumentSegment
+from models.dataset import Dataset, DocumentSegment
 logger = logging.getLogger(__name__)
 class KeywordProcessorError(Exception):
    """Base error for keyword processing."""
    pass
 class KeywordExtractionError(KeywordProcessorError):
    """Error during keyword extraction."""
    pass
 class KeywordStorageError(KeywordProcessorError):
    """Error during storage operations."""
    pass
 class SetEncoder(json.JSONEncoder):
    """JSON encoder that handles sets."""
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
@@ -52,8 +55,7 @@ class MeCab(BaseKeyword):
        """Initialize MeCab handler with configuration."""
        try:
            self._keyword_handler = MeCabKeywordTableHandler(
-                dictionary_path=self._config.dictionary_path,
+                dictionary_path=self._config.dictionary_path, user_dictionary_path=self._config.user_dictionary_path
-                user_dictionary_path=self._config.user_dictionary_path
            )
            if self._config.pos_weights:
                self._keyword_handler.pos_weights = self._config.pos_weights
@@ -70,19 +72,12 @@ class MeCab(BaseKeyword):
            for text in texts:
                keywords = self._keyword_handler.extract_keywords(
-                    text.page_content,
+                    text.page_content, self._config.max_keywords_per_chunk
-                    self._config.max_keywords_per_chunk
                )
                if text.metadata is not None:
-                    self._update_segment_keywords(
+                    self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
-                        self.dataset.id,
-                        text.metadata["doc_id"],
-                        list(keywords)
-                    )
                    keyword_table = self._add_text_to_keyword_table(
-                        keyword_table or {},
+                        keyword_table or {}, text.metadata["doc_id"], list(keywords)
-                        text.metadata["doc_id"],
-                        list(keywords)
                    )
            self._save_dataset_keyword_table(keyword_table)
@@ -100,25 +95,17 @@ class MeCab(BaseKeyword):
                    keywords = keywords_list[i]
                    if not keywords:
                        keywords = self._keyword_handler.extract_keywords(
-                            text.page_content,
+                            text.page_content, self._config.max_keywords_per_chunk
-                            self._config.max_keywords_per_chunk
                        )
                else:
                    keywords = self._keyword_handler.extract_keywords(
-                        text.page_content,
+                        text.page_content, self._config.max_keywords_per_chunk
-                        self._config.max_keywords_per_chunk
                    )
                if text.metadata is not None:
-                    self._update_segment_keywords(
+                    self._update_segment_keywords(self.dataset.id, text.metadata["doc_id"], list(keywords))
-                        self.dataset.id,
-                        text.metadata["doc_id"],
-                        list(keywords)
-                    )
                    keyword_table = self._add_text_to_keyword_table(
-                        keyword_table or {},
+                        keyword_table or {}, text.metadata["doc_id"], list(keywords)
-                        text.metadata["doc_id"],
-                        list(keywords)
                    )
            self._save_dataset_keyword_table(keyword_table)
@@ -156,20 +143,13 @@ class MeCab(BaseKeyword):
        keyword_table = self._get_dataset_keyword_table()
        k = kwargs.get("top_k", 4)
-        sorted_chunk_indices = self._retrieve_ids_by_query(
+        sorted_chunk_indices = self._retrieve_ids_by_query(keyword_table or {}, query, k)
-            keyword_table or {},
-            query,
-            k
-        )
        documents = []
        for chunk_index in sorted_chunk_indices:
            segment = (
                db.session.query(DocumentSegment)
-                .filter(
+                .filter(DocumentSegment.dataset_id == self.dataset.id, DocumentSegment.index_node_id == chunk_index)
-                    DocumentSegment.dataset_id == self.dataset.id,
-                    DocumentSegment.index_node_id == chunk_index
-                )
                .first()
            )
@@ -201,11 +181,7 @@ class MeCab(BaseKeyword):
        """Save keyword table to storage."""
        table_dict = {
            "__type__": "keyword_table",
-            "__data__": {
+            "__data__": {"index_id": self.dataset.id, "summary": None, "table": keyword_table},
-                "index_id": self.dataset.id,
-                "summary": None,
-                "table": keyword_table
-            }
        }
        dataset_keyword_table = self.dataset.dataset_keyword_table
@@ -218,10 +194,7 @@ class MeCab(BaseKeyword):
            file_key = f"keyword_files/{self.dataset.tenant_id}/{self.dataset.id}.txt"
            if storage.exists(file_key):
                storage.delete(file_key)
-            storage.save(
+            storage.save(file_key, json.dumps(table_dict, cls=SetEncoder).encode("utf-8"))
-                file_key,
-                json.dumps(table_dict, cls=SetEncoder).encode("utf-8")
-            )
    def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict:
        """Add text keywords to table."""
@@ -253,20 +226,13 @@ class MeCab(BaseKeyword):
        # Score documents based on matching keywords
        chunk_indices_count = defaultdict(int)
-        keywords_list = [
+        keywords_list = [keyword for keyword in keywords if keyword in set(keyword_table.keys())]
-            keyword for keyword in keywords
-            if keyword in set(keyword_table.keys())
-        ]
        for keyword in keywords_list:
            for node_id in keyword_table[keyword]:
                chunk_indices_count[node_id] += 1
-        sorted_chunk_indices = sorted(
+        sorted_chunk_indices = sorted(chunk_indices_count.keys(), key=lambda x: chunk_indices_count[x], reverse=True)
-            chunk_indices_count.keys(),
-            key=lambda x: chunk_indices_count[x],
-            reverse=True
-        )
        return sorted_chunk_indices[:k]
@@ -274,10 +240,7 @@ class MeCab(BaseKeyword):
        """Update segment keywords in database."""
        document_segment = (
            db.session.query(DocumentSegment)
-            .filter(
+            .filter(DocumentSegment.dataset_id == dataset_id, DocumentSegment.index_node_id == node_id)
-                DocumentSegment.dataset_id == dataset_id,
-                DocumentSegment.index_node_id == node_id
-            )
            .first()
        )
--- a/api/core/rag/datasource/keyword/mecab/mecab_keyword_table_handler.py
+++ b/api/core/rag/datasource/keyword/mecab/mecab_keyword_table_handler.py
@@ -1,10 +1,11 @@
-import re
+from collections import defaultdict
 from typing import Optional, Set
 import MeCab
-from collections import defaultdict
 from core.rag.datasource.keyword.mecab.stopwords import STOPWORDS
 class MeCabKeywordTableHandler:
    """Japanese keyword extraction using MeCab morphological analyzer."""
@@ -24,19 +25,19 @@ class MeCabKeywordTableHandler:
                mecab_args.append(f"-u {user_dictionary_path}")
            self.tagger = MeCab.Tagger(" ".join(mecab_args))
-            self.tagger.parse('')  # Force initialization to catch dictionary errors
+            self.tagger.parse("")  # Force initialization to catch dictionary errors
        except RuntimeError as e:
            raise RuntimeError(f"Failed to initialize MeCab: {str(e)}")
        # POS weights for scoring
        self.pos_weights = {
-            '名詞': 1.0,      # Nouns
+            "名詞": 1.0,  # Nouns
-            '動詞': 0.8,      # Verbs
+            "動詞": 0.8,  # Verbs
-            '形容詞': 0.6,    # Adjectives
+            "形容詞": 0.6,  # Adjectives
-            '副詞': 0.4,      # Adverbs
+            "副詞": 0.4,  # Adverbs
-            '連体詞': 0.3,    # Adnominal adjectives
+            "連体詞": 0.3,  # Adnominal adjectives
-            '感動詞': 0.2,    # Interjections
+            "感動詞": 0.2,  # Interjections
        }
        self.min_score = 0.3
@@ -55,23 +56,23 @@ class MeCabKeywordTableHandler:
        try:
            # Parse text with MeCab
-            self.tagger.parse('')  # Clear tagger state
+            self.tagger.parse("")  # Clear tagger state
            node = self.tagger.parseToNode(text)
            # Calculate term frequencies and scores
            term_scores = defaultdict(float)
            while node:
-                features = node.feature.split(',')
+                features = node.feature.split(",")
                if len(features) > 0:
                    pos = features[0]  # Part of speech
-                    pos_subtype = features[1] if len(features) > 1 else ''
+                    pos_subtype = features[1] if len(features) > 1 else ""
                    base_form = features[6] if len(features) > 6 else node.surface
                    # Score the term based on its POS
                    if pos in self.pos_weights and base_form not in STOPWORDS:
                        score = self.pos_weights[pos]
                        # Boost proper nouns and technical terms
-                        if pos == '名詞' and pos_subtype in ['固有名詞', '専門用語']:
+                        if pos == "名詞" and pos_subtype in ["固有名詞", "専門用語"]:
                            score *= 1.5
                        if len(base_form) > 1:  # Filter out single characters
                            term_scores[base_form] += score
@@ -79,17 +80,10 @@ class MeCabKeywordTableHandler:
                node = node.next
            # Get top scoring terms
-            sorted_terms = sorted(
+            sorted_terms = sorted(term_scores.items(), key=lambda x: x[1], reverse=True)
-                term_scores.items(), 
-                key=lambda x: x[1], 
-                reverse=True
-            )
            # Filter by minimum score and take top N
-            keywords = {
+            keywords = {term for term, score in sorted_terms if score >= self.min_score}
-                term for term, score in sorted_terms 
-                if score >= self.min_score
-            }
            if max_keywords_per_chunk:
                keywords = set(list(keywords)[:max_keywords_per_chunk])
@@ -117,7 +111,7 @@ class MeCabKeywordTableHandler:
            compound_readings = []  # For handling different forms of the same compound
            while node:
-                features = node.feature.split(',')
+                features = node.feature.split(",")
                if len(features) > 6:
                    base_form = features[6]
                    reading = features[7] if len(features) > 7 else None
@@ -132,12 +126,12 @@ class MeCabKeywordTableHandler:
                else:
                    if len(compound) > 1:
                        # Add the compound term
-                        compound_term = ''.join(compound)
+                        compound_term = "".join(compound)
                        if len(compound_term) > 1:
                            results.add(compound_term)
                            # If readings are available, add normalized form
                            if compound_readings:
-                                normalized_term = ''.join(compound_readings)
+                                normalized_term = "".join(compound_readings)
                                if normalized_term != compound_term:
                                    results.add(normalized_term)
                    compound = []
--- a/api/core/rag/datasource/keyword/mecab/stopwords.py
+++ b/api/core/rag/datasource/keyword/mecab/stopwords.py
@@ -1,36 +1,190 @@
 STOPWORDS = {
    # Japanese particles and basic stopwords
-    "は", "が", "の", "に", "を", "で", "へ", "と", "から", "より", "まで", "によって",
+    "は",
-    "あそこ", "あっ", "あの", "あのかた", "あの人", "あり", "あります", "ある", "あれ",
+    "が",
-    "い", "いう", "います", "いる", "う", "うち", "え", "お", "および", "おり", "おります",
+    "の",
-    "か", "かつて", "から", "が", "き", "ここ", "こちら", "こと", "この", "これ", "これら",
+    "に",
-    "さ", "さらに", "し", "しかし", "する", "ず", "せ", "せる", "そこ", "そして", "その",
+    "を",
-    "その他", "その後", "それ", "それぞれ", "それで", "た", "ただし", "たち", "ため", "たり",
+    "で",
-    "だ", "だっ", "だれ", "つ", "て", "で", "でき", "できる", "です", "では", "でも", "と",
+    "へ",
-    "という", "といった", "とき", "ところ", "として", "とともに", "とも", "と共に", "どこ",
+    "と",
-    "どの", "な", "ない", "なお", "なかっ", "ながら", "なく", "なっ", "など", "なに", "なら",
+    "から",
-    "なり", "なる", "なん", "に", "において", "における", "について", "にて", "によって", "により",
+    "より",
-    "による", "に対して", "に対する", "に関する", "の", "ので", "のみ", "は", "ば", "へ", "ほか",
+    "まで",
-    "ほとんど", "ほど", "ます", "また", "または", "まで", "も", "もの", "ものの", "や", "よう",
+    "によって",
-    "より", "ら", "られ", "られる", "れ", "れる", "を", "ん", "何", "及び", "彼", "彼女",
+    "あそこ",
-    "我々", "特に", "私", "私達", "貴方", "貴方方",
+    "あっ",
-
+    "あの",
    "あのかた",
    "あの人",
    "あり",
    "あります",
    "ある",
    "あれ",
    "い",
    "いう",
    "います",
    "いる",
    "う",
    "うち",
    "え",
    "お",
    "および",
    "おり",
    "おります",
    "か",
    "かつて",
    "き",
    "ここ",
    "こちら",
    "こと",
    "この",
    "これ",
    "これら",
    "さ",
    "さらに",
    "し",
    "しかし",
    "する",
    "ず",
    "せ",
    "せる",
    "そこ",
    "そして",
    "その",
    "その他",
    "その後",
    "それ",
    "それぞれ",
    "それで",
    "た",
    "ただし",
    "たち",
    "ため",
    "たり",
    "だ",
    "だっ",
    "だれ",
    "つ",
    "て",
    "でき",
    "できる",
    "です",
    "では",
    "でも",
    "という",
    "といった",
    "とき",
    "ところ",
    "として",
    "とともに",
    "とも",
    "と共に",
    "どこ",
    "どの",
    "な",
    "ない",
    "なお",
    "なかっ",
    "ながら",
    "なく",
    "なっ",
    "など",
    "なに",
    "なら",
    "なり",
    "なる",
    "なん",
    "において",
    "における",
    "について",
    "にて",
    "により",
    "による",
    "に対して",
    "に対する",
    "に関する",
    "ので",
    "のみ",
    "ば",
    "ほか",
    "ほとんど",
    "ほど",
    "ます",
    "また",
    "または",
    "も",
    "もの",
    "ものの",
    "や",
    "よう",
    "ら",
    "られ",
    "られる",
    "れ",
    "れる",
    "ん",
    "何",
    "及び",
    "彼",
    "彼女",
    "我々",
    "特に",
    "私",
    "私達",
    "貴方",
    "貴方方",
    # Japanese auxiliary verbs
-    "です", "ます", "でした", "ました", "である", "だ", "な", "だった",
+    "でした",
-
+    "ました",
    "である",
    "だった",
    # Japanese pronouns
    "これ", "それ", "あれ", "この", "その", "あの", "ここ", "そこ", "あそこ",
    # Japanese common words
-    "いる", "ある", "なる", "する", "できる", "おる", "いく", "くる",
+    "おる",
-
+    "いく",
    "くる",
    # Numbers
-    "一", "二", "三", "四", "五", "六", "七", "八", "九", "十",
+    "一",
-    "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
+    "二",
-
+    "三",
    "四",
    "五",
    "六",
    "七",
    "八",
    "九",
    "十",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
    "0",
    # Punctuation
-    "、", "。", "「", "」", "『", "』", "（", "）", "［", "］",
+    "、",
-
+    "。",
    "「",
    "」",
    "『",
    "』",
    "（",
    "）",
    "［",
    "］",
    # Common English stopwords (for mixed text)
-    "the", "is", "at", "which", "on", "in", "and", "or", "a", "an",
+    "the",
    "is",
    "at",
    "which",
    "on",
    "in",
    "and",
    "or",
    "a",
    "an",
 }