[WIP] before final test

1 year ago · 4f5a4e7194
parent 75dd8677b9
commit 4f5a4e7194
2 changed files with 287 additions and 167 deletions
--- a/api/core/rag/datasource/keyword/mecab/README.md
+++ b/api/core/rag/datasource/keyword/mecab/README.md
@ -1,6 +1,6 @@
-# MeCab Keyword Processor
+# MeCab Keyword Processor for Dify
-A Japanese text keyword extraction module using MeCab morphological analyzer for the Dify RAG system.
+A Japanese text keyword extraction module for Dify's RAG system, powered by the MeCab morphological analyzer.
 ## Overview
@ -85,115 +85,71 @@ Comprehensive Japanese stopword list including:
 from core.rag.datasource.keyword.keyword_factory import Keyword
 from models.dataset import Dataset
-# Initialize
+# Initialize with KEYWORD_STORE = "mecab" in config
-dataset = Dataset(...)
+keyword_processor = Keyword(dataset)
 keyword_processor = Keyword(dataset)  # Will use MeCab if KEYWORD_STORE = "mecab"
-# Process text
+# Process documents
 documents = [
    Document(
        page_content="自然言語処理は人工知能の重要な分野です。",
-        metadata={"doc_id": "1", ...}
+        metadata={"doc_id": "1"}
    )
 ]
 keyword_processor.create(documents)
 # Search
-results = keyword_processor.search("自然言語処理について")
+results = keyword_processor.search("自然言語処理")
 ```
-### Custom Dictionary Usage
+## Configuration
-```python
+### Basic Settings
 # In your configuration:
 KEYWORD_PROCESSOR_CONFIG = {
    "dictionary_path": "/path/to/mecab/dict",
    "user_dictionary_path": "/path/to/user.dic",
    "pos_weights": {
        "名詞": 1.2,
        "動詞": 0.8,
        # ... customize weights
    }
 }
 ```
 ## Features
 ### 1. Keyword Extraction
 - **POS-based Scoring**:
  - Weights different parts of speech
  - Boosts important terms
  - Configurable scoring thresholds
 - **Compound Word Detection**:
 ```python
-  # Input text: "自然言語処理の研究"
+# In your environment configuration:
-  # Detected compounds:
+KEYWORD_STORE = "mecab"
-  # - "自然言語"
+KEYWORD_DATA_SOURCE_TYPE = "database"  # or other supported storage types
  # - "自然言語処理"
  # - "言語処理"
 ```
- **Reading Normalization**:
+### Advanced Settings
 ```python
-  # Handles variations:
+# MeCab-specific configuration
-  # - "データベース" (katakana)
+MECAB_CONFIG = {
-  # - "データベース" (with readings)
+    "max_keywords_per_chunk": 10,
-  # Both normalize to same term
+    "score_threshold": 0.3,
-  ```
+    "dictionary_path": "/path/to/dict",      # Optional
-
+    "user_dictionary_path": "/path/to/user_dict",  # Optional
-### 2. Storage
+    "pos_weights": {
-
+        "名詞": 1.0,  # Nouns
- **Flexible Storage Options**:
+        "動詞": 0.8,  # Verbs
-  - Database storage
+        "形容詞": 0.6  # Adjectives
  - File-based storage
  - Redis-based locking for concurrency
 - **Data Structure**:
  ```python
  {
      "__type__": "keyword_table",
      "__data__": {
          "index_id": "dataset_id",
          "table": {
              "keyword1": ["doc_id1", "doc_id2"],
              "keyword2": ["doc_id2", "doc_id3"],
          }
    }
 }
 ```
-### 3. Error Handling
+## Key Features
- Comprehensive error handling
+### 1. Intelligent Keyword Extraction
 - Custom exception classes
 - Logging integration
 - Graceful fallbacks
-## Performance Considerations
+- Part-of-speech based scoring
 - Compound word detection
 - Technical term recognition
 - Reading normalization for variations
-1. **Memory Usage**:
+### 2. Storage Options
   - Efficient keyword table structure
   - Batch processing support
   - Caching mechanisms
-2. **Concurrency**:
+- Database storage (default)
-   - Redis-based locking
+- File-based storage
-   - Transaction handling
+- Concurrent access support via Redis locking
   - Safe concurrent access
-3. **Optimization Tips**:
+### 3. Error Handling
   - Use appropriate batch sizes
   - Configure caching timeouts
   - Adjust scoring thresholds
-## Dependencies
+- Comprehensive exception handling
 - Detailed logging
 - Graceful fallbacks
- MeCab and Python bindings:
+## Dependencies
 ```bash
 # Ubuntu/Debian
@ -206,68 +162,178 @@ KEYWORD_PROCESSOR_CONFIG = {
 ## Best Practices
-1. **Dictionary Management**:
+1. **Performance**
-   - Keep dictionaries updated
+   - Use batch processing for large datasets
-   - Use domain-specific user dictionaries
+   - Configure appropriate cache timeouts
-   - Regular maintenance of custom terms
+   - Monitor memory usage
-2. **Configuration Tuning**:
+2. **Customization**
   - Update dictionaries regularly
   - Adjust POS weights for your use case
   - Set appropriate thresholds
   - Monitor and adjust batch sizes
-3. **Error Handling**:
+3. **Error Handling**
   - Implement proper logging
-   - Monitor extraction quality
+   - Handle dictionary loading errors
-   - Handle edge cases
+   - Manage concurrent access
-## Testing
+## Example Usage
-Example test cases:
+### Basic Keyword Extraction
 ```python
-def test_basic_extraction():
+# Extract keywords from text
 text = "自然言語処理は人工知能の重要な分野です。"
-    keywords = handler.extract_keywords(text)
+keywords = keyword_processor.create([
-    assert "自然言語処理" in keywords
+    Document(page_content=text, metadata={"doc_id": "1"})
-    assert "人工知能" in keywords
+])
-
+```
-def test_compound_words():
+
-    text = "機械学習モデルを使った自然言語処理"
+### Custom Dictionary
-    keywords = handler.extract_keywords(text)
+
-    assert "機械学習" in keywords
+```python
-    assert "自然言語処理" in keywords
+# Use custom dictionary
-
+config = MeCabConfig(
-def test_mixed_text():
+    dictionary_path="/path/to/dict",
-    text = "AIを使った自然言語処理のResearch"
+    user_dictionary_path="/path/to/user.dic"
-    keywords = handler.extract_keywords(text)
+)
-    assert "AI" in keywords
+```
-    assert "自然言語処理" in keywords
+
-    assert "Research" in keywords
+### Batch Processing
 ```python
 # Process multiple documents
 documents = [
    Document(page_content=text1, metadata={"doc_id": "1"}),
    Document(page_content=text2, metadata={"doc_id": "2"})
 ]
 keyword_processor.create(documents)
 ```
-## Common Issues and Solutions
+## Integration with Dify
 The MeCab processor integrates seamlessly with Dify's existing keyword system:
 1. Implements the `BaseKeyword` interface
 2. Works with the keyword factory system
 3. Supports all standard operations:
   - Document indexing
   - Keyword extraction
   - Search functionality
   - Index management
 ## Common Issues
-1. **Dictionary Loading Failures**:
+1. **Dictionary Loading**
   ```python
   try:
-       handler = MeCabKeywordTableHandler(dictionary_path=path)
+       keyword_processor.create(documents)
-   except RuntimeError as e:
+   except KeywordProcessorError as e:
-       # Handle dictionary loading error
+       logger.error("Dictionary loading failed: %s", str(e))
   ```
 2. **Memory Management**
   ```python
   # Process in batches
   batch_size = 100
   for i in range(0, len(documents), batch_size):
       batch = documents[i:i + batch_size]
       keyword_processor.create(batch)
   ```
 3. **Concurrent Access**
   ```python
   # Handled automatically via Redis locks
   keyword_processor.create(documents)  # Safe for concurrent use
   ```
 For more details, refer to the [Dify Documentation](https://docs.dify.ai).
 ## Text Processing Examples
 ### Compound Words
 The MeCab processor intelligently handles compound words in Japanese text:
 ```python
 text = "人工知能と機械学習の研究を行っています。"
 keywords = keyword_processor.create([
    Document(page_content=text, metadata={"doc_id": "1"})
 ])
 # Extracted keywords include:
 # - "人工知能" (artificial intelligence - compound)
 # - "機械学習" (machine learning - compound)
 # - "研究" (research - single)
 ```
 Complex technical terms are properly recognized:
 ```python
 text = "自然言語処理における深層学習の応用"
 # Extracts:
 # - "自然言語処理" (natural language processing)
 # - "深層学習" (deep learning)
 # - "応用" (application)
 ```
 ### Stopwords Handling
 Common particles and auxiliary words are automatically filtered:
 ```python
 text = "私はデータベースの設計をしています。"
 # Ignores:
 # - "は" (particle)
 # - "の" (particle)
 # - "を" (particle)
 # - "います" (auxiliary verb)
 # Extracts:
 # - "データベース" (database)
 # - "設計" (design)
 ```
-2. **Memory Usage**:
+Mixed-language text is also handled appropriately:
 ```python
-   # Use batch processing for large datasets
+text = "AIシステムのパフォーマンスを改善する。"
-   for batch in chunks(documents, size=100):
+# Ignores:
-       process_batch(batch)
+# - "の" (particle)
 # - "を" (particle)
 # - "する" (auxiliary verb)
 # Extracts:
 # - "AI" (kept as is)
 # - "システム" (system)
 # - "パフォーマンス" (performance)
 # - "改善" (improvement)
 ```
-3. **Concurrent Access**:
+### Reading Variations
 The processor normalizes different forms of the same word:
 ```python
 text1 = "データベース設計"  # katakana
 text2 = "データベース設計"  # with readings
 # Both normalize to the same keywords:
 # - "データベース"
 # - "設計"
 ```
 ### Technical Term Boosting
 Technical terms receive higher scores in keyword extraction:
 ```python
-   with redis_client.lock(f"lock_{dataset_id}"):
+text = "機械学習モデルを用いた自然言語処理の研究"
-       # Safe concurrent operations
+# Prioritizes technical terms:
 # High score:
 # - "機械学習" (machine learning)
 # - "自然言語処理" (natural language processing)
 # Lower score:
 # - "研究" (research)
 # - "モデル" (model)
 ```
--- a/api/core/rag/datasource/keyword/mecab/mecab.py
+++ b/api/core/rag/datasource/keyword/mecab/mecab.py
@ -2,7 +2,7 @@ import json
 import logging
 import os
 from collections import defaultdict
-from typing import Any, Dict, List, Optional, Set
+from typing import Any, Optional
 from core.rag.datasource.keyword.keyword_base import BaseKeyword
 from core.rag.datasource.keyword.mecab.config import MeCabConfig
@ -18,21 +18,25 @@ logger = logging.getLogger(__name__)
 class KeywordProcessorError(Exception):
    """Base error for keyword processing."""
    pass
 class KeywordExtractionError(KeywordProcessorError):
    """Error during keyword extraction."""
    pass
 class KeywordStorageError(KeywordProcessorError):
    """Error during storage operations."""
    pass
 class SetEncoder(json.JSONEncoder):
    """JSON encoder that handles sets."""
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
@ -52,8 +56,7 @@ class MeCab(BaseKeyword):
        """Initialize MeCab handler with configuration."""
        try:
            self._keyword_handler = MeCabKeywordTableHandler(
-                dictionary_path=self._config.dictionary_path,
+                dictionary_path=self._config.dictionary_path, user_dictionary_path=self._config.user_dictionary_path
                user_dictionary_path=self._config.user_dictionary_path
            )
            if self._config.pos_weights:
                self._keyword_handler.pos_weights = self._config.pos_weights
@ -62,8 +65,21 @@ class MeCab(BaseKeyword):
            logger.exception("Failed to initialize MeCab handler")
            raise KeywordProcessorError("MeCab initialization failed: {}".format(str(e)))
-    def create(self, texts: List[Document], **kwargs: Any) -> BaseKeyword:
+    def create(self, texts: list[Document], **kwargs: Any) -> BaseKeyword:
-        """Create keyword index for documents."""
+        """Create keyword index for documents.
        Args:
            texts: List of documents to index
            **kwargs: Additional arguments
        Returns:
            BaseKeyword: Self for method chaining
        Raises:
            KeywordProcessorError: If indexing fails
            KeywordExtractionError: If keyword extraction fails
            KeywordStorageError: If storage operations fail
        """
        if not texts:
            return self
@ -105,8 +121,17 @@ class MeCab(BaseKeyword):
        return self
-    def add_texts(self, texts: List[Document], **kwargs: Any) -> None:
+    def add_texts(self, texts: list[Document], **kwargs: Any) -> None:
-        """Add new texts to existing index."""
+        """Add new texts to existing index.
        Args:
            texts: List of documents to add
            **kwargs: Additional arguments including optional keywords_list
        Raises:
            KeywordProcessorError: If indexing fails
            KeywordStorageError: If storage operations fail
        """
        if not texts:
            return
@ -156,17 +181,38 @@ class MeCab(BaseKeyword):
            raise
    def text_exists(self, id: str) -> bool:
-        """Check if text exists in index."""
+        """Check if text exists in index.
        Args:
            id: Document ID to check
        Returns:
            bool: True if text exists, False otherwise
        Raises:
            KeywordProcessorError: If check fails
        """
        if not id:
            return False
        try:
            keyword_table = self._get_dataset_keyword_table()
            if keyword_table is None:
                return False
            return id in set.union(*keyword_table.values()) if keyword_table else False
        except Exception as e:
            logger.exception("Failed to check text existence")
            raise KeywordProcessorError("Failed to check text existence: {}".format(str(e)))
-    def delete_by_ids(self, ids: List[str]) -> None:
+    def delete_by_ids(self, ids: list[str]) -> None:
-        """Delete texts by IDs."""
+        """Delete texts by IDs.
        Args:
            ids: List of document IDs to delete
        Raises:
            KeywordStorageError: If deletion fails
        """
        if not ids:
            return
@ -182,7 +228,11 @@ class MeCab(BaseKeyword):
            raise KeywordStorageError("Failed to delete documents: {}".format(str(e)))
    def delete(self) -> None:
-        """Delete entire index."""
+        """Delete entire index.
        Raises:
            KeywordStorageError: If deletion fails
        """
        lock_name = "keyword_indexing_lock_{}".format(self.dataset.id)
        try:
            with redis_client.lock(lock_name, timeout=600):
@ -197,8 +247,19 @@ class MeCab(BaseKeyword):
            logger.exception("Failed to delete index")
            raise KeywordStorageError("Failed to delete index: {}".format(str(e)))
-    def search(self, query: str, **kwargs: Any) -> List[Document]:
+    def search(self, query: str, **kwargs: Any) -> list[Document]:
-        """Search documents using keywords."""
+        """Search documents using keywords.
        Args:
            query: Search query string
            **kwargs: Additional arguments including optional top_k
        Returns:
            list[Document]: List of matching documents
        Raises:
            KeywordProcessorError: If search fails
        """
        if not query:
            return []
@ -214,10 +275,7 @@ class MeCab(BaseKeyword):
            for chunk_index in sorted_chunk_indices:
                segment = (
                    db.session.query(DocumentSegment)
-                    .filter(
+                    .filter(DocumentSegment.dataset_id == self.dataset.id, DocumentSegment.index_node_id == chunk_index)
                        DocumentSegment.dataset_id == self.dataset.id,
                        DocumentSegment.index_node_id == chunk_index
                    )
                    .first()
                )
@ -239,7 +297,7 @@ class MeCab(BaseKeyword):
            logger.exception("Failed to search documents")
            raise KeywordProcessorError("Search failed: {}".format(str(e)))
-    def _get_dataset_keyword_table(self) -> Optional[Dict[str, Set[str]]]:
+    def _get_dataset_keyword_table(self) -> Optional[dict[str, set[str]]]:
        """Get keyword table from storage."""
        try:
            dataset_keyword_table = self.dataset.dataset_keyword_table
@ -273,7 +331,7 @@ class MeCab(BaseKeyword):
            logger.exception("Failed to get keyword table")
            raise KeywordStorageError("Failed to get keyword table: {}".format(str(e)))
-    def _save_dataset_keyword_table(self, keyword_table: Dict[str, Set[str]]) -> None:
+    def _save_dataset_keyword_table(self, keyword_table: dict[str, set[str]]) -> None:
        """Save keyword table to storage."""
        if keyword_table is None:
            raise ValueError("Keyword table cannot be None")
@ -303,8 +361,8 @@ class MeCab(BaseKeyword):
            raise KeywordStorageError("Failed to save keyword table: {}".format(str(e)))
    def _add_text_to_keyword_table(
-        self, keyword_table: Dict[str, Set[str]], id: str, keywords: List[str]
+        self, keyword_table: dict[str, set[str]], id: str, keywords: list[str]
-    ) -> Dict[str, Set[str]]:
+    ) -> dict[str, set[str]]:
        """Add text keywords to table."""
        if not id or not keywords:
            return keyword_table
@ -315,9 +373,7 @@ class MeCab(BaseKeyword):
            keyword_table[keyword].add(id)
        return keyword_table
-    def _delete_ids_from_keyword_table(
+    def _delete_ids_from_keyword_table(self, keyword_table: dict[str, set[str]], ids: list[str]) -> dict[str, set[str]]:
        self, keyword_table: Dict[str, Set[str]], ids: List[str]
    ) -> Dict[str, Set[str]]:
        """Delete IDs from keyword table."""
        if not keyword_table or not ids:
            return keyword_table
@ -336,9 +392,7 @@ class MeCab(BaseKeyword):
        return keyword_table
-    def _retrieve_ids_by_query(
+    def _retrieve_ids_by_query(self, keyword_table: dict[str, set[str]], query: str, k: int = 4) -> list[str]:
        self, keyword_table: Dict[str, Set[str]], query: str, k: int = 4
    ) -> List[str]:
        """Retrieve document IDs by query."""
        if not query or not keyword_table:
            return []
@ -366,9 +420,9 @@ class MeCab(BaseKeyword):
            logger.exception("Failed to retrieve IDs by query")
            raise KeywordExtractionError("Failed to retrieve IDs: {}".format(str(e)))
-    def _update_segment_keywords(self, dataset_id: str, node_id: str, keywords: List[str]) -> None:
+    def _update_segment_keywords(self, dataset_id: str, node_id: str, keywords: list[str]) -> None:
        """Update segment keywords in database."""
-        if not dataset_id or not node_id:
+        if not dataset_id or not node_id or not keywords:
            return
        try:
@ -386,7 +440,7 @@ class MeCab(BaseKeyword):
            logger.exception("Failed to update segment keywords")
            raise KeywordStorageError("Failed to update segment keywords: {}".format(str(e)))
-    def create_segment_keywords(self, node_id: str, keywords: List[str]) -> None:
+    def create_segment_keywords(self, node_id: str, keywords: list[str]) -> None:
        """Create keywords for a single segment.
        Args:
@ -405,7 +459,7 @@ class MeCab(BaseKeyword):
            logger.exception("Failed to create segment keywords")
            raise KeywordProcessorError("Failed to create segment keywords: {}".format(str(e)))
-    def multi_create_segment_keywords(self, pre_segment_data_list: List[Dict[str, Any]]) -> None:
+    def multi_create_segment_keywords(self, pre_segment_data_list: list[dict[str, Any]]) -> None:
        """Create keywords for multiple segments in batch."""
        if not pre_segment_data_list:
            return
@ -443,7 +497,7 @@ class MeCab(BaseKeyword):
            logger.exception("Failed to create multiple segment keywords")
            raise KeywordProcessorError("Failed to create multiple segment keywords: {}".format(str(e)))
-    def update_segment_keywords_index(self, node_id: str, keywords: List[str]) -> None:
+    def update_segment_keywords_index(self, node_id: str, keywords: list[str]) -> None:
        """Update keywords index for a segment.
        Args: