fix: preserve Chinese numeric prefixes in text segmentation and fix linter error (#22765)

pull/22817/head
baonudesifeizhai 7 months ago
parent 6d3e198c3c
commit 74c0baa7cc

@@ -2,6 +2,7 @@
 from __future__ import annotations
+import re
 from typing import Any, Optional
 from core.model_manager import ModelInstance
@@ -57,6 +58,9 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
 class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter):
+    # Regex special characters for detection
+    _regex_chars = ["(", ")", "[", "]", "{", "}", "*", "+", "?", "|", "\\", ".", "^", "$"]
+
     def __init__(self, fixed_separator: str = "\n\n", separators: Optional[list[str]] = None, **kwargs: Any):
        """Create a new TextSplitter."""
        super().__init__(**kwargs)
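The check driven by `_regex_chars` is a plain substring scan, not a regex parse: a fixed separator is routed to the regex path if it contains any metacharacter from the list. A minimal standalone sketch of that check (the sample separators are illustrative, not from the commit):

```python
# Sketch of the separator check; sample inputs are illustrative.
_regex_chars = ["(", ")", "[", "]", "{", "}", "*", "+", "?", "|", "\\", ".", "^", "$"]

def looks_like_regex(separator: str) -> bool:
    # True if the separator contains any regex metacharacter,
    # in which case split_text() takes the finditer-based path below.
    return any(char in separator for char in _regex_chars)

print(looks_like_regex("\n\n"))                      # False: plain str.split()
print(looks_like_regex("[一二三四五六七八九十]+、"))   # True: manual regex split
```

One consequence worth noting: a separator such as `"..."` is also routed to the regex path, since `.` is a metacharacter.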
@@ -66,6 +70,22 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
     def split_text(self, text: str) -> list[str]:
         """Split incoming text and return chunks."""
         if self._fixed_separator:
-            chunks = text.split(self._fixed_separator)
+            # Check if the separator contains regex special characters
+            is_regex = any(char in self._fixed_separator for char in self._regex_chars)
+            if is_regex:
+                # For regex separators, use finditer to find all matches and split manually
+                chunks = self._split_with_regex_manual(text, self._fixed_separator)
+                # Handle large chunks at sentence boundaries while preserving regex structure
+                final_chunks = []
+                for chunk in chunks:
+                    if len(chunk) > self._chunk_size:
+                        final_chunks.extend(self._split_large_regex_chunk(chunk))
+                    else:
+                        final_chunks.append(chunk)
+                return final_chunks
+            else:
+                # Use regular string splitting for simple separators
+                chunks = text.split(self._fixed_separator)
         else:
             chunks = [text]
@@ -123,6 +143,12 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
                     _good_splits_lengths = []
                 if not new_separators:
                     final_chunks.append(s)
                 else:
-                    other_info = self._split_text(s, new_separators)
-                    final_chunks.extend(other_info)
+                    # For regex separators, use custom splitting to preserve structure
+                    is_regex = any(char in self._fixed_separator for char in self._regex_chars)
+                    if is_regex:
+                        other_info = self._split_large_regex_chunk(s)
+                        final_chunks.extend(other_info)
+                    else:
+                        other_info = self._split_text(s, new_separators)
+                        final_chunks.extend(other_info)
@@ -154,3 +180,75 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
             final_chunks.append(current_part)
         return final_chunks
 
+    def _split_with_regex_manual(self, text: str, pattern: str) -> list[str]:
+        """Manually split text using regex pattern by finding all matches."""
+        # Find all matches
+        matches = list(re.finditer(pattern, text))
+        if not matches:
+            return [text]
+
+        chunks = []
+        last_end = 0
+        for i, match in enumerate(matches):
+            # Get the matched separator (e.g., "一、", "二、")
+            separator = match.group(0)
+            # Find the end of this section (next match or end of text)
+            next_start = len(text)
+            if i + 1 < len(matches):
+                next_start = matches[i + 1].start()
+            # Create a chunk that includes the separator and all content up to next separator
+            chunk_content = text[match.start() : next_start].strip()
+            if chunk_content:
+                chunks.append(chunk_content)
+            last_end = next_start
+
+        # Add any remaining text after the last match
+        if last_end < len(text):
+            remaining = text[last_end:].strip()
+            if remaining:
+                chunks.append(remaining)
+
+        return chunks
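Run in isolation, this finditer-based split keeps each numeric prefix attached to its section, which is exactly what `str.split` on the separator would lose. A condensed standalone replica of the method (sample text and pattern are illustrative; the trailing-remainder branch is omitted because the final section already runs to the end of the text):

```python
import re

def split_with_regex_manual(text: str, pattern: str) -> list[str]:
    # Condensed replica of _split_with_regex_manual above.
    # Note: any text before the first match is not captured.
    matches = list(re.finditer(pattern, text))
    if not matches:
        return [text]
    chunks = []
    for i, match in enumerate(matches):
        next_start = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chunk_content = text[match.start() : next_start].strip()
        if chunk_content:
            chunks.append(chunk_content)
    return chunks

text = "一、总则:适用于全体员工。二、考勤:工作日打卡两次。三、附则:自发布之日起施行。"
print(split_with_regex_manual(text, r"[一二三四五六七八九十]+、"))
# ['一、总则:适用于全体员工。', '二、考勤:工作日打卡两次。', '三、附则:自发布之日起施行。']
```

The second helper added in the same hunk, `_split_large_regex_chunk`, then breaks any section that still exceeds `chunk_size` at sentence boundaries: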
+    def _split_large_regex_chunk(self, chunk: str) -> list[str]:
+        """Split large regex chunks at sentence boundaries while preserving structure."""
+        # Split at sentence boundaries (。!?.!?)
+        sentence_pattern = r"([。!?.!?])"
+        sentences = re.split(sentence_pattern, chunk)
+
+        # Rejoin sentences with their punctuation
+        sentences = ["".join(sentences[i : i + 2]) for i in range(0, len(sentences) - 1, 2)]
+        if len(sentences) % 2 == 1:
+            sentences.append(sentences[-1])
+
+        # Filter out empty sentences
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        # Group sentences into chunks that fit within chunk_size
+        chunks = []
+        current_chunk = ""
+        current_length = 0
+        for sentence in sentences:
+            sentence_length = len(sentence)
+            # If adding this sentence would exceed chunk_size, start a new chunk
+            if current_length + sentence_length > self._chunk_size and current_chunk:
+                chunks.append(current_chunk.strip())
+                current_chunk = sentence
+                current_length = sentence_length
+            else:
+                current_chunk += sentence
+                current_length += sentence_length
+
+        # Add the last chunk if it exists
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+
+        return chunks
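A minimal sketch of that sentence-grouping idea, simplified rather than copied from the commit (it pads the split list so a trailing fragment without closing punctuation is kept, a detail the committed rejoin handles with its own odd-length check; sample text and sizes are illustrative):

```python
import re

def group_sentences(chunk: str, chunk_size: int) -> list[str]:
    # Split on CJK and ASCII sentence-ending punctuation, keeping each
    # mark attached to the sentence it terminates.
    parts = re.split(r"([。!?.!?])", chunk)
    if len(parts) % 2 == 1:
        parts.append("")  # pad so a trailing unpunctuated fragment pairs up
    sentences = ["".join(parts[i : i + 2]) for i in range(0, len(parts), 2)]
    sentences = [s.strip() for s in sentences if s.strip()]

    # Greedily pack sentences into chunks of at most chunk_size characters.
    chunks: list[str] = []
    current = ""
    for sentence in sentences:
        if current and len(current) + len(sentence) > chunk_size:
            chunks.append(current)
            current = sentence
        else:
            current += sentence
    if current:
        chunks.append(current)
    return chunks

print(group_sentences("第一句。第二句!第三句?尾部无标点", chunk_size=8))
# ['第一句。第二句!', '第三句?', '尾部无标点']
```

Note that a single sentence longer than `chunk_size` still comes through as one oversized chunk; neither the sketch nor the committed method splits inside a sentence.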

@@ -317,7 +317,8 @@ class ToolNode(BaseNode):
         elif message.type == ToolInvokeMessage.MessageType.FILE:
             assert message.meta is not None
             assert isinstance(message.meta, dict)
-            assert "file" in message.meta and isinstance(message.meta["file"], File)
+            assert "file" in message.meta
+            assert isinstance(message.meta["file"], File)
             files.append(message.meta["file"])
         elif message.type == ToolInvokeMessage.MessageType.LOG:
             assert isinstance(message.message, ToolInvokeMessage.LogMessage)
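This last hunk is the "fix linter error" part of the commit title: the compound assertion is split in two, which also means a failing run reports exactly which condition broke. A tiny illustrative sketch (the `File` stand-in here is hypothetical, not the real model class):

```python
from dataclasses import dataclass

@dataclass
class File:  # hypothetical stand-in for the real File model
    name: str

meta = {"file": File(name="report.pdf")}

# Asserted separately, a failure's traceback points at one specific check:
assert "file" in meta
assert isinstance(meta["file"], File)
```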
