From 74c0baa7cc7e76a2481c51ebf946bab8e9646b0a Mon Sep 17 00:00:00 2001
From: baonudesifeizhai
Date: Wed, 23 Jul 2025 01:32:14 -0400
Subject: [PATCH] fix: preserve Chinese numeric prefixes in text segmentation
 and fix linter error (#22765)

---
 api/core/rag/splitter/fixed_text_splitter.py | 97 ++++++++++++++++++-
 api/core/workflow/nodes/tool/tool_node.py    |  3 +-
 2 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/api/core/rag/splitter/fixed_text_splitter.py b/api/core/rag/splitter/fixed_text_splitter.py
index bcaf299892..e020aba395 100644
--- a/api/core/rag/splitter/fixed_text_splitter.py
+++ b/api/core/rag/splitter/fixed_text_splitter.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import re
 from typing import Any, Optional
 
 from core.model_manager import ModelInstance
@@ -57,6 +58,9 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
 
 
 class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter):
+    # Regex special characters used to detect regex separators
+    _regex_chars = ["(", ")", "[", "]", "{", "}", "*", "+", "?", "|", "\\", ".", "^", "$"]
+
     def __init__(self, fixed_separator: str = "\n\n", separators: Optional[list[str]] = None, **kwargs: Any):
         """Create a new TextSplitter."""
         super().__init__(**kwargs)
@@ -66,7 +70,23 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
     def split_text(self, text: str) -> list[str]:
         """Split incoming text and return chunks."""
         if self._fixed_separator:
-            chunks = text.split(self._fixed_separator)
+            # Check if the separator contains regex special characters
+            is_regex = any(char in self._fixed_separator for char in self._regex_chars)
+
+            if is_regex:
+                # For regex separators, use finditer to find all matches and split manually
+                chunks = self._split_with_regex_manual(text, self._fixed_separator)
+                # Handle large chunks at sentence boundaries while preserving regex structure
+                final_chunks = []
+                for chunk in chunks:
+                    if len(chunk) > self._chunk_size:
+                        final_chunks.extend(self._split_large_regex_chunk(chunk))
+                    else:
+                        final_chunks.append(chunk)
+                return final_chunks
+            else:
+                # Use regular string splitting for simple separators
+                chunks = text.split(self._fixed_separator)
         else:
             chunks = [text]
 
@@ -124,8 +144,14 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
                 if not new_separators:
                     final_chunks.append(s)
                 else:
-                    other_info = self._split_text(s, new_separators)
-                    final_chunks.extend(other_info)
+                    # For regex separators, use custom splitting to preserve structure
+                    is_regex = any(char in self._fixed_separator for char in self._regex_chars)
+                    if is_regex:
+                        other_info = self._split_large_regex_chunk(s)
+                        final_chunks.extend(other_info)
+                    else:
+                        other_info = self._split_text(s, new_separators)
+                        final_chunks.extend(other_info)
 
         if _good_splits:
             merged_text = self._merge_splits(_good_splits, _separator, _good_splits_lengths)
@@ -154,3 +180,68 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
                     final_chunks.append(current_part)
 
         return final_chunks
+
+    def _split_with_regex_manual(self, text: str, pattern: str) -> list[str]:
+        """Manually split text using a regex pattern by finding all matches."""
+        matches = list(re.finditer(pattern, text))
+
+        if not matches:
+            return [text]
+
+        chunks = []
+
+        # Keep any text that appears before the first separator match
+        preamble = text[: matches[0].start()].strip()
+        if preamble:
+            chunks.append(preamble)
+
+        for i, match in enumerate(matches):
+            # Find the end of this section (next match or end of text)
+            next_start = len(text)
+            if i + 1 < len(matches):
+                next_start = matches[i + 1].start()
+
+            # Create a chunk that keeps the matched separator (e.g., "一、", "二、")
+            # and all content up to the next separator
+            chunk_content = text[match.start() : next_start].strip()
+            if chunk_content:
+                chunks.append(chunk_content)
+
+        return chunks
+
+    def _split_large_regex_chunk(self, chunk: str) -> list[str]:
+        """Split large regex chunks at sentence boundaries while preserving structure."""
+        # Split at sentence boundaries (fullwidth and halfwidth 。!?.!?)
+        sentence_pattern = r"([。!?.!?])"
+        parts = re.split(sentence_pattern, chunk)
+
+        # Rejoin each sentence with its punctuation; keep any trailing text without punctuation
+        sentences = ["".join(parts[i : i + 2]) for i in range(0, len(parts) - 1, 2)]
+        if parts[-1]:
+            sentences.append(parts[-1])
+
+        # Filter out empty sentences
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        # Group sentences into chunks that fit within chunk_size
+        chunks = []
+        current_chunk = ""
+        current_length = 0
+
+        for sentence in sentences:
+            sentence_length = len(sentence)
+
+            # If adding this sentence would exceed chunk_size, start a new chunk
+            if current_length + sentence_length > self._chunk_size and current_chunk:
+                chunks.append(current_chunk.strip())
+                current_chunk = sentence
+                current_length = sentence_length
+            else:
+                current_chunk += sentence
+                current_length += sentence_length
+
+        # Add the last chunk if it exists
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+
+        return chunks
diff --git a/api/core/workflow/nodes/tool/tool_node.py b/api/core/workflow/nodes/tool/tool_node.py
index 86d36f474d..c7b354c084 100644
--- a/api/core/workflow/nodes/tool/tool_node.py
+++ b/api/core/workflow/nodes/tool/tool_node.py
@@ -317,7 +317,8 @@ class ToolNode(BaseNode):
             elif message.type == ToolInvokeMessage.MessageType.FILE:
                 assert message.meta is not None
                 assert isinstance(message.meta, dict)
-                assert "file" in message.meta and isinstance(message.meta["file"], File)
+                assert "file" in message.meta
+                assert isinstance(message.meta["file"], File)
                 files.append(message.meta["file"])
             elif message.type == ToolInvokeMessage.MessageType.LOG:
                 assert isinstance(message.message, ToolInvokeMessage.LogMessage)
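
Editor's note — usage sketch (not part of the patch): the snippet below illustrates
the new regex-separator path, assuming FixedRecursiveCharacterTextSplitter can be
constructed directly with the chunk_size/chunk_overlap keyword arguments inherited
from the base TextSplitter. The separator pattern and sample text are invented for
illustration; they are not values from the PR.

    from core.rag.splitter.fixed_text_splitter import FixedRecursiveCharacterTextSplitter

    # "[" and "]" appear in _regex_chars, so split_text takes the regex branch and
    # delegates to _split_with_regex_manual instead of plain str.split.
    # Pattern and text are illustrative examples only.
    splitter = FixedRecursiveCharacterTextSplitter(
        fixed_separator=r"[一二三四五六七八九十]、",
        separators=["\n\n", "。", ". ", " ", ""],
        chunk_size=200,
        chunk_overlap=0,
    )

    text = "一、总则。本办法用于规范数据处理流程。二、适用范围。本办法适用于全体员工。"
    for chunk in splitter.split_text(text):
        print(chunk)
    # Expected output keeps each Chinese numeric prefix with its section:
    #   一、总则。本办法用于规范数据处理流程。
    #   二、适用范围。本办法适用于全体员工。

Sections longer than chunk_size are further broken at sentence punctuation by
_split_large_regex_chunk, so an oversized "三、..." section comes back as several
sentence-aligned chunks rather than being merged across the next numeric prefix.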