From c14a6a6609632a160fc8b8be76babc064d308f74 Mon Sep 17 00:00:00 2001
From: baonudesifeizhai <baonudesifeizhai@gmail.com>
Date: Thu, 24 Jul 2025 02:05:17 -0400
Subject: [PATCH] fix: support Chinese regex separators in text segmentation

- Remove re.escape() usage to preserve regex functionality
- Replace str.split() with re.split() for regex support
- Replace 'in' operator with re.search() for regex patterns
- Add proper separator preservation logic for regex patterns
- Drop chunks that are whitespace-only or whose measured length is at most 1

Fixes #22765
---
 api/core/rag/splitter/fixed_text_splitter.py | 71 +++++++++++++++++---
 api/core/rag/splitter/text_splitter.py       |  3 +-
 2 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/api/core/rag/splitter/fixed_text_splitter.py b/api/core/rag/splitter/fixed_text_splitter.py
index bcaf299892..e5180c2166 100644
--- a/api/core/rag/splitter/fixed_text_splitter.py
+++ b/api/core/rag/splitter/fixed_text_splitter.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import re
 from typing import Any, Optional
 
 from core.model_manager import ModelInstance
@@ -66,17 +67,54 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
     def split_text(self, text: str) -> list[str]:
         """Split incoming text and return chunks."""
         if self._fixed_separator:
-            chunks = text.split(self._fixed_separator)
+            # Use re.split() instead of str.split() to support regex patterns
+            if self._keep_separator:
+                # For regex patterns, we need to handle separator preservation differently
+                # Use re.finditer to find all matches and manually construct splits
+                chunks = []
+                last_end = 0
+                for match in re.finditer(self._fixed_separator, text):
+                    # Add text before the match
+                    if match.start() > last_end:
+                        chunks.append(text[last_end : match.start()])
+                    # Add the matched separator + following content until next match or end
+                    separator_start = match.start()
+                    separator_end = match.end()
+
+                    # Find the next match to determine where this chunk should end
+                    next_match = None
+                    for next_m in re.finditer(self._fixed_separator, text[separator_end:]):
+                        next_match = next_m
+                        break
+
+                    if next_match:
+                        # There's a next match, so this chunk ends at the next separator
+                        chunk_end = separator_end + next_match.start()
+                        chunks.append(text[separator_start:chunk_end])
+                        last_end = separator_end + next_match.start()
+                    else:
+                        # This is the last match, so include all remaining text
+                        chunks.append(text[separator_start:])
+                        last_end = len(text)
+                        break
+
+                # No separator match was found: keep the whole text as a single chunk
+                if not chunks and text:
+                    chunks.append(text)
+            else:
+                chunks = re.split(self._fixed_separator, text)
         else:
             chunks = [text]
 
         final_chunks = []
         chunks_lengths = self._length_function(chunks)
         for chunk, chunk_length in zip(chunks, chunks_lengths):
-            if chunk_length > self._chunk_size:
-                final_chunks.extend(self.recursive_split_text(chunk))
-            else:
-                final_chunks.append(chunk)
+            # Skip chunks that are whitespace-only or whose measured length is at most 1
+            if chunk_length > 1 and chunk.strip():  # Skip chunks with only 1 character or empty/whitespace
+                if chunk_length > self._chunk_size:
+                    final_chunks.extend(self.recursive_split_text(chunk))
+                else:
+                    final_chunks.append(chunk)
 
         return final_chunks
 
@@ -91,7 +129,8 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
             if _s == "":
                 separator = _s
                 break
-            if _s in text:
+            # Use re.search() instead of 'in' to support regex patterns
+            if re.search(_s, text):
                 separator = _s
                 new_separators = self._separators[i + 1 :]
                 break
@@ -101,8 +140,24 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
             if separator == " ":
                 splits = text.split()
             else:
-                splits = text.split(separator)
-                splits = [item + separator if i < len(splits) else item for i, item in enumerate(splits)]
+                # Use re.split() instead of str.split() to support regex patterns
+                if self._keep_separator:
+                    # For regex patterns, we need to handle separator preservation differently
+                    # Use re.finditer to find all matches and manually construct splits
+                    splits = []
+                    last_end = 0
+                    for match in re.finditer(separator, text):
+                        # Add text before the match
+                        if match.start() > last_end:
+                            splits.append(text[last_end : match.start()])
+                        # Add the matched separator
+                        splits.append(match.group(0))
+                        last_end = match.end()
+                    # Add remaining text after last match
+                    if last_end < len(text):
+                        splits.append(text[last_end:])
+                else:
+                    splits = re.split(separator, text)
         else:
             splits = list(text)
         splits = [s for s in splits if (s not in {"", "\n"})]
diff --git a/api/core/rag/splitter/text_splitter.py b/api/core/rag/splitter/text_splitter.py
index 529d8ccd27..c1c2885a8b 100644
--- a/api/core/rag/splitter/text_splitter.py
+++ b/api/core/rag/splitter/text_splitter.py
@@ -26,7 +26,8 @@ def _split_text_with_regex(text: str, separator: str, keep_separator: bool) -> l
     if separator:
         if keep_separator:
             # The parentheses in the pattern keep the delimiters in the result.
-            _splits = re.split(f"({re.escape(separator)})", text)
+            # Don't use re.escape() to preserve regex functionality
+            _splits = re.split(f"({separator})", text)
             splits = [_splits[i - 1] + _splits[i] for i in range(1, len(_splits), 2)]
             if len(_splits) % 2 != 0:
                 splits += _splits[-1:]