|
|
|
@ -2,6 +2,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
from typing import Any, Optional
|
|
|
|
from typing import Any, Optional
|
|
|
|
|
|
|
|
|
|
|
|
from core.model_manager import ModelInstance
|
|
|
|
from core.model_manager import ModelInstance
|
|
|
|
@ -66,17 +67,54 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
|
|
|
|
def split_text(self, text: str) -> list[str]:
    """Split ``text`` on the fixed separator and return the resulting chunks.

    ``self._fixed_separator`` is interpreted as a regular expression. When it
    is falsy, the whole text is treated as a single piece.

    With ``self._keep_separator`` set, every piece after the first starts
    with the separator match that introduced it; otherwise the separator
    matches are removed.

    Args:
        text: The text to split.

    Returns:
        The pieces in their original order. A piece is dropped when
        ``self._length_function`` reports a length of 1 or less for it, or
        when it is empty or whitespace-only. A piece whose reported length
        exceeds ``self._chunk_size`` is replaced by the output of
        ``self.recursive_split_text``.
    """
    if not self._fixed_separator:
        chunks = [text]
    elif self._keep_separator:
        # Scan the full text exactly once and cut at the start of every
        # separator match. Searching again inside a slice of the text (one
        # search per match) is quadratic and, for zero-width or
        # context-sensitive patterns such as lookaheads or ``^``, can
        # disagree with the scan over the full text and lose the tail.
        cut_points = [m.start() for m in re.finditer(self._fixed_separator, text)]
        bounds = [0, *cut_points, len(text)]
        # Empty slices (match at offset 0, match at the very end, or two
        # matches starting at the same offset) carry no content; skip them.
        chunks = [text[lo:hi] for lo, hi in zip(bounds, bounds[1:]) if lo < hi]
    else:
        # NOTE: re.split() also returns the text of any capturing groups
        # present in the pattern.
        chunks = re.split(self._fixed_separator, text)

    final_chunks = []
    chunks_lengths = self._length_function(chunks)
    for chunk, chunk_length in zip(chunks, chunks_lengths):
        # Discard pieces that are too short to be useful or that are blank.
        # NOTE(review): the threshold is in whatever unit _length_function
        # reports (characters or tokens) - confirm which one is intended.
        if chunk_length <= 1 or not chunk.strip():
            continue
        if chunk_length > self._chunk_size:
            final_chunks.extend(self.recursive_split_text(chunk))
        else:
            final_chunks.append(chunk)

    return final_chunks
|
|
|
|
|
|
|
|
|
|
|
|
@ -91,7 +129,8 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
|
|
|
|
if _s == "":
|
|
|
|
if _s == "":
|
|
|
|
separator = _s
|
|
|
|
separator = _s
|
|
|
|
break
|
|
|
|
break
|
|
|
|
if _s in text:
|
|
|
|
# Use re.search() instead of 'in' to support regex patterns
|
|
|
|
|
|
|
|
if re.search(_s, text):
|
|
|
|
separator = _s
|
|
|
|
separator = _s
|
|
|
|
new_separators = self._separators[i + 1 :]
|
|
|
|
new_separators = self._separators[i + 1 :]
|
|
|
|
break
|
|
|
|
break
|
|
|
|
@ -101,8 +140,24 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
|
|
|
|
if separator == " ":
|
|
|
|
if separator == " ":
|
|
|
|
splits = text.split()
|
|
|
|
splits = text.split()
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
splits = text.split(separator)
|
|
|
|
# Use re.split() instead of str.split() to support regex patterns
|
|
|
|
splits = [item + separator if i < len(splits) else item for i, item in enumerate(splits)]
|
|
|
|
if self._keep_separator:
|
|
|
|
|
|
|
|
# For regex patterns, we need to handle separator preservation differently
|
|
|
|
|
|
|
|
# Use re.finditer to find all matches and manually construct splits
|
|
|
|
|
|
|
|
splits = []
|
|
|
|
|
|
|
|
last_end = 0
|
|
|
|
|
|
|
|
for match in re.finditer(separator, text):
|
|
|
|
|
|
|
|
# Add text before the match
|
|
|
|
|
|
|
|
if match.start() > last_end:
|
|
|
|
|
|
|
|
splits.append(text[last_end : match.start()])
|
|
|
|
|
|
|
|
# Add the matched separator
|
|
|
|
|
|
|
|
splits.append(match.group(0))
|
|
|
|
|
|
|
|
last_end = match.end()
|
|
|
|
|
|
|
|
# Add remaining text after last match
|
|
|
|
|
|
|
|
if last_end < len(text):
|
|
|
|
|
|
|
|
splits.append(text[last_end:])
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
splits = re.split(separator, text)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
splits = list(text)
|
|
|
|
splits = list(text)
|
|
|
|
splits = [s for s in splits if (s not in {"", "\n"})]
|
|
|
|
splits = [s for s in splits if (s not in {"", "\n"})]
|
|
|
|
|