|
|
|
|
@ -6,7 +6,6 @@ import re
|
|
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
|
from collections.abc import Callable, Collection, Iterable, Sequence, Set
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
from enum import Enum
|
|
|
|
|
from typing import (
|
|
|
|
|
Any,
|
|
|
|
|
Literal,
|
|
|
|
|
@ -477,27 +476,6 @@ class TokenTextSplitter(TextSplitter):
|
|
|
|
|
return split_text_on_tokens(text=text, tokenizer=tokenizer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Language(str, Enum):
    """Identifiers for the languages and markup formats with separator presets.

    Every member is also a ``str`` (the enum mixes in ``str``), so a member
    compares equal to its lowercase value, e.g. ``Language.GO == "go"``.
    """

    # Programming languages
    CPP = "cpp"
    GO = "go"
    JAVA = "java"
    JS = "js"
    PHP = "php"
    PROTO = "proto"
    PYTHON = "python"

    # reStructuredText markup
    RST = "rst"

    # More programming languages
    RUBY = "ruby"
    RUST = "rust"
    SCALA = "scala"
    SWIFT = "swift"

    # Document markup formats
    MARKDOWN = "markdown"
    LATEX = "latex"
    HTML = "html"

    # Solidity
    SOL = "sol"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RecursiveCharacterTextSplitter(TextSplitter):
|
|
|
|
|
"""Splitting text by recursively look at characters.
|
|
|
|
|
|
|
|
|
|
@ -554,350 +532,3 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|
|
|
|
|
|
|
|
|
def split_text(self, text: str) -> list[str]:
    """Split ``text`` into chunks using this splitter's own separators.

    Delegates to ``self._split_text``, handing it ``text`` together with
    ``self._separators``, and returns whatever that call produces.
    """
    recursive_split = self._split_text
    return recursive_split(text, self._separators)
|
|
|
|
|
|
|
|
|
|
@classmethod
def from_language(
    cls, language: Language, **kwargs: Any
) -> RecursiveCharacterTextSplitter:
    """Create a splitter configured with the separators for ``language``.

    Args:
        language: Language whose separator list is looked up through
            ``cls.get_separators_for_language``.
        **kwargs: Additional keyword arguments passed unchanged to the
            class constructor.

    Returns:
        The object produced by calling ``cls`` with ``separators`` set to
        the looked-up list.
    """
    return cls(separators=cls.get_separators_for_language(language), **kwargs)
|
|
|
|
|
|
|
|
|
|
@staticmethod
def get_separators_for_language(language: Language) -> list[str]:
    """Return the ordered separator list used to split text in ``language``.

    The lists run from the coarsest boundary (class / function / section
    starts) down to the finest; most end with an empty string. Several
    entries (for example the RST and Markdown ones) are written as
    regular-expression patterns.
    NOTE(review): presumably the caller applies every separator through
    ``re`` -- confirm against ``_split_text``.

    Args:
        language: A ``Language`` member. A plain string equal to a member's
            value (e.g. ``"python"``) is accepted as well, as it was when
            the lookup was done with ``==`` comparisons.

    Returns:
        A newly built list of separators; mutating it does not affect
        later calls.

    Raises:
        ValueError: If ``language`` does not correspond to a supported
            language.
    """
    # Tail shared by most languages: paragraph, line, word, then character.
    line_fallbacks = ["\n\n", "\n", " ", ""]

    separators_by_language: dict[str, list[str]] = {
        "cpp": [
            # Split along class definitions
            "\nclass ",
            # Split along function definitions
            "\nvoid ",
            "\nint ",
            "\nfloat ",
            "\ndouble ",
            # Split along control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nswitch ",
            "\ncase ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "go": [
            # Split along function definitions
            "\nfunc ",
            "\nvar ",
            "\nconst ",
            "\ntype ",
            # Split along control flow statements
            "\nif ",
            "\nfor ",
            "\nswitch ",
            "\ncase ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "java": [
            # Split along class definitions
            "\nclass ",
            # Split along method definitions
            "\npublic ",
            "\nprotected ",
            "\nprivate ",
            "\nstatic ",
            # Split along control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nswitch ",
            "\ncase ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "js": [
            # Split along function definitions
            "\nfunction ",
            "\nconst ",
            "\nlet ",
            "\nvar ",
            "\nclass ",
            # Split along control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nswitch ",
            "\ncase ",
            "\ndefault ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "php": [
            # Split along function definitions
            "\nfunction ",
            # Split along class definitions
            "\nclass ",
            # Split along control flow statements
            "\nif ",
            "\nforeach ",
            "\nwhile ",
            "\ndo ",
            "\nswitch ",
            "\ncase ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "proto": [
            # Split along message definitions
            "\nmessage ",
            # Split along service definitions
            "\nservice ",
            # Split along enum definitions
            "\nenum ",
            # Split along option definitions
            "\noption ",
            # Split along import statements
            "\nimport ",
            # Split along syntax declarations
            "\nsyntax ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "python": [
            # First, try to split along class and function definitions
            "\nclass ",
            "\ndef ",
            "\n\tdef ",
            # Now split by the normal type of lines
            *line_fallbacks,
        ],
        "rst": [
            # Split along section titles
            "\n=+\n",
            "\n-+\n",
            "\n\\*+\n",
            # Split along directive markers
            "\n\n.. *\n\n",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "ruby": [
            # Split along method definitions
            "\ndef ",
            "\nclass ",
            # Split along control flow statements
            "\nif ",
            "\nunless ",
            "\nwhile ",
            "\nfor ",
            "\ndo ",
            "\nbegin ",
            "\nrescue ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "rust": [
            # Split along function definitions
            "\nfn ",
            "\nconst ",
            "\nlet ",
            # Split along control flow statements
            "\nif ",
            "\nwhile ",
            "\nfor ",
            "\nloop ",
            "\nmatch ",
            # Repeats the entry above; kept so the returned list is unchanged.
            "\nconst ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "scala": [
            # Split along class definitions
            "\nclass ",
            "\nobject ",
            # Split along method definitions
            "\ndef ",
            "\nval ",
            "\nvar ",
            # Split along control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nmatch ",
            "\ncase ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "swift": [
            # Split along function definitions
            "\nfunc ",
            # Split along class definitions
            "\nclass ",
            "\nstruct ",
            "\nenum ",
            # Split along control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\ndo ",
            "\nswitch ",
            "\ncase ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
        "markdown": [
            # First, try to split along Markdown headings (one to six "#")
            "\n#{1,6} ",
            # Note the alternative syntax for headings (below) is not handled here
            # Heading level 2
            # ---------------
            # End of code block
            "```\n",
            # Horizontal lines: a line made of three or more "*", "-" or "_".
            # Other spellings of a rule (e.g. with spaces between the
            # characters) are not covered by these patterns.
            "\n\\*\\*\\*+\n",
            "\n---+\n",
            "\n___+\n",
            *line_fallbacks,
        ],
        "latex": [
            # First, try to split along Latex sections
            "\n\\\\chapter{",
            "\n\\\\section{",
            "\n\\\\subsection{",
            "\n\\\\subsubsection{",
            # Now split by environments. The backslash is escaped the same
            # way as in the sectioning entries above: writing "\\\b..."
            # would embed a backspace character (the "\b" escape) instead
            # of the text "\begin".
            "\n\\\\begin{enumerate}",
            "\n\\\\begin{itemize}",
            "\n\\\\begin{description}",
            "\n\\\\begin{list}",
            "\n\\\\begin{quote}",
            "\n\\\\begin{quotation}",
            "\n\\\\begin{verse}",
            "\n\\\\begin{verbatim}",
            # Now split by math environments
            "\n\\\\begin{align}",
            "$$",
            "$",
            # Now split by the normal type of lines
            " ",
            "",
        ],
        "html": [
            # First, try to split along HTML tags
            "<body",
            "<div",
            "<p",
            "<br",
            "<li",
            "<h1",
            "<h2",
            "<h3",
            "<h4",
            "<h5",
            "<h6",
            "<span",
            "<table",
            "<tr",
            "<td",
            "<th",
            "<ul",
            "<ol",
            "<header",
            "<footer",
            "<nav",
            # Head
            "<head",
            "<style",
            "<script",
            "<meta",
            "<title",
            "",
        ],
        "sol": [
            # Split along compiler information definitions
            "\npragma ",
            "\nusing ",
            # Split along contract definitions
            "\ncontract ",
            "\ninterface ",
            "\nlibrary ",
            # Split along method definitions
            "\nconstructor ",
            "\ntype ",
            "\nfunction ",
            "\nevent ",
            "\nmodifier ",
            "\nerror ",
            "\nstruct ",
            "\nenum ",
            # Split along control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\ndo while ",
            "\nassembly ",
            # Split by the normal type of lines
            *line_fallbacks,
        ],
    }

    # Enum members expose their string through ``.value``; plain strings
    # (which compare equal to the str-based members) are used as-is.
    key = getattr(language, "value", language)
    try:
        return separators_by_language[key]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are unsupported too.
        raise ValueError(
            f"Language {language} is not supported! "
            f"Please choose from {list(Language)}"
        ) from None
|
|
|
|
|
|