Spaces:
Runtime error
Runtime error
from __future__ import annotations | |
import re | |
from typing import Any, List, Literal, Optional, Union | |
from langchain_text_splitters.base import Language, TextSplitter | |
class CharacterTextSplitter(TextSplitter):
    """Text splitter that cuts the input on one configurable separator."""

    def __init__(
        self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
    ) -> None:
        """Store the separator settings and forward the rest to the base class.

        Args:
            separator: The string (or regex pattern) to split on.
            is_separator_regex: When True, ``separator`` is used as a regex
                pattern as-is; otherwise it is escaped and matched literally.
            **kwargs: Passed through unchanged to ``TextSplitter.__init__``.
        """
        super().__init__(**kwargs)
        self._separator = separator
        self._is_separator_regex = is_separator_regex

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` on the separator, then merge the pieces into chunks.

        ``self._keep_separator`` and ``self._merge_splits`` are not defined in
        this class; presumably they are provided by ``TextSplitter``.
        """
        # Step 1: naive split of the whole input on the separator pattern.
        if self._is_separator_regex:
            pattern = self._separator
        else:
            pattern = re.escape(self._separator)
        pieces = _split_text_with_regex(text, pattern, self._keep_separator)

        # Step 2: merge. When separators were kept inside the pieces nothing
        # has to be re-inserted between them, hence the empty joiner.
        joiner = "" if self._keep_separator else self._separator
        return self._merge_splits(pieces, joiner)
def _split_text_with_regex( | |
text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]] | |
) -> List[str]: | |
# Now that we have the separator, split the text | |
if separator: | |
if keep_separator: | |
# The parentheses in the pattern keep the delimiters in the result. | |
_splits = re.split(f"({separator})", text) | |
splits = ( | |
([_splits[i] + _splits[i + 1] for i in range(0, len(_splits) - 1, 2)]) | |
if keep_separator == "end" | |
else ([_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]) | |
) | |
if len(_splits) % 2 == 0: | |
splits += _splits[-1:] | |
splits = ( | |
(splits + [_splits[-1]]) | |
if keep_separator == "end" | |
else ([_splits[0]] + splits) | |
) | |
else: | |
splits = re.split(separator, text) | |
else: | |
splits = list(text) | |
return [s for s in splits if s != ""] | |
class RecursiveCharacterTextSplitter(TextSplitter):
    """Split text by recursively trying a list of separators.

    The separators are tried in order. The first one found in the text is
    used to split it, and any piece that is still too long is split again
    with the separators that come after it.
    """

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: Union[bool, Literal["start", "end"]] = True,
        is_separator_regex: bool = False,
        **kwargs: Any,
    ) -> None:
        """Create a new splitter.

        Args:
            separators: Separators to try, in priority order. Defaults to
                paragraph break, newline, space, then the empty string.
            keep_separator: Forwarded to ``TextSplitter.__init__``; also
                controls whether matched separators stay in the pieces.
            is_separator_regex: When True the separators are used as regex
                patterns as-is; otherwise they are escaped and matched
                literally.
            **kwargs: Passed through unchanged to ``TextSplitter.__init__``.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        self._separators = separators or ["\n\n", "\n", " ", ""]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split ``text`` using the first applicable separator, recursing on
        pieces that are still too long.

        Args:
            text: The text to split.
            separators: Candidate separators, in priority order.

        Returns:
            The resulting chunks, in order.
        """
        final_chunks = []
        # Pick the separator: the first one that is empty or occurs in the
        # text. If none qualifies, fall back to the last one in the list.
        separator = separators[-1]
        new_separators = []
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
            if _s == "":
                separator = _s
                break
            if re.search(_separator, text):
                separator = _s
                # Only lower-priority separators are used for recursion.
                new_separators = separators[i + 1 :]
                break

        _separator = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex(text, _separator, self._keep_separator)

        # Now go merging things, recursively splitting longer texts.
        _good_splits = []
        # When separators were kept inside the pieces, nothing is re-inserted.
        _separator = "" if self._keep_separator else separator
        for s in splits:
            if self._length_function(s) < self._chunk_size:
                _good_splits.append(s)
            else:
                # Flush the short pieces gathered so far before handling the
                # oversized one, so the output order matches the input order.
                if _good_splits:
                    merged_text = self._merge_splits(_good_splits, _separator)
                    final_chunks.extend(merged_text)
                    _good_splits = []
                if not new_separators:
                    # Nothing finer to split with: emit the piece as it is.
                    final_chunks.append(s)
                else:
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits:
            merged_text = self._merge_splits(_good_splits, _separator)
            final_chunks.extend(merged_text)
        return final_chunks

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` using the separators configured on this instance."""
        return self._split_text(text, self._separators)

    @classmethod
    def from_language(
        cls, language: Language, **kwargs: Any
    ) -> RecursiveCharacterTextSplitter:
        """Build a splitter using the separators defined for ``language``.

        The separators are regex patterns, so the splitter is created with
        ``is_separator_regex=True``. Remaining keyword arguments are passed
        to the constructor.
        """
        separators = cls.get_separators_for_language(language)
        return cls(separators=separators, is_separator_regex=True, **kwargs)

    @staticmethod
    def get_separators_for_language(language: Language) -> List[str]:
        """Return the regex separators for ``language``, in priority order.

        Raises:
            ValueError: If ``language`` has no separator list here, or is not
                a ``Language`` value at all.
        """
        if language == Language.CPP:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nvoid ",
                "\nint ",
                "\nfloat ",
                "\ndouble ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.GO:
            return [
                # Split along function definitions
                "\nfunc ",
                "\nvar ",
                "\nconst ",
                "\ntype ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.JAVA:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.KOTLIN:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\ninternal ",
                "\ncompanion ",
                "\nfun ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nwhen ",
                "\ncase ",
                "\nelse ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.JS:
            return [
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.TS:
            return [
                "\nenum ",
                "\ninterface ",
                "\nnamespace ",
                "\ntype ",
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PHP:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along class definitions
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nforeach ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PROTO:
            return [
                # Split along message definitions
                "\nmessage ",
                # Split along service definitions
                "\nservice ",
                # Split along enum definitions
                "\nenum ",
                # Split along option definitions
                "\noption ",
                # Split along import statements
                "\nimport ",
                # Split along syntax declarations
                "\nsyntax ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PYTHON:
            return [
                # First, try to split along class definitions
                "\nclass ",
                "\ndef ",
                "\n\tdef ",
                # Now split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RST:
            return [
                # Split along section titles
                "\n=+\n",
                "\n-+\n",
                "\n\\*+\n",
                # Split along directive markers
                "\n\n.. *\n\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RUBY:
            return [
                # Split along method definitions
                "\ndef ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\nwhile ",
                "\nfor ",
                "\ndo ",
                "\nbegin ",
                "\nrescue ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RUST:
            return [
                # Split along function definitions
                "\nfn ",
                "\nconst ",
                "\nlet ",
                # Split along control flow statements
                "\nif ",
                "\nwhile ",
                "\nfor ",
                "\nloop ",
                "\nmatch ",
                "\nconst ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SCALA:
            return [
                # Split along class definitions
                "\nclass ",
                "\nobject ",
                # Split along method definitions
                "\ndef ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nmatch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SWIFT:
            return [
                # Split along function definitions
                "\nfunc ",
                # Split along class definitions
                "\nclass ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.MARKDOWN:
            return [
                # First, try to split along Markdown headings (levels 1 to 6)
                "\n#{1,6} ",
                # Note the alternative syntax for headings (below) is not handled here
                # Heading level 2
                # ---------------
                # End of code block
                "```\n",
                # Horizontal lines made of three or more *, - or _ characters
                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                # Note that horizontal lines written with spaces between the
                # characters (e.g. "* * *") are not handled here
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.LATEX:
            return [
                # First, try to split along Latex sections
                "\n\\\\chapter{",
                "\n\\\\section{",
                "\n\\\\subsection{",
                "\n\\\\subsubsection{",
                # Now split by environments
                "\n\\\\begin{enumerate}",
                "\n\\\\begin{itemize}",
                "\n\\\\begin{description}",
                "\n\\\\begin{list}",
                "\n\\\\begin{quote}",
                "\n\\\\begin{quotation}",
                "\n\\\\begin{verse}",
                "\n\\\\begin{verbatim}",
                # Now split by math environments
                # (four backslashes: a literal backslash once the regex is
                # compiled, like the entries above)
                "\n\\\\begin{align}",
                # "$" is a regex anchor, so it is escaped to match the
                # literal math delimiters
                "\\$\\$",
                "\\$",
                # Now split by the normal type of lines
                " ",
                "",
            ]
        elif language == Language.HTML:
            return [
                # First, try to split along HTML tags
                "<body",
                "<div",
                "<p",
                "<br",
                "<li",
                "<h1",
                "<h2",
                "<h3",
                "<h4",
                "<h5",
                "<h6",
                "<span",
                "<table",
                "<tr",
                "<td",
                "<th",
                "<ul",
                "<ol",
                "<header",
                "<footer",
                "<nav",
                # Head
                "<head",
                "<style",
                "<script",
                "<meta",
                "<title",
                "",
            ]
        elif language == Language.CSHARP:
            return [
                "\ninterface ",
                "\nenum ",
                "\nimplements ",
                "\ndelegate ",
                "\nevent ",
                # Split along class definitions
                "\nclass ",
                "\nabstract ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                "\nreturn ",
                # Split along control flow statements
                "\nif ",
                "\ncontinue ",
                "\nfor ",
                "\nforeach ",
                "\nwhile ",
                "\nswitch ",
                "\nbreak ",
                "\ncase ",
                "\nelse ",
                # Split by exceptions
                "\ntry ",
                "\nthrow ",
                "\nfinally ",
                "\ncatch ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SOL:
            return [
                # Split along compiler information definitions
                "\npragma ",
                "\nusing ",
                # Split along contract definitions
                "\ncontract ",
                "\ninterface ",
                "\nlibrary ",
                # Split along method definitions
                "\nconstructor ",
                "\ntype ",
                "\nfunction ",
                "\nevent ",
                "\nmodifier ",
                "\nerror ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo while ",
                "\nassembly ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.COBOL:
            return [
                # Split along divisions
                "\nIDENTIFICATION DIVISION.",
                "\nENVIRONMENT DIVISION.",
                "\nDATA DIVISION.",
                "\nPROCEDURE DIVISION.",
                # Split along sections within DATA DIVISION
                "\nWORKING-STORAGE SECTION.",
                "\nLINKAGE SECTION.",
                "\nFILE SECTION.",
                # Split along sections within PROCEDURE DIVISION
                "\nINPUT-OUTPUT SECTION.",
                # Split along paragraphs and common statements
                "\nOPEN ",
                "\nCLOSE ",
                "\nREAD ",
                "\nWRITE ",
                "\nIF ",
                "\nELSE ",
                "\nMOVE ",
                "\nPERFORM ",
                "\nUNTIL ",
                "\nVARYING ",
                "\nACCEPT ",
                "\nDISPLAY ",
                "\nSTOP RUN.",
                # Split by the normal type of lines
                "\n",
                " ",
                "",
            ]
        elif language == Language.LUA:
            return [
                # Split along variable and table definitions
                "\nlocal ",
                # Split along function definitions
                "\nfunction ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nrepeat ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.HASKELL:
            return [
                # Split along function definitions
                "\nmain :: ",
                "\nmain = ",
                "\nlet ",
                "\nin ",
                "\ndo ",
                "\nwhere ",
                "\n:: ",
                "\n= ",
                # Split along type declarations
                "\ndata ",
                "\nnewtype ",
                "\ntype ",
                "\n:: ",
                # Split along module declarations
                "\nmodule ",
                # Split along import statements
                "\nimport ",
                "\nqualified ",
                "\nimport qualified ",
                # Split along typeclass declarations
                "\nclass ",
                "\ninstance ",
                # Split along case expressions
                "\ncase ",
                # Split along guards in function definitions
                # ("|" is escaped: unescaped it is regex alternation and the
                # pattern would match any newline or any space)
                "\n\\| ",
                # Split along record field declarations
                "\ndata ",
                "\n= {",
                "\n, ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language in Language._value2member_map_:
            # A known Language value that has no separator list above.
            raise ValueError(f"Language {language} is not implemented yet!")
        else:
            raise ValueError(
                f"Language {language} is not supported! "
                f"Please choose from {list(Language)}"
            )