Spaces:
Runtime error
Runtime error
from __future__ import annotations | |
from typing import Any, List | |
from langchain_text_splitters.base import TextSplitter | |
class SpacyTextSplitter(TextSplitter):
    """Text splitter that finds sentence boundaries with spaCy.

    The `en_core_web_sm` model is the default pipeline, with a default
    `max_length` of 1000000 characters (the longest text the model accepts;
    raise it for large files). Passing `pipeline='sentencizer'` gives a
    faster, but potentially less accurate, split.
    """

    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        **kwargs: Any,
    ) -> None:
        """Set up the splitter and build its spaCy pipeline.

        Args:
            separator: String handed to `_merge_splits` together with the
                sentences when chunks are assembled.
            pipeline: Pipeline identifier forwarded to
                `_make_spacy_pipeline_for_splitting`.
            max_length: Maximum text length forwarded to
                `_make_spacy_pipeline_for_splitting`.
            **kwargs: Passed through unchanged to the base class initializer.
        """
        super().__init__(**kwargs)
        self._separator = separator
        nlp = _make_spacy_pipeline_for_splitting(pipeline, max_length=max_length)
        self._tokenizer = nlp

    def split_text(self, text: str) -> List[str]:
        """Segment `text` into sentences and merge them into chunks.

        Args:
            text: The text to split.

        Returns:
            The chunks produced by `_merge_splits` from the sentence texts.
        """
        doc = self._tokenizer(text)
        # Kept lazy on purpose: sentences are consumed one by one by the merger.
        sentence_texts = (sentence.text for sentence in doc.sents)
        return self._merge_splits(sentence_texts, self._separator)
def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Any:  # avoid importing spacy
    """Build the spaCy pipeline object used for sentence splitting.

    spaCy is imported lazily inside this function so that the module can be
    imported without spaCy installed.

    Args:
        pipeline: ``"sentencizer"`` builds a blank ``English`` pipeline with
            only the ``sentencizer`` component added. Any other value is
            passed to ``spacy.load`` as the pipeline to load, with the
            ``ner`` and ``tagger`` components excluded.
        max_length: Value assigned to the pipeline's ``max_length``
            attribute, whichever pipeline was built.

    Returns:
        The configured spaCy pipeline (typed ``Any`` so that spaCy types are
        not needed at module import time).

    Raises:
        ImportError: If spaCy is not installed. The original import failure
            is attached as ``__cause__``.
    """
    try:
        import spacy
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError(
            "Spacy is not installed, please install it with `pip install spacy`."
        ) from err

    if pipeline == "sentencizer":
        from spacy.lang.en import English

        sentencizer: Any = English()
        sentencizer.add_pipe("sentencizer")
    else:
        sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])

    # Apply the limit to both pipelines so a caller-supplied max_length is
    # never silently ignored for the "sentencizer" option.
    sentencizer.max_length = max_length
    return sentencizer