Spaces:

anpigon
/

langchain-qa-bot

Runtime error

add langchain docs

ed4d993 11 months ago

1.76 kB

	from __future__ import annotations

	from typing import Any, List

	from langchain_text_splitters.base import TextSplitter


	class SpacyTextSplitter(TextSplitter):
	    """Text splitter that uses a spaCy pipeline to find sentence boundaries.

	    By default the ``en_core_web_sm`` pipeline is loaded and its
	    ``max_length`` (the maximum number of characters the pipeline accepts
	    in one call) is set to 1,000,000; raise ``max_length`` for larger
	    inputs. Passing ``pipeline='sentencizer'`` selects a faster, but
	    potentially less accurate, sentence splitter.
	    """

	    def __init__(
	        self,
	        separator: str = "\n\n",
	        pipeline: str = "en_core_web_sm",
	        max_length: int = 1_000_000,
	        **kwargs: Any,
	    ) -> None:
	        """Set up the splitter.

	        Args:
	            separator: String handed to ``_merge_splits`` when sentences
	                are combined into chunks.
	            pipeline: spaCy pipeline name, or ``"sentencizer"``.
	            max_length: Value assigned to the pipeline's ``max_length``.
	            **kwargs: Forwarded unchanged to the base-class initializer.
	        """
	        super().__init__(**kwargs)
	        self._separator = separator
	        nlp = _make_spacy_pipeline_for_splitting(pipeline, max_length=max_length)
	        self._tokenizer = nlp

	    def split_text(self, text: str) -> List[str]:
	        """Break ``text`` into sentences and merge them into chunks."""
	        doc = self._tokenizer(text)
	        # Lazily yield the text of each sentence span found by the pipeline.
	        sentence_texts = (sentence.text for sentence in doc.sents)
	        return self._merge_splits(sentence_texts, self._separator)


	def _make_spacy_pipeline_for_splitting(
	    pipeline: str, *, max_length: int = 1_000_000
	) -> Any:  # typed as Any to avoid importing spacy at module level
	    """Build the spaCy pipeline object used for sentence splitting.

	    Args:
	        pipeline: ``"sentencizer"`` to build a blank English pipeline with
	            only the ``sentencizer`` component added; any other value is
	            passed to ``spacy.load`` as the pipeline name.
	        max_length: Value assigned to the returned pipeline's
	            ``max_length`` attribute.

	    Returns:
	        The configured spaCy pipeline object.

	    Raises:
	        ImportError: If spaCy cannot be imported. The original import
	            failure is attached as ``__cause__``.
	    """
	    try:
	        import spacy
	    except ImportError as err:
	        # Chain explicitly so the traceback reports the real import failure
	        # as the direct cause instead of an unrelated "during handling" error.
	        raise ImportError(
	            "Spacy is not installed, please install it with `pip install spacy`."
	        ) from err
	    if pipeline == "sentencizer":
	        from spacy.lang.en import English

	        sentencizer: Any = English()
	        sentencizer.add_pipe("sentencizer")
	    else:
	        # The "ner" and "tagger" components are excluded when loading.
	        sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
	    sentencizer.max_length = max_length
	    return sentencizer