tommymarto's picture
first attempt to hf spaces
7f7b773
raw
history blame contribute delete
574 Bytes
import spacy_transformers # needed by SpacyTextSplitter when using the en_core_web_trf pipeline
import spacy
from typing import Iterable, Iterator
from langchain.docstore.document import Document
from langchain.text_splitter import SpacyTextSplitter
class SpacySplitter:
    """Split LangChain documents into chunks using a spaCy pipeline.

    Thin wrapper around ``SpacyTextSplitter`` that also asks spaCy to use a
    GPU when one is available.

    Args:
        chunk_size: Chunk size forwarded to ``SpacyTextSplitter``.
        pipeline: Name of the spaCy pipeline forwarded to
            ``SpacyTextSplitter``.
        gpu_id: Device id passed to ``spacy.prefer_gpu``.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        pipeline: str = "en_core_web_trf",
        gpu_id: int = 1,
    ):
        self.gpu_id = gpu_id
        # spaCy documents that prefer_gpu() should be called before any
        # pipeline is loaded, so activate the GPU before constructing the
        # splitter (LangChain loads the pipeline when the splitter is built).
        spacy.prefer_gpu(gpu_id=gpu_id)
        self.splitter = SpacyTextSplitter(chunk_size=chunk_size, pipeline=pipeline)

    def split_documents(self, docs: Iterable[Document]) -> Iterable[Document]:
        """Split ``docs`` into chunks and return the resulting documents.

        The result of the wrapped splitter's ``split_documents`` is returned
        unchanged.
        """
        # Kept from the original implementation: the GPU preference is
        # re-applied on every call.
        # NOTE(review): presumably needed when this method runs on a different
        # thread than the constructor -- confirm before removing.
        spacy.prefer_gpu(gpu_id=self.gpu_id)
        return self.splitter.split_documents(docs)