File size: 574 Bytes
7f7b773
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import spacy_transformers # needed by SpacyTextSplitter when using the en_core_web_trf pipeline
import spacy
from typing import Iterable, Iterator
from langchain.docstore.document import Document
from langchain.text_splitter import SpacyTextSplitter


class SpacySplitter:
    """Chunk documents with LangChain's ``SpacyTextSplitter``.

    The GPU is selected once, at construction time and *before* the splitter
    is built, because spaCy expects ``prefer_gpu`` to be called ahead of
    loading a pipeline for the model to be placed on that device.

    Args:
        chunk_size: Value forwarded to ``SpacyTextSplitter(chunk_size=...)``.
        pipeline: Name of the spaCy pipeline forwarded to
            ``SpacyTextSplitter(pipeline=...)``.
        gpu_id: Device index passed to ``spacy.prefer_gpu``. The default of
            ``1`` keeps the previously hard-coded value.

    Attributes:
        using_gpu: Return value of ``spacy.prefer_gpu`` for ``gpu_id``.
        splitter: The underlying ``SpacyTextSplitter`` instance.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        pipeline: str = "en_core_web_trf",
        gpu_id: int = 1,
    ):
        # Must run before the splitter is created: building the splitter is
        # what loads the spaCy pipeline, and a GPU preference set afterwards
        # does not apply to an already-loaded model.
        self.using_gpu = spacy.prefer_gpu(gpu_id=gpu_id)
        self.splitter = SpacyTextSplitter(chunk_size=chunk_size, pipeline=pipeline)

    def split_documents(self, docs: Iterable[Document]) -> Iterable[Document]:
        """Split ``docs`` into chunks using the configured splitter.

        Args:
            docs: Documents to split.

        Returns:
            Whatever ``SpacyTextSplitter.split_documents`` returns for
            ``docs``, passed through unchanged.
        """
        return self.splitter.split_documents(docs)