File size: 740 Bytes
76a27ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
from langchain.document_loaders import UnstructuredWordDocumentLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.embeddings import HuggingFaceEmbeddings

def data_loader(data):
    loader = DirectoryLoader(
        data,
        glob=("*.docx"),
        loader_cls=UnstructuredWordDocumentLoader,
    )
    return loader.load()

def chunk_text(extracted_data):
    text_spliter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunk = text_spliter.split_documents(extracted_data)
    return text_chunk

def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings