from langchain.document_loaders import UnstructuredWordDocumentLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings


def data_loader(data: str, *, glob: str = "*.docx"):
    """Load the documents found under the directory *data*.

    A ``DirectoryLoader`` is built over *data* that selects files with the
    *glob* pattern and hands each one to ``UnstructuredWordDocumentLoader``.

    Args:
        data: Path of the directory to scan.
        glob: File-name pattern passed to ``DirectoryLoader``. Defaults to
            ``"*.docx"``, the pattern this helper has always used.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns (per the LangChain
        documentation, a list of ``Document`` objects).
    """
    loader = DirectoryLoader(
        data,
        glob=glob,
        loader_cls=UnstructuredWordDocumentLoader,
    )
    return loader.load()


def chunk_text(extracted_data, *, chunk_size: int = 500, chunk_overlap: int = 50):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split, e.g. the result of
            ``data_loader``; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults
            to 50.

    Returns:
        The result of ``split_documents`` on *extracted_data*.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


def download_hugging_face_embeddings(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
):
    """Create a ``HuggingFaceEmbeddings`` instance for *model_name*.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            Defaults to ``"sentence-transformers/all-MiniLM-L6-v2"``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    return HuggingFaceEmbeddings(model_name=model_name)