# app.py -- uploaded by SergeyO7, commit 52af776 ("Update app.py"), 2.37 kB
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document # Updated import
# from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings # Updated import
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
import os
import shutil # Added import
# Load environment variables (e.g. from a local .env file) before anything reads them.
load_dotenv()
# NOTE(review): OPENAI_API_KEY is only relevant if the commented-out
# OpenAIEmbeddings import is re-enabled; HuggingFaceEmbeddings is imported instead.
# Directory in which the Chroma vector store is persisted.
CHROMA_PATH = "chroma"
# Directory containing the input Markdown file; the empty string makes the
# file path relative to the current working directory.
DATA_PATH = "" # Update this to your actual data path
def main():
    """Prepare the working directories and build the vector store.

    Ensures that the embedding-model cache directory (``model_cache``) and
    the Chroma persistence directory (``CHROMA_PATH``) exist, then runs
    ``generate_data_store()``.
    """
    # exist_ok=True replaces the separate exists()/makedirs() pair, so a
    # directory that appears between the check and the call no longer raises.
    os.makedirs("model_cache", exist_ok=True)
    # Use the shared constant instead of repeating the "chroma" literal, so
    # the directory created here is always the one the store is written to.
    os.makedirs(CHROMA_PATH, exist_ok=True)
    generate_data_store()
def generate_data_store():
    """Run the ingestion pipeline: load documents, split them, store them.

    Stops after loading when ``load_documents()`` returns nothing.
    """
    loaded = load_documents()
    if not loaded:
        # Nothing was loaded, so there is nothing to split or persist.
        return
    save_to_chroma(split_text(loaded))
def load_documents():
    """Load the Markdown source file as a list of documents.

    Returns:
        The documents produced by ``UnstructuredMarkdownLoader`` for
        ``pl250320251.md`` under ``DATA_PATH``, or an empty list (after
        printing an error message) when that file does not exist.
    """
    file_path = os.path.join(DATA_PATH, "pl250320251.md")
    if os.path.exists(file_path):
        return UnstructuredMarkdownLoader(file_path).load()

    # Missing input is reported and signalled to the caller as "no documents".
    print(f"Error: File {file_path} not found.")
    return []
def split_text(documents: list[Document]):
    """Split documents into overlapping chunks and print one as a sample.

    Args:
        documents: Documents to split.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``
        configured with ``chunk_size=1000``, ``chunk_overlap=200`` (both
        measured with ``len``) and ``add_start_index=True``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one chunk as a sanity check. Indexing chunks[10] unconditionally
    # raised IndexError for inputs yielding fewer than 11 chunks, so the
    # sample index is clamped to the last available chunk and the preview is
    # skipped entirely when nothing was produced.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)
    return chunks
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma database at ``CHROMA_PATH`` from *chunks*.

    Any existing database directory is removed first, so every run starts
    from a fresh store.

    Args:
        chunks: Document chunks to embed and store.
    """
    # Start from a clean slate: drop whatever an earlier run left behind.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Embedding model, with "model_cache" as its cache folder.
    embedding_model = HuggingFaceEmbeddings(
        model_name="BAAI/bge-m3",
        cache_folder="model_cache",
    )

    # Build the store from the chunks, reusing the embeddings object created
    # above and persisting under CHROMA_PATH.
    _store = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        persist_directory=CHROMA_PATH,
    )
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
# Script entry point: build the data store only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()