Spaces:
Sleeping
Sleeping
from langchain_community.document_loaders import UnstructuredMarkdownLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_core.documents import Document # Updated import | |
# from langchain_openai import OpenAIEmbeddings | |
from langchain_huggingface import HuggingFaceEmbeddings # Updated import | |
from langchain_community.vectorstores import Chroma | |
from dotenv import load_dotenv | |
import os | |
import shutil # Added import | |
# Load environment variables
load_dotenv()
# NOTE(review): nothing visible in this file reads OPENAI_API_KEY; the OpenAI
# embeddings import is commented out and HuggingFaceEmbeddings is used
# instead. Confirm whether the .env file is still required.
# Directory used as the Chroma persist directory; it is deleted and rebuilt
# each time the chunks are saved.
CHROMA_PATH = "chroma"
# Directory joined with the markdown file name when loading documents. The
# empty string makes the file path relative to the current working directory.
DATA_PATH = ""  # Update this to your actual data path
def main():
    """Prepare the working directories and build the vector store.

    Ensures the embedding-model cache directory and the Chroma directory
    exist, then runs the load/split/save pipeline.
    """
    # exist_ok=True replaces the check-then-create pattern, which can fail
    # if the directory appears between the check and the creation.
    os.makedirs("model_cache", exist_ok=True)
    # Use the shared constant instead of repeating the "chroma" literal so
    # this directory stays in sync with the one the store is written to.
    os.makedirs(CHROMA_PATH, exist_ok=True)
    generate_data_store()
def generate_data_store():
    """Run the pipeline: load the documents, split them, store the chunks.

    Nothing is split or stored when loading yields no documents.
    """
    loaded = load_documents()
    if not loaded:
        return
    save_to_chroma(split_text(loaded))
def load_documents():
    """Load the source markdown file as a list of documents.

    Returns:
        The documents produced by the markdown loader, or an empty list
        (after printing an error message) when the file does not exist.
    """
    file_path = os.path.join(DATA_PATH, "pl250320251.md")
    if os.path.exists(file_path):
        return UnstructuredMarkdownLoader(file_path).load()
    # Missing input is reported and signalled with an empty result rather
    # than an exception.
    print(f"Error: File {file_path} not found.")
    return []
def split_text(documents: list[Document]):
    """Split documents into overlapping chunks and print one sample chunk.

    Args:
        documents: The documents to split.

    Returns:
        The list of chunks produced by the splitter.
    """
    # Sizes are measured with len(), i.e. in characters of the chunk text.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one chunk as a sanity check. Short inputs may yield fewer than
    # 11 chunks, so fall back to the last available chunk instead of raising
    # IndexError, and skip the preview entirely when there are no chunks.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)
    return chunks
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at CHROMA_PATH from the given chunks.

    Any existing store directory is deleted before the new store is
    created. When ``chunks`` is empty nothing is deleted or written.

    Args:
        chunks: The document chunks to embed and store.
    """
    if not chunks:
        # Deleting the existing store with nothing to replace it would only
        # lose data, so leave it as it is.
        print(f"No chunks to save; leaving {CHROMA_PATH} untouched.")
        return

    # Clear out the database first
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Initialize embeddings with cache
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-m3",
        cache_folder="model_cache",  # the right place for the cache
    )

    # Create Chroma DB; the returned store object is not needed here.
    Chroma.from_documents(
        chunks,
        embeddings,  # use the embeddings object created above
        persist_directory=CHROMA_PATH,
    )
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()