File size: 2,370 Bytes
8edd424
cd75fd1
16919b8
d026131
16919b8
cd75fd1
859a3f4
cd75fd1
16919b8
cd75fd1
8edd424
cd75fd1
8edd424
cd75fd1
 
8edd424
cd75fd1
8edd424
52af776
 
 
 
 
cd75fd1
 
 
 
8edd424
 
 
cd75fd1
 
9550184
8edd424
 
 
03f014f
cd75fd1
 
 
 
 
bd4a39c
 
cd75fd1
 
 
 
 
 
 
 
 
 
 
 
 
 
e3c118a
cd75fd1
 
 
e3c118a
 
 
 
 
 
 
cd75fd1
16919b8
e3c118a
16919b8
022791c
cd75fd1
 
022791c
cd75fd1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document  # Updated import
# from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings  # Updated import
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
import os
import shutil  # Added import

# Load environment variables
load_dotenv()
# Assumes OPENAI_API_KEY is set in .env
# NOTE(review): the OpenAI embeddings import above is commented out and
# HuggingFaceEmbeddings is used instead, so OPENAI_API_KEY may no longer
# be required — confirm before depending on it.

# Directory where the Chroma vector database is persisted.
CHROMA_PATH = "chroma"
# Root directory containing the markdown source files ("" = current dir).
DATA_PATH = ""  # Update this to your actual data path

def main():
    """Build the Chroma vector store from the markdown data file.

    Ensures the embedding-model cache and Chroma directories exist, then
    runs the load -> split -> embed -> persist pipeline.
    """
    # exist_ok avoids the check-then-create race of the previous
    # os.path.exists()/os.makedirs() pair.
    os.makedirs("model_cache", exist_ok=True)
    # Use the module constant so this path stays consistent with
    # save_to_chroma(), which reads CHROMA_PATH.
    os.makedirs(CHROMA_PATH, exist_ok=True)

    generate_data_store()

def generate_data_store():
    """Run the ingestion pipeline: load documents, chunk them, persist to Chroma."""
    docs = load_documents()
    if not docs:
        # Nothing to index (missing source file); skip splitting/persisting.
        return
    save_to_chroma(split_text(docs))

def load_documents(filename: str = "pl250320251.md"):
    """Load a markdown file from DATA_PATH as LangChain documents.

    Args:
        filename: Markdown file name under DATA_PATH. Defaults to the
            previously hard-coded file, so existing callers are unchanged.

    Returns:
        A list of Document objects, or an empty list when the file is
        missing (best-effort: the error is printed, not raised).
    """
    file_path = os.path.join(DATA_PATH, filename)
    if not os.path.exists(file_path):
        print(f"Error: File {file_path} not found.")
        return []
    loader = UnstructuredMarkdownLoader(file_path)
    return loader.load()

def split_text(documents: list[Document]):
    """Split documents into overlapping chunks suitable for embedding.

    Args:
        documents: Documents produced by the markdown loader.

    Returns:
        The list of chunk Documents (chunk_size=1000, overlap=200, with
        start indices recorded in metadata).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Debug preview of one chunk. The original indexed chunks[10]
    # unconditionally, which raises IndexError when fewer than 11 chunks
    # are produced — guard it.
    if len(chunks) > 10:
        sample = chunks[10]
        print(sample.page_content)
        print(sample.metadata)

    return chunks


def save_to_chroma(chunks: list[Document]):
    """Embed the chunks and persist them into a fresh Chroma database.

    Any existing database at CHROMA_PATH is deleted first, so the store
    always reflects exactly the current set of chunks.
    """
    # Clear out the database first
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Initialize embeddings with cache
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-m3",
        cache_folder="model_cache"  # Proper location for the model cache
    )

    # Create Chroma DB
    db = Chroma.from_documents(
        chunks, 
        embeddings,  # Use the pre-built embeddings object
        persist_directory=CHROMA_PATH
    )
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

# Script entry point: only run the pipeline when executed directly.
if __name__ == "__main__":
    main()