# Build a Chroma vector store from a markdown file using HuggingFace embeddings.
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document # Updated import
# from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings # Updated import
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
import os
import shutil # Added import
# Load environment variables from a local .env file (e.g. HF tokens, proxies).
# NOTE: OpenAI is not used — embeddings come from a local HuggingFace model,
# so no OPENAI_API_KEY is required.
load_dotenv()

CHROMA_PATH = "chroma"  # on-disk Chroma persistence directory
DATA_PATH = ""  # Update this to your actual data path
def main():
    """Entry point: ensure cache/output directories exist, then build the store.

    Uses ``exist_ok=True`` instead of an exists-check followed by makedirs,
    which is race-prone and more verbose (LBYL vs EAFP).
    """
    os.makedirs("model_cache", exist_ok=True)  # HuggingFace model download cache
    os.makedirs(CHROMA_PATH, exist_ok=True)    # was hard-coded "chroma"; use the constant
    generate_data_store()
def generate_data_store():
    """Pipeline: load the source markdown, chunk it, and index it in Chroma."""
    docs = load_documents()
    if not docs:
        # Nothing loaded (file missing) — skip splitting and indexing.
        return
    save_to_chroma(split_text(docs))
def load_documents(filename: str = "pl250320251.md") -> list:
    """Load a markdown file from DATA_PATH as LangChain documents.

    Generalized: the source filename is now a parameter (defaulting to the
    original hard-coded value, so existing callers are unaffected).

    Args:
        filename: Markdown file name, resolved relative to DATA_PATH.

    Returns:
        The loaded documents, or an empty list if the file does not exist.
    """
    file_path = os.path.join(DATA_PATH, filename)
    if not os.path.exists(file_path):
        print(f"Error: File {file_path} not found.")
        return []
    loader = UnstructuredMarkdownLoader(file_path)
    return loader.load()
def split_text(documents: list[Document]) -> list[Document]:
    """Split documents into overlapping character chunks for embedding.

    Args:
        documents: Loaded LangChain documents.

    Returns:
        The list of chunk Documents (start offsets recorded in metadata).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,       # max characters per chunk
        chunk_overlap=200,     # overlap preserves context across boundaries
        length_function=len,
        add_start_index=True,  # store each chunk's start offset in metadata
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Debug preview of one chunk. The original indexed chunks[10]
    # unconditionally, raising IndexError for inputs with < 11 chunks.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)
    return chunks
def save_to_chroma(chunks: list[Document]):
    """Embed the chunks and persist them into a fresh Chroma database.

    Any existing database at CHROMA_PATH is deleted first so each run
    produces a clean index.
    """
    # Start from a clean slate: drop any previous on-disk index.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Embedding model, cached locally so repeated runs skip the download.
    embedding_fn = HuggingFaceEmbeddings(
        model_name="BAAI/bge-m3",
        cache_folder="model_cache",
    )

    # Build and persist the vector store from the pre-built embedding object.
    Chroma.from_documents(
        chunks,
        embedding_fn,
        persist_directory=CHROMA_PATH,
    )
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
# Script entry point. (A stray " |" artifact after main() — residue from the
# page the file was scraped from — was a syntax error and has been removed.)
if __name__ == "__main__":
    main()