Langchained_PGPS_RAG

Sleeping

App Files Files Community

Langchained_PGPS_RAG / app.py

SergeyO7

Update app.py

52af776 verified 2 months ago

raw

history blame

2.37 kB

	from langchain_community.document_loaders import UnstructuredMarkdownLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_core.documents import Document # Updated import
	# from langchain_openai import OpenAIEmbeddings
	from langchain_huggingface import HuggingFaceEmbeddings # Updated import
	from langchain_community.vectorstores import Chroma
	from dotenv import load_dotenv
	import os
	import shutil # Added import

	# Load environment variables (python-dotenv reads them from a .env file).
	load_dotenv()
	# NOTE(review): this previously said OPENAI_API_KEY must be set in .env, but
	# the OpenAIEmbeddings import is commented out and the visible code embeds
	# with HuggingFaceEmbeddings, so the key looks unused here — confirm.

	# Directory used as the Chroma persist directory.
	CHROMA_PATH = "chroma"
	# Directory containing the source markdown; it is joined with the file name
	# via os.path.join, so "" resolves relative to the current working directory.
	DATA_PATH = "" # Update this to your actual data path

	def main():
	    """Prepare the working directories and build the vector store.

	    Ensures the ``model_cache`` directory and the ``CHROMA_PATH`` directory
	    exist, then runs ``generate_data_store()``.

	    Raises:
	        FileExistsError: if one of the paths exists but is not a directory
	            (behaviour of ``os.makedirs`` with ``exist_ok=True``).
	    """
	    # exist_ok=True replaces the check-then-create pair, so a directory that
	    # appears between the check and the call no longer raises.
	    os.makedirs("model_cache", exist_ok=True)
	    # Use the shared constant rather than repeating the "chroma" literal, so
	    # this stays in sync with the path save_to_chroma() writes to.
	    os.makedirs(CHROMA_PATH, exist_ok=True)

	    generate_data_store()

	def generate_data_store():
	    """Load the source documents, chunk them, and persist the chunks.

	    Stops after loading when ``load_documents()`` returns an empty result;
	    otherwise the output of ``split_text()`` is handed to ``save_to_chroma()``.
	    """
	    loaded = load_documents()
	    if not loaded:
	        # Nothing was loaded, so there is nothing to split or store.
	        return
	    save_to_chroma(split_text(loaded))

	def load_documents(filename: str = "pl250320251.md") -> list[Document]:
	    """Load one markdown file from ``DATA_PATH``.

	    Args:
	        filename: Name of the markdown file inside ``DATA_PATH``. Defaults to
	            the file this script was originally hard-wired to.

	    Returns:
	        The documents produced by ``UnstructuredMarkdownLoader.load()``, or an
	        empty list when the path is not an existing regular file.
	    """
	    file_path = os.path.join(DATA_PATH, filename)
	    # isfile (not exists) so a directory with this name is reported here
	    # instead of being handed to the loader.
	    if not os.path.isfile(file_path):
	        print(f"Error: File {file_path} not found.")
	        return []
	    return UnstructuredMarkdownLoader(file_path).load()

	def split_text(documents: list[Document]) -> list[Document]:
	    """Split documents into overlapping chunks and print a short preview.

	    Chunks are at most 1000 units long as measured by ``len`` (characters),
	    with a 200-unit overlap; ``add_start_index=True`` is passed to the
	    splitter.

	    Args:
	        documents: Documents to split.

	    Returns:
	        The chunks produced by ``RecursiveCharacterTextSplitter``.
	    """
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=1000,
	        chunk_overlap=200,
	        length_function=len,
	        add_start_index=True,
	    )
	    chunks = text_splitter.split_documents(documents)
	    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

	    # Debug preview of one chunk. The index used to be a fixed 10, which
	    # raised IndexError for inputs yielding fewer than 11 chunks; clamp it to
	    # the last available chunk and skip the preview when there are none.
	    if chunks:
	        preview_index = min(10, len(chunks) - 1)
	        sample = chunks[preview_index]
	        print(sample.page_content)
	        print(sample.metadata)

	    return chunks


	def save_to_chroma(chunks: list[Document]) -> None:
	    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

	    Any existing directory at ``CHROMA_PATH`` is removed first, then the
	    chunks are embedded with the ``BAAI/bge-m3`` model and written to a new
	    store created with ``persist_directory=CHROMA_PATH``.

	    Args:
	        chunks: Document chunks to embed and store. When empty, nothing is
	            deleted or written.
	    """
	    # Nothing to index: return before the delete below so an existing store
	    # is not wiped by an empty rebuild.
	    if not chunks:
	        print("No chunks to save; existing store left untouched.")
	        return

	    # Clear out the database first
	    if os.path.exists(CHROMA_PATH):
	        shutil.rmtree(CHROMA_PATH)

	    # Initialize embeddings with cache
	    embeddings = HuggingFaceEmbeddings(
	        model_name="BAAI/bge-m3",
	        cache_folder="model_cache",  # the right place for the model cache
	    )

	    # Create Chroma DB; the returned handle was never used, so it is not kept.
	    Chroma.from_documents(
	        chunks,
	        embeddings,  # reuse the embeddings object created above
	        persist_directory=CHROMA_PATH,
	    )
	    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

	# Run the pipeline only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()