Spaces:

Surajkumaar
/

CVElytics

Running

CVElytics / vector.py

Surajkumaar

Upload 7 files

4116826 verified 18 days ago

1.79 kB

	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_chroma import Chroma
	from langchain_core.documents import Document
	import os
	import pandas as pd
	from dotenv import load_dotenv

	# Load environment variables (python-dotenv)
	load_dotenv()

	# Read the CVE dataset from the CSV into a DataFrame
	df = pd.read_csv(filepath_or_buffer="cve.csv")

	# Embedding model, referenced by its fully qualified HuggingFace name.
	# A smaller model pinned to the CPU, chosen for compatibility with Hugging Face Spaces.
	embeddings = HuggingFaceEmbeddings(
	    model_kwargs=dict(device="cpu"),
	    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2",
	)

	# The vector store is kept under /tmp, which has proper permissions in
	# containerized environments
	db_location = "/tmp/chrome_langchain_db"

	# Record whether the DB directory is missing BEFORE the store is built below.
	# NOTE(review): constructing a persistent store presumably creates the
	# directory, so this check must stay ahead of the Chroma(...) call — confirm.
	add_documents = not os.path.exists(db_location)

	# Open (or create) the persistent Chroma collection holding the CVE embeddings
	vector_store = Chroma(
	    embedding_function=embeddings,
	    persist_directory=db_location,
	    collection_name="cve_data",
	)

	# Populate the store when the DB directory was absent at startup, or when the
	# collection is still empty (e.g. an earlier run stopped before writing
	# anything and left only the directory behind).
	if add_documents or not vector_store.get(limit=1)["ids"]:
	    # Documents are written in fixed-size batches: Chroma rejects a single
	    # write above its maximum batch size, which a full CVE dump can exceed.
	    batch_size = 1000

	    documents = []
	    ids = []

	    for i, row in df.iterrows():
	        # Replace with actual column names in your CSV.
	        # A missing column yields the default below, but an empty cell yields
	        # NaN; both are normalised so the embedded text never contains "nan"
	        # and the metadata never carries a NaN float.
	        cve_id = row.get("CVE_ID")
	        if pd.isna(cve_id):
	            cve_id = f"CVE-{i}"
	        description = row.get("Description", "")
	        if pd.isna(description):
	            description = ""
	        date = row.get("PublishedDate", "")
	        if pd.isna(date):
	            date = ""

	        content = f"CVE ID: {cve_id}\nDescription: {description}\nPublished Date: {date}"

	        # The DataFrame index doubles as the document id, so re-running the
	        # load addresses the same records rather than creating new ones.
	        document = Document(
	            page_content=content,
	            metadata={"published_date": date},
	            id=str(i),
	        )

	        documents.append(document)
	        ids.append(str(i))

	    # An empty dataset produces no batches, so no empty write is attempted.
	    for start in range(0, len(documents), batch_size):
	        stop = start + batch_size
	        vector_store.add_documents(documents=documents[start:stop], ids=ids[start:stop])

	# Expose the vector store as a retriever, with the search configured for k=5
	retriever = vector_store.as_retriever(search_kwargs=dict(k=5))