CVElytics / vector.py
Surajkumaar
Upload 7 files
4116826 verified
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import os
import pandas as pd
from dotenv import load_dotenv
# Load environment variables
load_dotenv()

# Load CVE dataset (kept at module level so the population step below can use it)
df = pd.read_csv("cve.csv")

# Set up the embedding model using HuggingFace with a fully qualified model name.
# A small sentence-transformers model is used, pinned to the CPU device.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2",
    model_kwargs={"device": "cpu"},
)

# Directory for the vector store - use /tmp for proper permissions in
# containerized environments
db_location = "/tmp/chrome_langchain_db"

# Initialize Chroma DB (opens the persisted collection, or creates it)
vector_store = Chroma(
    collection_name="cve_data",
    persist_directory=db_location,
    embedding_function=embeddings,
)

# Decide whether the collection still needs to be populated.
# Checking only that the directory exists is not enough: a previous run that
# created the directory but failed before inserting anything would leave the
# collection empty forever. Ask the collection itself for a single record.
add_documents = not vector_store.get(limit=1)["ids"]
def _text_or_default(value, default=""):
    """Return *value* as a string, or *default* when it is missing.

    pandas represents empty CSV cells as NaN, and ``row.get`` yields ``None``
    only when the column itself is absent; both cases count as missing here.
    """
    if value is None or pd.isna(value):
        return default
    return str(value)


# Add documents only if the vector store reported that it needs populating
if add_documents:
    documents = []
    ids = []
    for i, row in df.iterrows():
        # Replace with actual column names in your CSV
        cve_id = _text_or_default(row.get("CVE_ID"), f"CVE-{i}")
        description = _text_or_default(row.get("Description"))
        # Coerced to str so the metadata value is always a plain Python string
        # rather than NaN or a numpy scalar.
        date = _text_or_default(row.get("PublishedDate"))

        content = f"CVE ID: {cve_id}\nDescription: {description}\nPublished Date: {date}"
        doc_id = str(i)
        documents.append(
            Document(
                page_content=content,
                metadata={"published_date": date},
                id=doc_id,
            )
        )
        ids.append(doc_id)

    # Insert in bounded batches: sending a very large dataset in a single call
    # can exceed the maximum batch size the Chroma client accepts.
    batch_size = 1000
    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        vector_store.add_documents(
            documents=documents[start:stop],
            ids=ids[start:stop],
        )
# Expose a retriever over the vector store; each query is configured with k=5
retriever = vector_store.as_retriever(
    search_kwargs={"k": 5},
)