File size: 1,792 Bytes
4116826
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import os
import pandas as pd
from dotenv import load_dotenv

# Load environment variables (e.g. API keys) from a local .env file.
load_dotenv()

# Load the CVE dataset. Replace NaN with empty strings up front: row.get()
# below only guards against a *missing column*, not missing cell values, so
# without this, NaN cells would be embedded as the literal text "nan".
df = pd.read_csv("cve.csv").fillna("")

# Embedding model: a small sentence-transformers model pinned to CPU so it
# runs in constrained environments such as Hugging Face Spaces.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2",  # Smaller, more compatible model
    model_kwargs={'device': 'cpu'}  # Ensure it runs on CPU for compatibility
)

# Persist the vector store under /tmp for proper write permissions in
# containerized environments. Ingest only on the first run (directory absent)
# to avoid re-embedding and duplicating entries on every start.
db_location = "/tmp/chrome_langchain_db"
add_documents = not os.path.exists(db_location)

# Initialize (or reopen) the persistent Chroma collection.
vector_store = Chroma(
    collection_name="cve_data",
    persist_directory=db_location,
    embedding_function=embeddings
)

# Add documents only if the DB doesn't exist yet.
if add_documents:
    documents = []
    ids = []

    for i, row in df.iterrows():
        # Column names must match the CSV header; .get() supplies a fallback
        # so an absent column doesn't raise.
        cve_id = row.get("CVE_ID", f"CVE-{i}")
        description = row.get("Description", "")
        date = row.get("PublishedDate", "")

        content = f"CVE ID: {cve_id}\nDescription: {description}\nPublished Date: {date}"

        document = Document(
            page_content=content,
            # Keep the CVE id in metadata too, so results can be filtered
            # and identified without parsing page_content.
            metadata={"published_date": date, "cve_id": str(cve_id)},
            id=str(i)
        )

        documents.append(document)
        ids.append(str(i))

    vector_store.add_documents(documents=documents, ids=ids)

# Retriever returning the top-5 most similar CVE entries for a query.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})