Spaces:
Running
Running
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_chroma import Chroma | |
from langchain_core.documents import Document | |
import os | |
import pandas as pd | |
from dotenv import load_dotenv | |
# Pull any settings defined in a local .env file into the process environment.
load_dotenv()

# Read the CVE dataset that will be indexed into the vector store.
df = pd.read_csv(filepath_or_buffer="cve.csv")
# Embedding model used to vectorize the CVE text. The model is referenced by
# its fully qualified name; a small model was chosen for compatibility with
# Hugging Face Spaces, and inference is pinned to the CPU for the same reason.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2",
    model_kwargs=dict(device="cpu"),
)
# Directory for the vector store. /tmp is used so the process has write
# permission in containerized environments.
db_location = "/tmp/chrome_langchain_db"

# Open (or create) the persistent Chroma collection that holds the CVE data.
vector_store = Chroma(
    collection_name="cve_data",
    persist_directory=db_location,
    embedding_function=embeddings,
)

# Decide whether the collection still has to be populated. Checking for the
# directory alone is not reliable: the directory is created as soon as the
# store is opened, so a run that failed before indexing finished would leave
# an empty collection that is never filled again. Asking the collection for a
# single record tells us whether anything was actually stored.
# NOTE: this detects an empty collection, not a partially populated one.
add_documents = not vector_store.get(limit=1)["ids"]
def _cell(row, column, default=""):
    """Return ``row[column]``, or *default* when the column or value is missing.

    pandas represents blank CSV cells as NaN; without this guard those would be
    embedded as the literal text "nan" and stored as NaN metadata values.
    """
    value = row.get(column)
    return default if pd.isna(value) else value


# Populate the collection only when it has not been indexed yet.
if add_documents:
    # Chroma rejects a single insert that exceeds its maximum batch size, so
    # the dataset is written in fixed-size chunks rather than in one call.
    batch_size = 1000

    documents = []
    for i, row in df.iterrows():
        # Replace with actual column names in your CSV
        cve_id = _cell(row, "CVE_ID", default=f"CVE-{i}")
        description = _cell(row, "Description")
        date = _cell(row, "PublishedDate")

        documents.append(
            Document(
                page_content=(
                    f"CVE ID: {cve_id}\n"
                    f"Description: {description}\n"
                    f"Published Date: {date}"
                ),
                metadata={"published_date": date},
                id=str(i),
            )
        )

    # Each document's id is the row's index label, as a string.
    ids = [document.id for document in documents]

    # An empty dataset produces no batches, so no empty insert is attempted.
    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        vector_store.add_documents(
            documents=documents[start:stop],
            ids=ids[start:stop],
        )
# Wrap the vector store in a retriever configured to fetch k=5 results per query.
retriever = vector_store.as_retriever(search_kwargs=dict(k=5))