Spaces:
Sleeping
Sleeping
import json | |
import os | |
import numpy as np | |
import faiss | |
from dotenv import load_dotenv | |
import openai | |
from tqdm import tqdm | |
# === Step 1: Load the API key from .env ===
# load_dotenv() is called first so that the lookup below can see values
# defined in the .env file.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    # Fail early rather than at the first API call.
    raise ValueError("OPENAI_API_KEY saknas i .env-filen!")
openai.api_key = api_key
# === Step 2: Load the knowledge base from JSONL ===
# Each non-blank line is parsed as one JSON value. A record is kept only if it
# is an object carrying a non-empty string under "text"; its optional
# "metadata" object is copied and extended with the cleaned text.
with open("knowledge_base.jsonl", "r", encoding="utf-8") as f:
    # Skip blank lines (such as a trailing newline at end of file), which
    # json.loads would reject with a JSONDecodeError.
    data = [json.loads(line) for line in f if line.strip()]

texts = []
metadata_list = []
for item in data:
    raw_text = item.get("text") if isinstance(item, dict) else None
    if not isinstance(raw_text, str):
        # Missing, null or non-string text cannot be stripped or embedded.
        continue
    text_clean = raw_text.strip()
    if not text_clean:
        continue
    texts.append(text_clean)
    # Include the text in the metadata. A null "metadata" value is treated
    # the same as a missing one instead of failing on dict unpacking.
    metadata_item = {**(item.get("metadata") or {}), "text": text_clean}
    metadata_list.append(metadata_item)

print(f"🔢 Totalt antal texter: {len(texts)}")
# === Step 3: Create embeddings using the current OpenAI client syntax ===
def get_embeddings(texts, model="text-embedding-ada-002", batch_size=100):
    """Embed ``texts`` with the OpenAI embeddings API, one batch at a time.

    Args:
        texts: Sequence of strings to embed.
        model: Name of the OpenAI embedding model to use.
        batch_size: Number of texts sent per API request; must be positive.

    Returns:
        A float32 NumPy array built from the per-text embedding vectors, in
        the same order as ``texts``. For empty input the array is empty.

    Raises:
        ValueError: If ``batch_size`` is not greater than zero.
        RuntimeError: If an API request fails. Skipping a failed batch would
            return fewer rows than ``texts`` and shift every later row, so a
            caller pairing rows with per-text metadata by position would get
            wrong matches; failing fast avoids that.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently return no
        # embeddings at all.
        raise ValueError(f"batch_size måste vara större än 0 (fick {batch_size})")

    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="🔄 Skapar embeddings via OpenAI..."):
        batch = texts[i:i + batch_size]
        try:
            response = openai.embeddings.create(
                model=model,
                input=batch,
            )
        except Exception as e:
            # Abort instead of continuing: partial output cannot be aligned
            # with the input texts afterwards.
            raise RuntimeError(
                f"❌ Fel vid API-anrop för batch {i} - {i+batch_size}: {e}"
            ) from e
        # Use the .data attribute instead of subscripting the response, and
        # order the items by their reported index so rows follow input order.
        ordered_items = sorted(response.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered_items)
    return np.array(all_embeddings, dtype=np.float32)
embeddings = get_embeddings(texts)
if len(embeddings) == 0:
    raise RuntimeError("Inga embeddings kunde skapas. Kontrollera API-nyckel och nätverksanslutning.")
# Index rows are matched to metadata_list entries by position, so a partial
# result (fewer rows than texts) must not be indexed.
if len(embeddings) != len(texts):
    raise RuntimeError(
        f"Antal embeddings ({len(embeddings)}) matchar inte antal texter ({len(texts)}); "
        "FAISS-index och metadata skulle inte stämma överens."
    )
# === Step 4: Build the FAISS index ===
# The index dimension is taken from the second axis of the embedding matrix.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# === Step 5: Save the index and metadata ===
# The index goes to "faiss.index"; the metadata list is written as
# pretty-printed UTF-8 JSON with non-ASCII characters kept as-is.
faiss.write_index(index, "faiss.index")
with open("faiss_metadata.json", "w", encoding="utf-8") as meta_file:
    json.dump(metadata_list, meta_file, ensure_ascii=False, indent=2)

print(
    "✅ FAISS-index skapat och sparat som 'faiss.index'",
    "📄 Metadata sparad i 'faiss_metadata.json'",
    sep="\n",
)