# Spaces:
# Runtime error
# Runtime error
import os | |
import json | |
import faiss | |
import numpy as np | |
from sentence_transformers import SentenceTransformer | |
from tqdm import tqdm | |
# 1. Data paths.
# JSONL files whose records are read for their "question" field.
source_paths = [
    "data/real_estate_agent/raw/past_papers/brokerage_law.jsonl",
    "data/real_estate_agent/raw/past_papers/civil_law.jsonl",
    "data/real_estate_agent/raw/past_papers/disclosure_taxation.jsonl",
    "data/real_estate_agent/raw/past_papers/introduction.jsonl",
    "data/real_estate_agent/raw/past_papers/public_law.jsonl",
]

# Output locations: the serialized FAISS index and the array of question
# texts that is saved next to it.
INDEX_PATH = "data/index/index.faiss"
DOCS_PATH = "data/index/docs.npy"
# 2. Load the embedding model.
# This runs at import time, so importing this module loads the model.
# NOTE(review): the garbled comments suggest the questions are Korean, while
# this checkpoint is presumably English-centric -- confirm it is suitable.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
def _load_questions(paths):
    """Return every non-empty "question" value from the JSONL files at *paths*."""
    questions = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines (e.g. a trailing newline at end of file) are
                # not JSON records; json.loads would raise on them.
                if not line.strip():
                    continue
                question_text = json.loads(line).get("question", "")
                if question_text:  # keep only records with a non-empty question
                    questions.append(question_text)
    return questions


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it does not exist."""
    parent = os.path.dirname(path)
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    if parent:
        os.makedirs(parent, exist_ok=True)


def init_faiss():
    """Build the FAISS question index from the JSONL sources and save it.

    Reads the "question" field of every record in ``source_paths``, embeds
    the questions with ``embedding_model``, adds the vectors to a flat L2
    index, and writes the index to ``INDEX_PATH`` and the question texts to
    ``DOCS_PATH``. Vectors are added in the same order as the saved
    questions.

    Raises:
        ValueError: if the sources contain no non-empty question
            (``json.JSONDecodeError``, raised for a malformed line, is a
            subclass of ``ValueError`` as well).
        OSError: if a source file cannot be opened or an output cannot be
            written.
    """
    # 3. Read the JSONL files.
    questions = _load_questions(source_paths)
    # NOTE(review): the message text below looks mis-encoded (probably
    # Korean originally); it is kept exactly as found -- confirm against the
    # original file.
    print(f"β μ΄ {len(questions)}κ° μ§λ¬Έ λ‘λ© μλ£")

    if not questions:
        # Without this guard the failure surfaces later as an opaque
        # IndexError when the embedding dimension is read.
        raise ValueError("no questions found in source_paths; nothing to index")

    # 4. Create the embeddings.
    embeddings = embedding_model.encode(
        questions,
        batch_size=32,
        show_progress_bar=True,
    )
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # 5. Build the FAISS index (L2-distance based).
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # 6. Save the index and the question texts.
    _ensure_parent_dir(INDEX_PATH)
    _ensure_parent_dir(DOCS_PATH)
    faiss.write_index(index, INDEX_PATH)
    np.save(DOCS_PATH, questions)
    print("β FAISS μΈλ±μ€μ λ¬Έμ μ μ₯ μλ£!")
# Build the index only when the file is executed as a script, not on import.
if __name__ == "__main__":
    init_faiss()