"""Build and store sentence embeddings for a question/answer dataset.

Reads the records in ``data/raw.json``, encodes every ``"question"`` and
``"answer"`` field with a SentenceTransformer model, saves the two embedding
arrays as ``.npy`` files, and writes the loaded records to
``data/qa_data.json``.
"""
import json

import numpy as np
from sentence_transformers import SentenceTransformer

# Input/output locations and the embedding model identifier.
RAW_DATA_PATH = "data/raw.json"
QUESTION_EMBEDDINGS_PATH = "data/question_embeddings.npy"
ANSWER_EMBEDDINGS_PATH = "data/answer_embeddings.npy"
QA_DATA_PATH = "data/qa_data.json"
MODEL_NAME = "pkshatech/GLuCoSE-base-ja"

# Load the data.
with open(RAW_DATA_PATH, "r", encoding="utf-8") as raw_file:
    data = json.load(raw_file)

# Each record must provide both keys; a missing key raises KeyError here.
questions = [record["question"] for record in data]
answers = [record["answer"] for record in data]

# Load the embedding model.
model = SentenceTransformer(MODEL_NAME)

# Create embeddings for the questions and the answers, requesting NumPy output.
question_embeddings = model.encode(questions, convert_to_numpy=True)
answer_embeddings = model.encode(answers, convert_to_numpy=True)

# Save the embeddings as NumPy arrays.
np.save(QUESTION_EMBEDDINGS_PATH, question_embeddings)
np.save(ANSWER_EMBEDDINGS_PATH, answer_embeddings)

# Save the original data; ensure_ascii=False keeps non-ASCII text unescaped.
with open(QA_DATA_PATH, "w", encoding="utf-8") as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=2)