# -*- coding: utf-8 -*- """module1.py Automatically generated by Colab. Original file is located at https://colab.research.google.com/drive/1AYXXKXRzUU4DWKWbJqvyjSwQ0dVQMS7Y """ import pandas as pd import numpy as np from sklearn.metrics.pairwise import cosine_similarity from sentence_transformers import SentenceTransformer class MisconceptionModel: def __init__(self, model_name, misconception_mapping_path, misconception_embs_paths): # 모델 초기화 self.model = SentenceTransformer(model_name) self.misconception_mapping = pd.read_parquet(misconception_mapping_path) self.misconception_names = self.misconception_mapping.set_index("MisconceptionId")["MisconceptionName"] self.misconception_embs = [ np.load(path) for path in misconception_embs_paths ] def preprocess(self, df): """데이터 프리프로세싱""" df_new = df.copy() for col in df.columns[df.dtypes == "object"]: df_new[col] = df_new[col].str.strip() for option in ["A", "B", "C", "D"]: df_new[f"Answer{option}Text"] = df_new[f"Answer{option}Text"].str.replace("Only\n", "Only ") return df_new def wide_to_long(self, df): """데이터를 wide 형식에서 long 형식으로 변환""" rows = [] for _, row in df.iterrows(): correct_option = row["CorrectAnswer"] correct_text = row[f"Answer{correct_option}Text"] for option in ["A", "B", "C", "D"]: if option == correct_option: continue misconception_id = row.get(f"Misconception{option}Id", np.nan) row_new = row[:"QuestionText"] row_new["CorrectAnswerText"] = correct_text row_new["Answer"] = option row_new["AnswerText"] = row[f"Answer{option}Text"] if not pd.isna(misconception_id): row_new["MisconceptionId"] = int(misconception_id) rows.append(row_new) df_long = pd.DataFrame(rows).reset_index(drop=True) df_long.insert(0, "QuestionId_Answer", df_long["QuestionId"].astype(str) + "_" + df_long["Answer"]) return df_long def predict(self, test_df): """테스트 데이터에 대한 예측 수행""" test_df_long = self.wide_to_long(test_df) prompt = ( "Subject: {SubjectName}\n" "Construct: {ConstructName}\n" "Question: {QuestionText}\n" "Incorrect Answer: {AnswerText}" ) test_df_long["anchor"] = [ prompt.format( SubjectName=row["SubjectName"], ConstructName=row["ConstructName"], QuestionText=row["QuestionText"], AnswerText=row["AnswerText"] ) for _, row in test_df_long.iterrows() ] # 테스트 데이터 임베딩 embs_test_query = self.model.encode(test_df_long["anchor"], normalize_embeddings=True) # 유사도 계산 및 순위 산출 rank_test = np.array([ np.argsort(np.argsort(-cosine_similarity(embs_test_query, embs_misconception)), axis=1, kind="stable") for embs_misconception in self.misconception_embs ]) rank_ave_test = np.mean(rank_test ** (1 / 4), axis=0) argsort_test = np.argsort(rank_ave_test, axis=1, kind="stable") test_df_long["PredictedMisconceptions"] = [argsort_test[i, :25].tolist() for i in range(len(argsort_test))] return test_df_long