File size: 3,584 Bytes
a3055d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# -*- coding: utf-8 -*-
"""module1.py

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1AYXXKXRzUU4DWKWbJqvyjSwQ0dVQMS7Y
"""

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

class MisconceptionModel:
    """Rank candidate misconceptions for incorrect multiple-choice answers.

    A sentence-embedding model encodes one prompt per incorrect answer. The
    prompt embeddings are compared against one or more precomputed
    misconception embedding matrices, and the per-matrix rankings are blended
    into a single ordering.
    """

    # Answer option letters; input frames carry one ``Answer{X}Text`` column
    # (and optionally one ``Misconception{X}Id`` column) per letter.
    _OPTIONS = ("A", "B", "C", "D")

    def __init__(self, model_name, misconception_mapping_path, misconception_embs_paths):
        """Load the encoder, the misconception table and the embeddings.

        Args:
            model_name: Name or path handed to ``SentenceTransformer``.
            misconception_mapping_path: Parquet file providing at least the
                ``MisconceptionId`` and ``MisconceptionName`` columns.
            misconception_embs_paths: Iterable of paths readable by
                ``np.load``, one per embedding matrix.
                NOTE(review): each matrix is presumably shaped
                (n_misconceptions, dim) -- confirm against the files.
        """
        # Model initialization.
        self.model = SentenceTransformer(model_name)
        self.misconception_mapping = pd.read_parquet(misconception_mapping_path)
        # Series of misconception names indexed by MisconceptionId.
        self.misconception_names = self.misconception_mapping.set_index(
            "MisconceptionId"
        )["MisconceptionName"]
        self.misconception_embs = [np.load(path) for path in misconception_embs_paths]

    def preprocess(self, df):
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Every text column is stripped of surrounding whitespace, and in the
        four ``Answer{A-D}Text`` columns the literal ``"Only\\n"`` is replaced
        by ``"Only "``.

        Args:
            df: Wide-format frame containing the ``Answer{A-D}Text`` columns.

        Returns:
            A new DataFrame with the cleaned text.
        """
        df_new = df.copy()
        # Treat both classic object columns and pandas string-dtype columns as
        # text, so stripping is not skipped when strings are not stored as
        # ``object``.
        text_columns = [
            col
            for col, dtype in df.dtypes.items()
            if dtype == object or isinstance(dtype, pd.StringDtype)
        ]
        for col in text_columns:
            df_new[col] = df_new[col].str.strip()
        for option in self._OPTIONS:
            answer_col = f"Answer{option}Text"
            # regex=False: the pattern is a literal, independent of the
            # pandas-version-specific default for ``regex``.
            df_new[answer_col] = df_new[answer_col].str.replace(
                "Only\n", "Only ", regex=False
            )
        return df_new

    def wide_to_long(self, df):
        """Convert the data from wide format to long format.

        Each input row (one question) becomes one output row per *incorrect*
        answer option. An output row keeps the input columns up to and
        including ``QuestionText`` and adds ``CorrectAnswerText``, ``Answer``,
        ``AnswerText`` and, when the matching ``Misconception{X}Id`` value is
        present and not NaN, ``MisconceptionId``. A leading
        ``QuestionId_Answer`` key column (``"<QuestionId>_<Answer>"``) is
        inserted.

        Args:
            df: Wide-format frame with ``QuestionId``, ``CorrectAnswer``,
                ``QuestionText`` and the ``Answer{A-D}Text`` columns.

        Returns:
            The long-format DataFrame with a fresh 0..n-1 index. For an input
            without rows, an empty frame carrying the expected columns.
        """
        rows = []
        for _, row in df.iterrows():
            correct_option = row["CorrectAnswer"]
            # Copy the slice so adding fields can never write through to the
            # row being iterated.
            base = row[:"QuestionText"].copy()
            base["CorrectAnswerText"] = row[f"Answer{correct_option}Text"]
            for option in self._OPTIONS:
                if option == correct_option:
                    continue
                row_new = base.copy()
                row_new["Answer"] = option
                row_new["AnswerText"] = row[f"Answer{option}Text"]
                misconception_id = row.get(f"Misconception{option}Id", np.nan)
                if not pd.isna(misconception_id):
                    row_new["MisconceptionId"] = int(misconception_id)
                rows.append(row_new)

        if not rows:
            # Nothing to expand: build the empty result explicitly instead of
            # failing on the missing "QuestionId" column below.
            leading = list(df.columns)
            if "QuestionText" in leading:
                leading = leading[: leading.index("QuestionText") + 1]
            return pd.DataFrame(
                columns=[
                    "QuestionId_Answer",
                    *leading,
                    "CorrectAnswerText",
                    "Answer",
                    "AnswerText",
                ]
            )

        df_long = pd.DataFrame(rows).reset_index(drop=True)
        df_long.insert(
            0,
            "QuestionId_Answer",
            df_long["QuestionId"].astype(str) + "_" + df_long["Answer"],
        )
        return df_long

    def predict(self, test_df, top_k=25):
        """Run prediction on the test data.

        Args:
            test_df: Wide-format frame accepted by ``wide_to_long`` that also
                provides ``SubjectName`` and ``ConstructName`` among the
                columns kept by that method.
            top_k: Number of best-ranked candidates kept per row (default 25,
                the previously hard-coded value).

        Returns:
            The long-format frame with two added columns: ``anchor`` (the
            prompt text that was embedded) and ``PredictedMisconceptions``
            (list of the ``top_k`` best candidates per row). The candidates
            are row positions in the misconception embedding matrices.
            NOTE(review): whether those positions equal ``MisconceptionId``
            values depends on how the embeddings were built -- confirm.

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        if top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {top_k}")

        test_df_long = self.wide_to_long(test_df)

        prompt = (
            "Subject: {SubjectName}\n"
            "Construct: {ConstructName}\n"
            "Question: {QuestionText}\n"
            "Incorrect Answer: {AnswerText}"
        )
        test_df_long["anchor"] = [
            prompt.format(
                SubjectName=row["SubjectName"],
                ConstructName=row["ConstructName"],
                QuestionText=row["QuestionText"],
                AnswerText=row["AnswerText"],
            )
            for _, row in test_df_long.iterrows()
        ]

        if test_df_long.empty:
            # No incorrect answers to score; skip the encoder entirely.
            test_df_long["PredictedMisconceptions"] = []
            return test_df_long

        # Embed the test prompts. A plain list is passed so the encoder never
        # depends on pandas index semantics.
        embs_test_query = self.model.encode(
            test_df_long["anchor"].tolist(), normalize_embeddings=True
        )

        # Similarity computation and rank calculation: for every embedding
        # matrix, rank[i, j] is the 0-based position of candidate j when the
        # candidates are ordered by decreasing similarity to prompt i.
        per_matrix_ranks = []
        for embs_misconception in self.misconception_embs:
            similarity = cosine_similarity(embs_test_query, embs_misconception)
            # The stable sort belongs on this first argsort: it is the only
            # one that can see ties, so equal similarities are resolved by the
            # lower candidate position.
            order = np.argsort(-similarity, axis=1, kind="stable")
            per_matrix_ranks.append(np.argsort(order, axis=1, kind="stable"))
        rank_test = np.array(per_matrix_ranks)

        # Blend the matrices by averaging the fourth root of the ranks, which
        # compresses differences among poorly ranked candidates.
        rank_ave_test = np.mean(rank_test ** (1 / 4), axis=0)
        argsort_test = np.argsort(rank_ave_test, axis=1, kind="stable")

        test_df_long["PredictedMisconceptions"] = argsort_test[:, :top_k].tolist()
        return test_df_long