File size: 1,948 Bytes
84f0cff
 
 
 
 
 
 
 
 
7a92e6c
 
84f0cff
 
 
279839c
 
84f0cff
 
7a92e6c
84f0cff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279839c
84f0cff
 
 
279839c
84f0cff
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from sentence_transformers import SentenceTransformer, util
from collections import Counter


class SecondaryModelDependencies:
    """Compute the feature list built from an answer and its typing telemetry.

    Holds a sentence-embedding model that is used to score the similarity
    between a submitted answer and a reference GPT-4o answer.
    """

    def __init__(self):
        # Loaded once per instance; every similarity call reuses this model.
        self.text_similarity_model = SentenceTransformer(
            'sentence-transformers/all-mpnet-base-v2')

    def calculate_features(self, answer: str, probability: float, backspace_count: int,
                           letter_click_counts: dict[str, int], gpt4o_answer: str) -> list[float]:
        """Return the feature list for a single answer.

        Args:
            answer: The submitted answer text.
            probability: Passed through unchanged as the first feature.
            backspace_count: Number of backspaces recorded for the answer.
            letter_click_counts: Mapping of lowercase letter to click count.
            gpt4o_answer: Reference answer to compare ``answer`` against.

        Returns:
            ``[probability, backspace_count_normalized, letter_discrepancy,
            cosine_sim_gpt4o]`` in exactly that order.
        """
        # An empty answer would otherwise raise ZeroDivisionError. Treat its
        # length as 1, the same fallback calculate_letter_discrepancy uses.
        answer_length = len(answer) or 1
        backspace_count_normalized = backspace_count / answer_length
        letter_discrepancy = self.calculate_letter_discrepancy(
            answer, letter_click_counts)
        cosine_sim_gpt4o = self.calculate_similarity_gpt4o(
            answer, gpt4o_answer)

        return [
            probability, backspace_count_normalized, letter_discrepancy, cosine_sim_gpt4o
        ]

    def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]) -> float:
        """Return the mean click-to-occurrence ratio, normalized by text length.

        For each letter ``a``-``z`` the click count (0 when the letter is
        missing from ``letter_click_counts``) is divided by the number of
        case-insensitive occurrences of that letter in ``text`` plus one.
        The 26 ratios are averaged and the average is divided by
        ``len(text)``, or by 1 when ``text`` is empty.
        """
        # Case-insensitive character frequencies of the text.
        text_letter_counts = Counter(text.lower())

        # The "+ 1" keeps the denominator non-zero for letters absent from the text.
        ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
                  for letter in "abcdefghijklmnopqrstuvwxyz"]

        average_ratio = sum(ratios) / len(ratios)

        # Empty text falls back to a divisor of 1 to avoid dividing by zero.
        return average_ratio / (len(text) or 1)

    def calculate_similarity_gpt4o(self, answer: str, gpt4o_answer: str) -> float:
        """Return the cosine similarity between the two texts' embeddings.

        Each text is encoded separately with ``self.text_similarity_model``
        and the single resulting score is returned as a Python float.
        """
        embedding1 = self.text_similarity_model.encode(
            [answer], convert_to_tensor=True)
        embedding2 = self.text_similarity_model.encode(
            [gpt4o_answer], convert_to_tensor=True)
        cosine_scores = util.cos_sim(embedding1, embedding2)
        # One text on each side, so the score tensor holds a single value.
        return cosine_scores.item()