Spaces:
Running
Running
File size: 1,948 Bytes
84f0cff 7a92e6c 84f0cff 279839c 84f0cff 7a92e6c 84f0cff 279839c 84f0cff 279839c 84f0cff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
from sentence_transformers import SentenceTransformer, util
from collections import Counter
class SecondaryModelDependencies:
    """Build the feature vector consumed by the secondary model.

    Holds a sentence-embedding model (loaded once in ``__init__``) and
    exposes helpers that turn a typed answer plus its input telemetry
    into a flat list of numeric features.
    """

    # Letters over which click/text ratios are averaged.
    _ALPHABET = "abcdefghijklmnopqrstuvwxyz"

    def __init__(self):
        """Load the ``all-mpnet-base-v2`` sentence-transformers model."""
        self.text_similarity_model = SentenceTransformer(
            'sentence-transformers/all-mpnet-base-v2')

    def calculate_features(self, answer: str, probability: float,
                           backspace_count: int,
                           letter_click_counts: dict[str, int],
                           gpt4o_answer: str) -> list[float]:
        """Return the feature list for one answer.

        Args:
            answer: The text being scored.
            probability: Passed through unchanged as the first feature.
            backspace_count: Number of backspaces recorded for the answer.
            letter_click_counts: Mapping of lowercase letter to click count.
            gpt4o_answer: Text that ``answer`` is compared against
                (presumably a GPT-4o generated answer; confirm with caller).

        Returns:
            ``[probability, backspace_count_normalized, letter_discrepancy,
            cosine_sim_gpt4o]``.
        """
        # Floor the divisor at 1 so an empty answer does not raise
        # ZeroDivisionError; this mirrors the guard used in
        # calculate_letter_discrepancy.
        answer_length = len(answer) if len(answer) > 0 else 1
        backspace_count_normalized = backspace_count / answer_length
        letter_discrepancy = self.calculate_letter_discrepancy(
            answer, letter_click_counts)
        cosine_sim_gpt4o = self.calculate_similarity_gpt4o(
            answer, gpt4o_answer)
        return [
            probability,
            backspace_count_normalized,
            letter_discrepancy,
            cosine_sim_gpt4o,
        ]

    def calculate_letter_discrepancy(self, text: str,
                                     letter_click_counts: dict[str, int]) -> float:
        """Return the length-normalized mean click-to-text ratio per letter.

        For each letter a-z the click count is divided by (occurrences of
        that letter in the lowercased text + 1); the 26 ratios are averaged
        and the mean is divided by ``len(text)`` (or 1 for empty text).
        """
        # Calculate letter frequencies in the text
        text_letter_counts = Counter(text.lower())
        # The +1 keeps the ratio defined for letters absent from the text.
        ratios = [
            letter_click_counts.get(letter, 0)
            / (text_letter_counts.get(letter, 0) + 1)
            for letter in self._ALPHABET
        ]
        # Average the ratios and normalize by the length of the text
        average_ratio = sum(ratios) / len(ratios)
        text_length = len(text) if len(text) > 0 else 1
        return average_ratio / text_length

    def calculate_similarity_gpt4o(self, answer: str, gpt4o_answer: str) -> float:
        """Return the cosine similarity between the two texts' embeddings."""
        embedding1 = self.text_similarity_model.encode(
            [answer], convert_to_tensor=True)
        embedding2 = self.text_similarity_model.encode(
            [gpt4o_answer], convert_to_tensor=True)
        cosine_scores = util.cos_sim(embedding1, embedding2)
        # One text on each side, so the score holds a single value.
        return cosine_scores.item()
|