Commit ae51d62
Parent(s): 07ba2c0
feat: 3 feature classes + main endpoint
- .gitignore +1 -0
- gemma2b.py +62 -0
- hypothesis.py +117 -0
- prediction.py +63 -0
- randomforest.py +40 -0
.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__
gemma2b.py
ADDED
@@ -0,0 +1,62 @@
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.nn.functional import cosine_similarity
from collections import Counter
import numpy as np


class Gemma2BDependencies:
    def __init__(self, question: str, answer: str):
        self.question = question
        self.answer = answer
        self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
        self.model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")
        # Fall back to CPU when CUDA is unavailable
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def calculate_perplexity(self):
        inputs = self.tokenizer(self.answer, return_tensors="pt",
                                truncation=True, max_length=1024)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Calculate the model's loss on the answer and exponentiate it
        with torch.no_grad():
            outputs = self.model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            perplexity = torch.exp(loss)

        return perplexity.item()

    def calculate_burstiness(self):
        # Tokenize the answer with the Gemma tokenizer
        tokens = self.tokenizer.tokenize(self.answer)

        # Count token frequencies
        frequency_counts = list(Counter(tokens).values())

        # Calculate variance and mean of frequencies
        variance = np.var(frequency_counts)
        mean = np.mean(frequency_counts)

        # Compute Variance-to-Mean Ratio (VMR) for burstiness
        vmr = variance / mean if mean > 0 else 0
        return vmr

    def get_embedding(self, text: str):
        inputs = self.tokenizer(text, return_tensors="pt",
                                truncation=True, max_length=1024)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        last_hidden_states = outputs.hidden_states[-1]
        # Average the token embeddings to get a sentence-level embedding
        embedding = torch.mean(last_hidden_states, dim=1)
        return embedding

    def calculate_cosine_similarity(self):
        embedding1 = self.get_embedding(self.question)
        embedding2 = self.get_embedding(self.answer)
        # Embeddings are (1, hidden_size), the shape cosine_similarity expects
        return cosine_similarity(embedding1, embedding2).item()
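A minimal usage sketch of Gemma2BDependencies (the question/answer strings below are made up for illustration, and the first call downloads the google/gemma-2b weights):

deps = Gemma2BDependencies(
    question="What is burstiness in text?",
    answer="Burstiness measures how unevenly tokens repeat across a passage.")

print(deps.calculate_perplexity())         # perplexity of the answer under Gemma-2B
print(deps.calculate_burstiness())         # variance-to-mean ratio of token frequencies
print(deps.calculate_cosine_similarity())  # question-answer embedding similarity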
hypothesis.py
ADDED
@@ -0,0 +1,117 @@
import nltk
import joblib
import textstat
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gemma2b import Gemma2BDependencies


class BaseModelHypothesis:
    def __init__(self, question: str, answer: str):
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')

        self.question = question
        self.answer = answer

        self.analyzer = SentimentIntensityAnalyzer()
        self.lexicon_df = pd.read_csv(
            "https://storage.googleapis.com/ta-ai-detector/datasets/NRC-Emotion-Lexicon.csv")
        self.emotion_lexicon = self.process_emotion_lexicon()
        self.gemma2bdependencies = Gemma2BDependencies(
            self.question, self.answer)

        self.features_normalized_text_length = []
        self.features_not_normalized = []

        self.scaler_normalized_text_length = joblib.load(
            "scaler-normalized-text-length.joblib")
        self.scaler_not_normalized = joblib.load(
            "scaler-not-normalized.joblib")

    def process_emotion_lexicon(self):
        emotion_lexicon = {}
        for _, row in self.lexicon_df.iterrows():
            if row["word"] not in emotion_lexicon:
                emotion_lexicon[row["word"]] = []
            emotion_lexicon[row["word"]].append(row["emotion"])
        return emotion_lexicon

    def calculate_normalized_text_length_features(self):
        # The helpers below read self.answer directly, so they take no arguments
        self.features_normalized_text_length = self.extract_pos_features()
        self.features_normalized_text_length = self.features_normalized_text_length + \
            self.calculate_emotion_proportions()
        self.features_normalized_text_length.append(
            self.measure_unique_word_ratio())

        return self.scaler_normalized_text_length.transform(
            np.array(self.features_normalized_text_length).astype(np.float32).reshape(1, -1))

    def calculate_not_normalized_features(self):
        self.features_not_normalized.append(
            self.measure_sentiment_intensity())
        self.features_not_normalized = self.features_not_normalized + \
            self.measure_readability()
        self.features_not_normalized.append(
            self.gemma2bdependencies.calculate_perplexity())
        self.features_not_normalized.append(
            self.gemma2bdependencies.calculate_burstiness())

        return self.scaler_not_normalized.transform(
            np.array(self.features_not_normalized).astype(np.float32).reshape(1, -1))

    def extract_pos_features(self):
        words = nltk.word_tokenize(self.answer)
        pos_tags = nltk.pos_tag(words)
        desired_tags = ["JJ", "VB", "RB", "PRP", "DT", "IN", "NN", "NNS"]
        pos_counts = defaultdict(int, {tag: 0 for tag in desired_tags})

        for _, pos in pos_tags:
            if pos in pos_counts:
                pos_counts[pos] += 1

        total_words = len(words)
        pos_ratios = [pos_counts[tag] / total_words for tag in desired_tags]

        return pos_ratios

    def measure_sentiment_intensity(self):
        sentiment = self.analyzer.polarity_scores(self.answer)
        return sentiment["compound"]

    def measure_readability(self):
        gunning_fog = textstat.gunning_fog(self.answer)
        smog_index = textstat.smog_index(self.answer)
        dale_chall_score = textstat.dale_chall_readability_score(self.answer)

        return [gunning_fog, smog_index, dale_chall_score]

    def calculate_emotion_proportions(self):
        tokens = nltk.word_tokenize(self.answer)

        total_tokens = len(tokens)

        emotion_counts = {emotion: 0 for emotion in [
            "negative", "positive", "fear", "anger", "trust",
            "sadness", "disgust", "anticipation", "joy", "surprise"]}

        for token in tokens:
            if token in self.emotion_lexicon:
                for emotion in self.emotion_lexicon[token]:
                    emotion_counts[emotion] += 1

        proportions = {emotion: count / total_tokens for emotion,
                       count in emotion_counts.items()}

        return [
            proportions["negative"], proportions["positive"], proportions["fear"],
            proportions["anger"], proportions["trust"], proportions["sadness"],
            proportions["disgust"], proportions["anticipation"], proportions["joy"],
            proportions["surprise"]
        ]

    def measure_unique_word_ratio(self):
        tokens = nltk.word_tokenize(self.answer)
        total_words = len(tokens)

        unique_words = len(Counter(tokens).keys())

        return unique_words / total_words
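A sketch of how the two feature vectors are produced (strings are illustrative; the scaler .joblib files and the lexicon CSV must be reachable). Their widths add up to the 25 extra inputs expected by the classifier head in prediction.py:

hypothesis = BaseModelHypothesis(
    question="Explain photosynthesis.",
    answer="Plants convert light energy into chemical energy stored as glucose.")

normalized = hypothesis.calculate_normalized_text_length_features()
# shape (1, 19): 8 POS ratios + 10 emotion proportions + 1 unique-word ratio
not_normalized = hypothesis.calculate_not_normalized_features()
# shape (1, 6): sentiment + 3 readability scores + perplexity + burstiness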
prediction.py
ADDED
@@ -0,0 +1,63 @@
from fastapi import FastAPI
from pydantic import BaseModel
from hypothesis import BaseModelHypothesis
from randomforest import RandomForestDependencies
import torch.nn as nn
import torch


class AlbertCustomClassificationHead(nn.Module):
    def __init__(self, albert_model, dropout_rate=0.1):
        super(AlbertCustomClassificationHead, self).__init__()
        self.albert_model = albert_model
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(1024 + 25, 1)

    def forward(self, input_ids, attention_mask, additional_features, labels=None):
        albert_output = self.albert_model(
            input_ids=input_ids, attention_mask=attention_mask).pooler_output

        combined_features = torch.cat(
            [albert_output, additional_features], dim=1)

        dropout_output = self.dropout(combined_features)

        logits = self.classifier(dropout_output)

        if labels is not None:
            loss_fn = nn.BCEWithLogitsLoss()
            labels = labels.unsqueeze(1)
            loss = loss_fn(logits, labels.float())
            return logits, loss
        else:
            return logits


app = FastAPI()


class PredictRequest(BaseModel):
    question: str
    answer: str
    backspace_count: int
    typing_duration: int
    letter_click_counts: dict[str, int]


@app.post("/predict")
async def predict(request: PredictRequest):
    request_dict = request.model_dump()

    question = request_dict.get("question")
    answer = request_dict.get("answer")
    backspace_count = request_dict.get("backspace_count")
    typing_duration = request_dict.get("typing_duration")
    letter_click_counts = request_dict.get("letter_click_counts")

    hypothesis = BaseModelHypothesis(question, answer)
    features_normalized_text_length = hypothesis.calculate_normalized_text_length_features()
    features_not_normalized = hypothesis.calculate_not_normalized_features()

    # Placeholder response: the classifier is not wired up yet in this commit
    return backspace_count
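One way to exercise the endpoint locally is FastAPI's TestClient; the payload values below are illustrative, and constructing BaseModelHypothesis inside the handler loads Gemma-2B, so the first request is slow:

from fastapi.testclient import TestClient
from prediction import app

client = TestClient(app)
response = client.post("/predict", json={
    "question": "Describe your favourite book.",
    "answer": "My favourite book is a mystery novel about a lighthouse keeper.",
    "backspace_count": 12,
    "typing_duration": 95,
    "letter_click_counts": {"a": 10, "e": 14, "o": 7},
})
print(response.json())  # currently just echoes backspace_count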
randomforest.py
ADDED
@@ -0,0 +1,40 @@
from gemma2b import Gemma2BDependencies
from collections import Counter


class RandomForestDependencies:
    def __init__(self, question: str, answer: str):
        self.question = question
        self.answer = answer

        self.gemma2bdependencies = Gemma2BDependencies(
            self.question, self.answer)
        self.random_forest_features = []

    def calculate_features(self, probability: float, backspace_count: int,
                           typing_duration: int, letter_click_counts: dict[str, int]):
        # Gemma2BDependencies already holds the question and answer
        cosine_similarity = self.gemma2bdependencies.calculate_cosine_similarity()
        backspace_count_normalized = backspace_count / len(self.answer)
        typing_duration_normalized = typing_duration / len(self.answer)
        letter_discrepancy = self.calculate_letter_discrepancy(letter_click_counts)

        self.random_forest_features = [
            cosine_similarity, probability, backspace_count_normalized,
            typing_duration_normalized, letter_discrepancy
        ]

    def calculate_letter_discrepancy(self, letter_click_counts: dict[str, int]):
        # Calculate letter frequencies in the text
        text_letter_counts = Counter(self.answer.lower())

        # Calculate the ratio of click counts to text counts for each letter,
        # adjusting for letters not in the text
        ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
                  for letter in "abcdefghijklmnopqrstuvwxyz"]

        # Average the ratios and normalize by the length of the text
        average_ratio = sum(ratios) / len(ratios)
        discrepancy_ratio_normalized = average_ratio / \
            (len(self.answer) if len(self.answer) > 0 else 1)

        return discrepancy_ratio_normalized
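A sketch of how the random-forest feature vector might be assembled; the probability argument is assumed to come from the ALBERT classification head in prediction.py, and all values here are illustrative:

rf = RandomForestDependencies(
    question="What motivates you?",
    answer="I am motivated by solving hard problems with small teams.")

rf.calculate_features(
    probability=0.73,
    backspace_count=8,
    typing_duration=120,
    letter_click_counts={"a": 5, "e": 9, "i": 4})

print(rf.random_forest_features)
# [cosine_similarity, probability, backspaces/len(answer), duration/len(answer), letter_discrepancy]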