bearking58 committed
Commit ae51d62 · 1 Parent(s): 07ba2c0

feat: 3 feature classes + main endpoint

Files changed (5)
  1. .gitignore +1 -0
  2. gemma2b.py +62 -0
  3. hypothesis.py +117 -0
  4. prediction.py +63 -0
  5. randomforest.py +40 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
gemma2b.py ADDED
@@ -0,0 +1,62 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+ from torch.nn.functional import cosine_similarity
+ from collections import Counter
+ import numpy as np
+
+
+ class Gemma2BDependencies:
+     def __init__(self, question: str, answer: str):
+         self.question = question
+         self.answer = answer
+         self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
+         self.model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(self.device)
+
+     def calculate_perplexity(self):
+         inputs = self.tokenizer(self.answer, return_tensors="pt",
+                                 truncation=True, max_length=1024)
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         # Forward pass with labels to obtain the language-modeling loss
+         with torch.no_grad():
+             outputs = self.model(**inputs, labels=inputs["input_ids"])
+             loss = outputs.loss
+             perplexity = torch.exp(loss)
+
+         return perplexity.item()
+
+     def calculate_burstiness(self):
+         # Tokenize the answer with the Gemma tokenizer
+         tokens = self.tokenizer.tokenize(self.answer)
+
+         # Count token frequencies
+         frequency_counts = list(Counter(tokens).values())
+
+         # Calculate variance and mean of frequencies
+         variance = np.var(frequency_counts)
+         mean = np.mean(frequency_counts)
+
+         # Compute Variance-to-Mean Ratio (VMR) for burstiness
+         vmr = variance / mean if mean > 0 else 0
+         return vmr
+
+     def get_embedding(self, text: str):
+         inputs = self.tokenizer(text, return_tensors="pt",
+                                 truncation=True, max_length=1024)
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = self.model(**inputs, output_hidden_states=True)
+
+         last_hidden_states = outputs.hidden_states[-1]
+         # Average the token embeddings to get a sentence-level embedding
+         embedding = torch.mean(last_hidden_states, dim=1)
+         return embedding
+
+     def calculate_cosine_similarity(self):
+         embedding1 = self.get_embedding(self.question)
+         embedding2 = self.get_embedding(self.answer)
+         # Embeddings are (1, hidden) tensors, the shape cosine_similarity expects
+         return cosine_similarity(embedding1, embedding2).item()
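
For illustration only (not part of this diff), a minimal usage sketch of Gemma2BDependencies, assuming access to the gated google/gemma-2b checkpoint and enough memory to load it; the toy question/answer pair is made up:

# Sketch: exercise the three Gemma-based features on a toy pair.
from gemma2b import Gemma2BDependencies

deps = Gemma2BDependencies(
    question="What causes rainbows?",
    answer="Rainbows appear when sunlight is refracted and reflected inside water droplets.",
)

print("perplexity:", deps.calculate_perplexity())         # exp of the LM loss on the answer
print("burstiness (VMR):", deps.calculate_burstiness())   # variance-to-mean ratio of token counts
print("question-answer cosine:", deps.calculate_cosine_similarity())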
hypothesis.py ADDED
@@ -0,0 +1,117 @@
+ import nltk
+ import joblib
+ import textstat
+ import pandas as pd
+ import numpy as np
+ from collections import defaultdict, Counter
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+ from gemma2b import Gemma2BDependencies
+
+
+ class BaseModelHypothesis:
+     def __init__(self, question: str, answer: str):
+         nltk.download('punkt')
+         nltk.download('averaged_perceptron_tagger')
+
+         self.question = question
+         self.answer = answer
+
+         self.analyzer = SentimentIntensityAnalyzer()
+         self.lexicon_df = pd.read_csv(
+             "https://storage.googleapis.com/ta-ai-detector/datasets/NRC-Emotion-Lexicon.csv")
+         self.emotion_lexicon = self.process_emotion_lexicon()
+         self.gemma2bdependencies = Gemma2BDependencies(
+             self.question, self.answer)
+
+         self.features_normalized_text_length = []
+         self.features_not_normalized = []
+
+         self.scaler_normalized_text_length = joblib.load(
+             "scaler-normalized-text-length.joblib")
+         self.scaler_not_normalized = joblib.load(
+             "scaler-not-normalized.joblib")
+
+     def process_emotion_lexicon(self):
+         emotion_lexicon = {}
+         for _, row in self.lexicon_df.iterrows():
+             if row["word"] not in emotion_lexicon:
+                 emotion_lexicon[row["word"]] = []
+             emotion_lexicon[row["word"]].append(row["emotion"])
+         return emotion_lexicon
+
+     def calculate_normalized_text_length_features(self):
+         # POS ratios, emotion proportions and the unique-word ratio are all
+         # normalized by text length, so they share one scaler
+         self.features_normalized_text_length = self.extract_pos_features()
+         self.features_normalized_text_length = self.features_normalized_text_length + \
+             self.calculate_emotion_proportions()
+         self.features_normalized_text_length.append(
+             self.measure_unique_word_ratio())
+
+         return self.scaler_normalized_text_length.transform(np.array(self.features_normalized_text_length).astype(np.float32).reshape(1, -1))
+
+     def calculate_not_normalized_features(self):
+         self.features_not_normalized.append(
+             self.measure_sentiment_intensity())
+         self.features_not_normalized = self.features_not_normalized + \
+             self.measure_readability()
+         self.features_not_normalized.append(
+             self.gemma2bdependencies.calculate_perplexity())
+         self.features_not_normalized.append(
+             self.gemma2bdependencies.calculate_burstiness())
+
+         return self.scaler_not_normalized.transform(np.array(self.features_not_normalized).astype(np.float32).reshape(1, -1))
+
+     def extract_pos_features(self):
+         words = nltk.word_tokenize(self.answer)
+         pos_tags = nltk.pos_tag(words)
+         desired_tags = ["JJ", "VB", "RB", "PRP", "DT", "IN", "NN", "NNS"]
+         pos_counts = defaultdict(int, {tag: 0 for tag in desired_tags})
+
+         for _, pos in pos_tags:
+             if pos in pos_counts:
+                 pos_counts[pos] += 1
+
+         total_words = len(words)
+         pos_ratios = [pos_counts[tag] / total_words for tag in desired_tags]
+
+         return pos_ratios
+
+     def measure_sentiment_intensity(self):
+         sentiment = self.analyzer.polarity_scores(self.answer)
+         return sentiment["compound"]
+
+     def measure_readability(self):
+         gunning_fog = textstat.gunning_fog(self.answer)
+         smog_index = textstat.smog_index(self.answer)
+         dale_chall_score = textstat.dale_chall_readability_score(self.answer)
+
+         return [gunning_fog, smog_index, dale_chall_score]
+
+     def calculate_emotion_proportions(self):
+         tokens = nltk.word_tokenize(self.answer)
+
+         total_tokens = len(tokens)
+
+         emotion_counts = {emotion: 0 for emotion in [
+             "negative", "positive", "fear", "anger", "trust",
+             "sadness", "disgust", "anticipation", "joy", "surprise"]}
+
+         for token in tokens:
+             if token in self.emotion_lexicon:
+                 for emotion in self.emotion_lexicon[token]:
+                     emotion_counts[emotion] += 1
+
+         proportions = {emotion: count / total_tokens for emotion,
+                        count in emotion_counts.items()}
+
+         return [
+             proportions["negative"], proportions["positive"], proportions["fear"],
+             proportions["anger"], proportions["trust"], proportions["sadness"],
+             proportions["disgust"], proportions["anticipation"],
+             proportions["joy"], proportions["surprise"]
+         ]
+
+     def measure_unique_word_ratio(self):
+         tokens = nltk.word_tokenize(self.answer)
+         total_words = len(tokens)
+
+         unique_words = len(Counter(tokens).keys())
+
+         return unique_words / total_words
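
For illustration only (not part of this diff), a minimal sketch of producing the two scaled feature vectors, assuming the two .joblib scalers from the repo sit in the working directory, the NRC lexicon URL is reachable, and the Gemma checkpoint loads; the sample text is made up:

# Sketch: build both feature vectors for one answer.
from hypothesis import BaseModelHypothesis

hypothesis = BaseModelHypothesis(
    question="What causes rainbows?",
    answer="Rainbows appear when sunlight is refracted and reflected inside water droplets.",
)

normalized = hypothesis.calculate_normalized_text_length_features()   # 8 POS + 10 emotion + 1 unique-word -> shape (1, 19)
not_normalized = hypothesis.calculate_not_normalized_features()       # sentiment + 3 readability + perplexity + burstiness -> shape (1, 6)
print(normalized.shape, not_normalized.shape)                         # together: the 25 extra features used by the classifier head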
prediction.py ADDED
@@ -0,0 +1,63 @@
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from hypothesis import BaseModelHypothesis
+ from randomforest import RandomForestDependencies
+ import torch.nn as nn
+ import torch
+
+
+ class AlbertCustomClassificationHead(nn.Module):
+     def __init__(self, albert_model, dropout_rate=0.1):
+         super(AlbertCustomClassificationHead, self).__init__()
+         self.albert_model = albert_model
+         self.dropout = nn.Dropout(dropout_rate)
+         # ALBERT pooled output (1024) concatenated with 25 handcrafted features
+         self.classifier = nn.Linear(1024 + 25, 1)
+
+     def forward(self, input_ids, attention_mask, additional_features, labels=None):
+         albert_output = self.albert_model(
+             input_ids=input_ids, attention_mask=attention_mask).pooler_output
+
+         combined_features = torch.cat(
+             [albert_output, additional_features], dim=1)
+
+         dropout_output = self.dropout(combined_features)
+
+         logits = self.classifier(dropout_output)
+
+         if labels is not None:
+             loss_fn = nn.BCEWithLogitsLoss()
+             labels = labels.unsqueeze(1)
+             loss = loss_fn(logits, labels.float())
+             return logits, loss
+         else:
+             return logits
+
+
+ app = FastAPI()
+
+
+ class PredictRequest(BaseModel):
+     question: str
+     answer: str
+     backspace_count: int
+     typing_duration: int
+     letter_click_counts: dict[str, int]
+
+
+ @app.post("/predict")
+ async def predict(request: PredictRequest):
+     request_dict = request.model_dump()
+
+     question = request_dict.get("question")
+     answer = request_dict.get("answer")
+     backspace_count = request_dict.get("backspace_count")
+     typing_duration = request_dict.get("typing_duration")
+     letter_click_counts = request_dict.get("letter_click_counts")
+
+     hypothesis = BaseModelHypothesis(question, answer)
+     features_normalized_text_length = hypothesis.calculate_normalized_text_length_features()
+     features_not_normalized = hypothesis.calculate_not_normalized_features()
+
+     return request_dict.get("backspace_count")
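
For illustration only (not part of this diff), a minimal sketch of hitting the /predict endpoint with FastAPI's test client, so no server process is needed; it still assumes the scalers, lexicon and Gemma checkpoint are available since the handler builds BaseModelHypothesis, and the payload values are made up:

# Sketch: call the endpoint in-process via the test client.
from fastapi.testclient import TestClient
from prediction import app

client = TestClient(app)
payload = {
    "question": "What causes rainbows?",
    "answer": "Light refracts inside water droplets.",
    "backspace_count": 3,
    "typing_duration": 42,
    "letter_click_counts": {"a": 4, "e": 6, "t": 5},
}
response = client.post("/predict", json=payload)
print(response.status_code, response.json())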
randomforest.py ADDED
@@ -0,0 +1,40 @@
+ from gemma2b import Gemma2BDependencies
+ from collections import Counter
+
+
+ class RandomForestDependencies:
+     def __init__(self, question: str, answer: str):
+         self.question = question
+         self.answer = answer
+
+         self.gemma2bdependencies = Gemma2BDependencies(
+             self.question, self.answer)
+         self.random_forest_features = []
+
+     def calculate_features(self, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
+         cosine_similarity = self.gemma2bdependencies.calculate_cosine_similarity()
+         backspace_count_normalized = backspace_count / len(self.answer)
+         typing_duration_normalized = typing_duration / len(self.answer)
+         letter_discrepancy = self.calculate_letter_discrepancy(
+             letter_click_counts)
+
+         self.random_forest_features = [
+             cosine_similarity, probability, backspace_count_normalized,
+             typing_duration_normalized, letter_discrepancy
+         ]
+
+     def calculate_letter_discrepancy(self, letter_click_counts: dict[str, int]):
+         # Calculate letter frequencies in the text
+         text_letter_counts = Counter(self.answer.lower())
+
+         # Ratio of click counts to text counts for each letter, with +1
+         # smoothing for letters that never appear in the text
+         ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
+                   for letter in "abcdefghijklmnopqrstuvwxyz"]
+
+         # Average the ratios and normalize by the length of the text
+         average_ratio = sum(ratios) / len(ratios)
+         discrepancy_ratio_normalized = average_ratio / \
+             (len(self.answer) if len(self.answer) > 0 else 1)
+
+         return discrepancy_ratio_normalized
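
For illustration only (not part of this diff), a minimal sketch of assembling the five random-forest features; the probability value stands in for the ALBERT head's output and all numbers are made up:

# Sketch: build the feature list consumed by the (future) random forest.
from randomforest import RandomForestDependencies

rf = RandomForestDependencies(
    question="What causes rainbows?",
    answer="Light refracts inside water droplets.",
)
rf.calculate_features(
    probability=0.73,                                # hypothetical classifier probability
    backspace_count=3,
    typing_duration=42,
    letter_click_counts={"a": 4, "e": 6, "t": 5},
)
print(rf.random_forest_features)
# [cosine_similarity, probability, backspace_count_normalized,
#  typing_duration_normalized, letter_discrepancy]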