Merge pull request #8 from YakobusIP/main
- core-model-prediction/gemma2b_dependencies.py +1 -20
- core-model-prediction/hypothesis.py +86 -32
- core-model-prediction/main_model.py +42 -24
- core-model-prediction/models/{albert_model.pth → albert_weights.pth} +2 -2
- core-model-prediction/models/random_forest.joblib +0 -0
- core-model-prediction/prediction.py +8 -10
- core-model-prediction/random_forest_dependencies.py +2 -7
- core-model-prediction/random_forest_model.py +8 -1
- core-model-prediction/requirements.txt +2 -2
- core-model-prediction/scalers/rf_scaler.joblib +0 -0
- core-model-prediction/scalers/{scaler-normalized-text-length.joblib → torch-scaler-normalized-text-length.joblib} +0 -0
- core-model-prediction/scalers/{scaler-not-normalized.joblib → torch-scaler-not-normalized.joblib} +0 -0
core-model-prediction/gemma2b_dependencies.py
CHANGED
@@ -43,7 +43,7 @@ class Gemma2BDependencies:
 
     def calculate_burstiness(self, text: str):
         # Tokenize the text using GPT-2 tokenizer
-        tokens = self.tokenizer.
+        tokens = self.tokenizer.encode(text, add_special_tokens=False)
 
         # Count token frequencies
         frequency_counts = list(Counter(tokens).values())
@@ -55,22 +55,3 @@ class Gemma2BDependencies:
         # Compute Variance-to-Mean Ratio (VMR) for burstiness
         vmr = variance / mean if mean > 0 else 0
         return vmr
-
-    def get_embedding(self, text: str):
-        inputs = self.tokenizer(text, return_tensors="pt",
-                                truncation=True, max_length=1024)
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-        with torch.no_grad():
-            outputs = self.model(**inputs, output_hidden_states=True)
-
-        last_hidden_states = outputs.hidden_states[-1]
-        # Average the token embeddings to get a sentence-level embedding
-        embedding = torch.mean(last_hidden_states, dim=1)
-        return embedding
-
-    def calculate_cosine_similarity(self, question: str, answer: str):
-        embedding1 = self.get_embedding(question)
-        embedding2 = self.get_embedding(answer)
-        # Ensure the embeddings are in the correct shape for cosine_similarity
-        return cosine_similarity(embedding1, embedding2).item()
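The calculate_burstiness method kept here reduces to a variance-to-mean ratio (VMR) over token frequencies. A minimal standalone sketch of that calculation, assuming the token ids come from the tokenizer's encode(text, add_special_tokens=False) call shown above (the helper name burstiness_vmr is illustrative, not part of this file):

from collections import Counter
import numpy as np

def burstiness_vmr(token_ids):
    # Count how often each token id occurs in the text
    frequency_counts = list(Counter(token_ids).values())
    if not frequency_counts:
        return 0.0
    mean = np.mean(frequency_counts)
    variance = np.var(frequency_counts)
    # Variance-to-Mean Ratio: values above 1 indicate bursty token reuse
    return float(variance / mean) if mean > 0 else 0.0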
core-model-prediction/hypothesis.py
CHANGED
@@ -3,24 +3,47 @@ import joblib
 import textstat
 import pandas as pd
 import numpy as np
-from 
+from typing import List
+from collections import defaultdict
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from gemma2b_dependencies import Gemma2BDependencies
+from string import punctuation
 
 
 class BaseModelHypothesis:
     def __init__(self):
         nltk.download('punkt')
+        nltk.download('wordnet')
         nltk.download('averaged_perceptron_tagger')
 
         self.analyzer = SentimentIntensityAnalyzer()
         self.lexicon_df = pd.read_csv(
-            "https://storage.googleapis.com/
+            "https://storage.googleapis.com/interview-ai-detector/higher-accuracy-final-model/NRC-Emotion-Lexicon.csv")
         self.emotion_lexicon = self.process_emotion_lexicon()
+        self.lemmatizer = nltk.stem.WordNetLemmatizer()
         self.gemma2bdependencies = Gemma2BDependencies()
 
-        self.
+        self.additional_feature_columns = [
+            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
+            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
+            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
+            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
+            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
+            "surprise_emotion_proportions", "unique_words_ratio", "perplexity", "burstiness"
+        ]
+
+        self.features_normalized_text_length = [
+            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
+            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
+            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
+            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
+            "surprise_emotion_proportions", "unique_words_ratio"
+        ]
+
+        self.features_not_normalized = [
+            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
+            "perplexity", "burstiness"
+        ]
 
         self.scaler_normalized_text_length = joblib.load(
             "scalers/scaler-normalized-text-length.joblib")
@@ -35,32 +58,43 @@ class BaseModelHypothesis:
             emotion_lexicon[row["word"]].append(row["emotion"])
         return emotion_lexicon
 
-    def 
+    def calculate_features_dataframe(self, text: str) -> np.ndarray:
+        normalized_text_length_features = self.calculate_normalized_text_length_features(
             text)
-        self.
-        self.features_not_normalized.
-        self.
+        not_normalized_features = self.calculate_not_normalized_features(text)
+        all_features = normalized_text_length_features + not_normalized_features
+        features_df = pd.DataFrame(
+            [all_features], columns=self.additional_feature_columns)
+
+        # Scaling features
+        features_df[self.features_normalized_text_length] = self.scaler_normalized_text_length.transform(
+            features_df[self.features_normalized_text_length])
+        features_df[self.features_not_normalized] = self.scaler_not_normalized.transform(
+            features_df[self.features_not_normalized])
+
+        ordered_df = features_df[self.additional_feature_columns]
+
+        return ordered_df.values.astype(np.float32).reshape(1, -1)
+
+    def calculate_normalized_text_length_features(self, text: str) -> List[float]:
+        pos_features = self.extract_pos_features(text)
+        emotion_features = self.calculate_emotion_proportions(text)
+        unique_word_ratio = [self.measure_unique_word_ratio(text)]
+        features = pos_features + emotion_features + unique_word_ratio
+        return features
+
+    def calculate_not_normalized_features(self, text: str) -> List[float]:
+        sentiment_intensity = self.measure_sentiment_intensity(text)
+        readability_scores = self.measure_readability(text)
+        perplexity = [self.gemma2bdependencies.calculate_perplexity(text)]
+        burstiness = [self.gemma2bdependencies.calculate_burstiness(text)]
+        features = sentiment_intensity + readability_scores + perplexity + burstiness
+        return features
 
     def extract_pos_features(self, text: str):
         words = nltk.word_tokenize(text)
         pos_tags = nltk.pos_tag(words)
-        desired_tags = ["
+        desired_tags = ["NN", "NNS", "JJ", "IN", "DT", "VB", "PRP", "RB"]
         pos_counts = defaultdict(int, {tag: 0 for tag in desired_tags})
 
         for _, pos in pos_tags:
@@ -83,20 +117,37 @@ class BaseModelHypothesis:
 
         return [gunning_fog, smog_index, dale_chall_score]
 
+    def __penn2morphy(self, penntag):
+        morphy_tag = {
+            'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',  # Nouns
+            'JJ': 'a', 'JJR': 'a', 'JJS': 'a',  # Adjectives
+            'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',  # Verbs
+            'RB': 'r', 'RBR': 'r', 'RBS': 'r',  # Adverbs
+            # Pronouns, determiners, prepositions, modal verbs
+            'PRP': 'n', 'PRP$': 'n', 'DT': 'n', 'IN': 'n', 'MD': 'v',
+            # Others, treated as nouns unless a better fit is found
+            'CC': 'n', 'CD': 'n', 'EX': 'n', 'FW': 'n', 'POS': 'n', 'TO': 'n', 'WDT': 'n', 'WP': 'n', 'WP$': 'n', 'WRB': 'n', 'PDT': 'n'
+        }
+        return morphy_tag.get(penntag[:2], 'n')
+
     def calculate_emotion_proportions(self, text: str):
         tokens = nltk.word_tokenize(text)
+        tagged_tokens = nltk.pos_tag(tokens)
+
+        lemmas = [self.lemmatizer.lemmatize(
+            token.lower(), pos=self.__penn2morphy(tag)) for token, tag in tagged_tokens]
 
+        total_lemmas = len(lemmas)
 
         emotion_counts = {emotion: 0 for emotion in [
             "negative", "positive", "fear", "anger", "trust", "sadness", "disgust", "anticipation", "joy", "surprise"]}
 
-        for 
-            if 
-                for emotion in self.emotion_lexicon[
+        for lemma in lemmas:
+            if lemma in self.emotion_lexicon:
+                for emotion in self.emotion_lexicon[lemma]:
                     emotion_counts[emotion] += 1
 
-        proportions = {emotion: count / 
+        proportions = {emotion: count / total_lemmas for emotion,
                        count in emotion_counts.items()}
 
         return [
@@ -105,9 +156,12 @@ class BaseModelHypothesis:
         ]
 
     def measure_unique_word_ratio(self, text: str):
-        tokens = nltk.word_tokenize(text)
+        tokens = nltk.word_tokenize(text.lower())
+
+        tokens = [token for token in tokens if token not in punctuation]
+
         total_words = len(tokens)
 
-        unique_words = len(
+        unique_words = len(set(tokens))
 
         return (unique_words / total_words)
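Taken together, the new helpers give BaseModelHypothesis a single entry point that returns the 25 additional features already scaled and ordered for the main model. A hedged usage sketch (it constructs Gemma2BDependencies, so it assumes the Gemma weights and both torch scalers are available locally):

hypothesis = BaseModelHypothesis()
features = hypothesis.calculate_features_dataframe("A candidate answer to score.")
print(features.shape)  # (1, 25): 19 length-normalized features plus 6 raw-scale features
print(features.dtype)  # float32, passed as additional_features to PredictMainModel.predict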
core-model-prediction/main_model.py
CHANGED
@@ -5,31 +5,50 @@ import torch
 import numpy as np
 
 
-class 
-    def __init__(self, albert_model, num_additional_features=25,
-        self.
-        self.
-            labels = labels.unsqueeze(1)
-            loss = loss_fn(logits, labels.float())
-            return logits, loss
-        else:
-            return logits
+class AlbertSeparateTransformation(nn.Module):
+    def __init__(self, albert_model, num_additional_features=25,
+                 hidden_size_albert=512, hidden_size_additional=128, classifier_hidden_size=256,
+                 dropout_rate_albert=0.3, dropout_rate_additional=0.1, dropout_rate_classifier=0.1):
+        super(AlbertSeparateTransformation, self).__init__()
+        self.albert = albert_model
+
+        # Transform ALBERT's features to an intermediate space
+        self.albert_feature_transform = nn.Sequential(
+            nn.Linear(1024, hidden_size_albert),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate_albert),
+        )
+
+        # Transform additional features to an intermediate space
+        self.additional_feature_transform = nn.Sequential(
+            nn.Linear(num_additional_features, hidden_size_additional),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate_additional),
+        )
+
+        # Combine both transformed features and process for final prediction
+        self.classifier = nn.Sequential(
+            nn.Linear(hidden_size_albert + hidden_size_additional,
+                      classifier_hidden_size),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate_classifier),
+            nn.Linear(classifier_hidden_size, 1)
+        )
+
+    def forward(self, input_ids, attention_mask, additional_features):
+        albert_output = self.albert(
+            input_ids=input_ids, attention_mask=attention_mask).pooler_output
+
+        transformed_albert_features = self.albert_feature_transform(
+            albert_output)
+        transformed_additional_features = self.additional_feature_transform(
+            additional_features)
+
+        combined_features = torch.cat(
+            (transformed_albert_features, transformed_additional_features), dim=1)
+
+        logits = self.classifier(combined_features)
+        return logits
 
 
 class PredictMainModel:
@@ -47,10 +66,9 @@ class PredictMainModel:
         self.albert_model = AlbertModel.from_pretrained(self.model_name)
         self.device = DeviceManager()
 
-        self.model = 
+        self.model = AlbertSeparateTransformation(
             self.albert_model).to(self.device)
-
-        self.model.load_state_dict(torch.load("models/albert_model.pth"))
+        self.model.load_state_dict(torch.load("models/albert_weights.pth"))
 
     def preprocess_input(self, text: str, additional_features: np.ndarray):
         encoding = self.tokenizer.encode_plus(
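The new AlbertSeparateTransformation head runs the pooled ALBERT output and the 25 handcrafted features through separate branches before fusing them into a single logit. A shape-level sketch with dummy tensors, using the defaults hard-coded in the diff (the 1024-dim input suggests an albert-large checkpoint); this illustrates the wiring only, not the module itself:

import torch
import torch.nn as nn

# Stand-ins for the two branches and the classifier, mirroring the diff's default sizes
albert_branch = nn.Sequential(nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(0.3))
extra_branch = nn.Sequential(nn.Linear(25, 128), nn.ReLU(), nn.Dropout(0.1))
classifier = nn.Sequential(nn.Linear(512 + 128, 256), nn.ReLU(), nn.Dropout(0.1), nn.Linear(256, 1))

pooled = torch.randn(4, 1024)  # stand-in for albert(...).pooler_output
extra = torch.randn(4, 25)     # stand-in for the scaled additional features
logits = classifier(torch.cat((albert_branch(pooled), extra_branch(extra)), dim=1))
print(logits.shape)            # torch.Size([4, 1]), one logit per sample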
core-model-prediction/models/{albert_model.pth → albert_weights.pth}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size 
+oid sha256:59184c88c7921ac5f115aa0b10b3224536b5f7d7ebb6cf07fd45eecccfcff3ae
+size 73519347
core-model-prediction/models/random_forest.joblib
CHANGED
Binary files a/core-model-prediction/models/random_forest.joblib and b/core-model-prediction/models/random_forest.joblib differ
core-model-prediction/prediction.py
CHANGED
@@ -40,29 +40,27 @@ def process_instance(data: PredictRequest):
     typing_duration = data.typing_duration
     letter_click_counts = data.letter_click_counts
 
+    # Data preparation for 1st model
     hypothesis = BaseModelHypothesis()
-        answer)
-    features_not_normalized = hypothesis.calculate_not_normalized_features(
-        answer)
-
-    combined_additional_features = np.concatenate(
-        (features_normalized_text_length, features_not_normalized), axis=1)
+    additional_features = hypothesis.calculate_features_dataframe(answer)
 
+    # 1st model prediction
     main_model = PredictMainModel()
     main_model_probability = main_model.predict(
-        answer, 
+        answer, additional_features)
 
+    # Data preparation for 2nd model
     random_forest_features = RandomForestDependencies()
     secondary_model_features = random_forest_features.calculate_features(
+        answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
 
+    # 2nd model prediction
     secondary_model = RandomForestModel()
     secondary_model_prediction = secondary_model.predict(
         secondary_model_features)
 
     return {
-        "
+        "predicted_class": "AI" if secondary_model_prediction == 1 else "HUMAN",
         "details": {
             "main_model_probability": str(main_model_probability),
             "final_prediction": secondary_model_prediction
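For reference, the attributes process_instance reads from PredictRequest in and around this hunk suggest a request body along these lines (hypothetical values; fields not read in this hunk, such as any remaining question field, may differ in the actual schema):

payload = {
    "answer": "The candidate's typed answer...",  # scored by both models
    "backspace_count": 12,                        # normalized by answer length
    "typing_duration": 95,                        # also normalized by answer length
    "letter_click_counts": {"a": 14, "e": 20},    # per-letter keystroke counts
}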
core-model-prediction/random_forest_dependencies.py
CHANGED
@@ -3,19 +3,14 @@ from collections import Counter
 
 
 class RandomForestDependencies:
-    def 
-        self.gemma2bdependencies = Gemma2BDependencies()
-
-    def calculate_features(self, question: str, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
-        cosine_similarity = self.gemma2bdependencies.calculate_cosine_similarity(
-            question, answer)
+    def calculate_features(self, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
         backspace_count_normalized = backspace_count / len(answer)
         typing_duration_normalized = typing_duration / len(answer)
         letter_discrepancy = self.calculate_letter_discrepancy(
             answer, letter_click_counts)
 
         return [
+            probability, backspace_count_normalized,
             typing_duration_normalized, letter_discrepancy
         ]
 
core-model-prediction/random_forest_model.py
CHANGED
@@ -1,5 +1,6 @@
 import joblib
 import numpy as np
+import pandas as pd
 from typing import List
 
 
@@ -7,9 +8,15 @@ class RandomForestModel:
     def __init__(self):
         self.scaler = joblib.load("scalers/rf_scaler.joblib")
         self.model = joblib.load("models/random_forest.joblib")
+        self.secondary_model_features = [
+            "machine_probability", "backspace_count_normalized", "typing_duration_normalized", "letter_discrepancy_normalized"
+        ]
 
     def preprocess_input(self, secondary_model_features: List[float]) -> np.ndarray:
+        features_df = pd.DataFrame([secondary_model_features], columns=[
+            self.secondary_model_features])
+        features_df = self.scaler.transform(features_df)
+        return features_df.values.astype(np.float32).reshape(1, -1)
 
     def predict(self, secondary_model_features: List[float]):
         return int(self.model.predict(self.preprocess_input(secondary_model_features))[0])
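A hedged usage sketch of the secondary model after this change, with made-up feature values in the order RandomForestDependencies.calculate_features now returns them (probability, normalized backspace count, normalized typing duration, letter discrepancy):

model = RandomForestModel()
secondary_model_features = [0.87, 0.02, 0.45, 0.01]  # illustrative values only
prediction = model.predict(secondary_model_features)
print(prediction)  # 1 maps to "AI" and 0 to "HUMAN" in prediction.py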
core-model-prediction/requirements.txt
CHANGED
@@ -2,8 +2,8 @@ nltk
 vaderSentiment
 pandas
 textstat
-scikit-learn==1.
-transformers
+scikit-learn==1.2.2
+transformers==4.38.2
 fastapi
 uvicorn
 google-cloud-secret-manager
core-model-prediction/scalers/rf_scaler.joblib
CHANGED
Binary files a/core-model-prediction/scalers/rf_scaler.joblib and b/core-model-prediction/scalers/rf_scaler.joblib differ
core-model-prediction/scalers/{scaler-normalized-text-length.joblib → torch-scaler-normalized-text-length.joblib}
RENAMED
Binary files a/core-model-prediction/scalers/scaler-normalized-text-length.joblib and b/core-model-prediction/scalers/torch-scaler-normalized-text-length.joblib differ
core-model-prediction/scalers/{scaler-not-normalized.joblib → torch-scaler-not-normalized.joblib}
RENAMED
Binary files a/core-model-prediction/scalers/scaler-not-normalized.joblib and b/core-model-prediction/scalers/torch-scaler-not-normalized.joblib differ