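# Provenance (file-viewer residue kept as comments so the module parses):
#   author:  bearking58
#   commit:  df00cec - "fix: feature ordering"
#   viewer links: raw / history / blame
#   size:    8.09 kB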
import nltk
import joblib
import textstat
import pandas as pd
import numpy as np
from typing import List
from collections import defaultdict
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gemma2b_dependencies import Gemma2BDependencies
from string import punctuation
import os
import zipfile
class BaseModelHypothesis:
    """Compute hand-crafted linguistic features for a single text.

    The features are part-of-speech ratios, emotion-lexicon proportions,
    a unique-word ratio, a VADER compound sentiment score, three textstat
    readability scores, and perplexity/burstiness obtained from
    ``Gemma2BDependencies``. They are scaled with two pre-fitted scalers and
    returned in the column order given by ``additional_feature_columns``.

    NOTE(review): the NLTK calls below presumably need the tokenizer, tagger
    and WordNet resources to be installed already - nothing in this class
    downloads them; confirm against the deployment setup.
    """

    # Location of the emotion lexicon CSV; must expose "word" and "emotion"
    # columns (those are the only columns read here).
    LEXICON_URL = "https://storage.googleapis.com/interview-ai-detector/higher-accuracy-final-model/NRC-Emotion-Lexicon.csv"

    # Pre-fitted scaler artefacts, resolved relative to the working directory.
    SCALER_NORMALIZED_TEXT_LENGTH_PATH = "scalers/scaler-normalized-text-length.joblib"
    SCALER_NOT_NORMALIZED_PATH = "scalers/scaler-not-normalized.joblib"

    # Penn Treebank tags whose per-token ratio is reported, in output order.
    POS_TAGS = ("NN", "NNS", "JJ", "IN", "DT", "VB", "PRP", "RB")

    # Emotion labels whose proportion is reported, in output order.
    EMOTIONS = (
        "negative", "positive", "fear", "anger", "trust",
        "sadness", "disgust", "anticipation", "joy", "surprise",
    )

    # First two characters of a Penn tag -> WordNet POS letter. Every prefix
    # that is not listed lemmatizes as a noun ('n').
    _PENN_PREFIX_TO_WORDNET = {"JJ": "a", "VB": "v", "RB": "r", "MD": "v"}

    def __init__(self):
        """Load the sentiment analyzer, lexicon, lemmatizer, LM helper and scalers.

        Construction performs I/O: the lexicon CSV is read from
        ``LEXICON_URL`` and both scalers are loaded from disk.
        """
        self.analyzer = SentimentIntensityAnalyzer()
        self.lexicon_df = pd.read_csv(self.LEXICON_URL)
        self.emotion_lexicon = self.process_emotion_lexicon()
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.gemma2bdependencies = Gemma2BDependencies()

        # Column order of the array returned by calculate_features_dataframe.
        self.additional_feature_columns = [
            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
            "surprise_emotion_proportions", "unique_words_ratio", "perplexity", "burstiness",
        ]
        # Columns handled by scaler_normalized_text_length; the order matches
        # the list built by calculate_normalized_text_length_features.
        self.features_normalized_text_length = [
            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
            "surprise_emotion_proportions", "unique_words_ratio",
        ]
        # Columns handled by scaler_not_normalized; the order matches the list
        # built by calculate_not_normalized_features.
        self.features_not_normalized = [
            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
            "perplexity", "burstiness",
        ]

        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code - only ever point these paths at trusted artefacts.
        self.scaler_normalized_text_length = joblib.load(
            self.SCALER_NORMALIZED_TEXT_LENGTH_PATH)
        self.scaler_not_normalized = joblib.load(
            self.SCALER_NOT_NORMALIZED_PATH)

    @staticmethod
    def _safe_ratio(count, total):
        """Return ``count / total``, or 0.0 when ``total`` is zero."""
        return count / total if total else 0.0

    def process_emotion_lexicon(self):
        """Group the lexicon rows into a ``{word: [emotion, ...]}`` mapping.

        Returns:
            A plain dict mapping each value of the "word" column of
            ``self.lexicon_df`` to the list of "emotion" values found on its
            rows, in row order.
        """
        lexicon = defaultdict(list)
        # Iterating the two columns directly avoids building a Series per row.
        for word, emotion in zip(self.lexicon_df["word"], self.lexicon_df["emotion"]):
            lexicon[word].append(emotion)
        return dict(lexicon)

    def calculate_features_dataframe(self, text: str) -> np.ndarray:
        """Compute, scale and order every feature for ``text``.

        Args:
            text: The text to analyse.

        Returns:
            A float32 array of shape ``(1, len(additional_feature_columns))``
            whose columns follow ``self.additional_feature_columns``.
        """
        all_features = (
            self.calculate_normalized_text_length_features(text)
            + self.calculate_not_normalized_features(text)
        )
        # The raw values arrive grouped by scaler; label them from the same
        # two lists used for scaling so names and values cannot drift apart.
        raw_columns = self.features_normalized_text_length + self.features_not_normalized
        features_df = pd.DataFrame([all_features], columns=raw_columns)

        # Scaling features
        features_df[self.features_normalized_text_length] = self.scaler_normalized_text_length.transform(
            features_df[self.features_normalized_text_length])
        features_df[self.features_not_normalized] = self.scaler_not_normalized.transform(
            features_df[self.features_not_normalized])

        # Re-order from "grouped by scaler" to the published column order.
        ordered_df = features_df[self.additional_feature_columns]
        return ordered_df.to_numpy(dtype=np.float32).reshape(1, -1)

    def calculate_normalized_text_length_features(self, text: str) -> List[float]:
        """Return the features expressed as a fraction of the token count.

        The result is the 8 POS ratios, then the 10 emotion proportions, then
        the unique-word ratio - the order of
        ``self.features_normalized_text_length``.
        """
        return (
            self.extract_pos_features(text)
            + self.calculate_emotion_proportions(text)
            + [self.measure_unique_word_ratio(text)]
        )

    def calculate_not_normalized_features(self, text: str) -> List[float]:
        """Return the features that are not divided by a token count.

        The result is the compound sentiment score, the three readability
        scores, perplexity and burstiness - the order of
        ``self.features_not_normalized``.
        """
        return [
            self.measure_sentiment_intensity(text),
            *self.measure_readability(text),
            self.gemma2bdependencies.calculate_perplexity(text),
            self.gemma2bdependencies.calculate_burstiness(text),
        ]

    def extract_pos_features(self, text: str):
        """Return the share of tokens carrying each tag in ``POS_TAGS``.

        Matching is exact: a token tagged ``NNS`` counts towards ``NNS`` only,
        not ``NN``. Every ratio is 0.0 when ``text`` yields no tokens.
        """
        words = nltk.word_tokenize(text)
        pos_counts = dict.fromkeys(self.POS_TAGS, 0)
        for _, pos in nltk.pos_tag(words):
            if pos in pos_counts:
                pos_counts[pos] += 1
        total_words = len(words)
        return [self._safe_ratio(pos_counts[tag], total_words) for tag in self.POS_TAGS]

    def measure_sentiment_intensity(self, text: str):
        """Return the "compound" entry of the analyzer's polarity scores."""
        return self.analyzer.polarity_scores(text)["compound"]

    def measure_readability(self, text: str):
        """Return ``[gunning_fog, smog_index, dale_chall_score]`` from textstat."""
        return [
            textstat.gunning_fog(text),
            textstat.smog_index(text),
            textstat.dale_chall_readability_score(text),
        ]

    def __penn2morphy(self, penntag):
        """Map a Penn Treebank tag to the POS letter passed to the lemmatizer.

        Only the first two characters of ``penntag`` are inspected: ``JJ*``
        gives 'a', ``VB*`` and ``MD`` give 'v', ``RB*`` gives 'r', and
        anything else (including punctuation tags) gives 'n'.
        """
        return self._PENN_PREFIX_TO_WORDNET.get(penntag[:2], 'n')

    def calculate_emotion_proportions(self, text: str):
        """Return, per label in ``EMOTIONS``, lexicon hits divided by lemma count.

        Tokens are lower-cased and lemmatized using their POS tag before being
        looked up in ``self.emotion_lexicon``. One lemma may contribute to
        several emotions. Every proportion is 0.0 when ``text`` yields no
        tokens.

        Raises:
            KeyError: If the lexicon lists an emotion outside ``EMOTIONS``.
        """
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
        lemmas = [
            self.lemmatizer.lemmatize(token.lower(), pos=self.__penn2morphy(tag))
            for token, tag in tagged_tokens
        ]

        emotion_counts = dict.fromkeys(self.EMOTIONS, 0)
        for lemma in lemmas:
            for emotion in self.emotion_lexicon.get(lemma, ()):
                emotion_counts[emotion] += 1

        total_lemmas = len(lemmas)
        return [
            self._safe_ratio(emotion_counts[emotion], total_lemmas)
            for emotion in self.EMOTIONS
        ]

    def measure_unique_word_ratio(self, text: str):
        """Return distinct tokens divided by total tokens, ignoring punctuation.

        The text is lower-cased before tokenizing. Returns 0.0 when no tokens
        remain after the punctuation filter.
        """
        # `punctuation` is a str, so this is a substring test: it drops single
        # punctuation characters (and any token that happens to be a contiguous
        # run of string.punctuation). Kept as-is so feature values do not shift.
        tokens = [
            token for token in nltk.word_tokenize(text.lower())
            if token not in punctuation
        ]
        return self._safe_ratio(len(set(tokens)), len(tokens))