from string import punctuation
from typing import List

import joblib
import nltk
import numpy as np
import pandas as pd
import textstat
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from gemma2b_dependencies import Gemma2BDependencies


class BaseModelHypothesis:
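    """Hand-crafted text feature extractor.

    Combines POS-tag ratios, NRC emotion-lexicon proportions, VADER
    sentiment, textstat readability indices, and perplexity/burstiness
    from a Gemma-2B helper, then scales each feature group with
    pre-fitted scalers loaded from disk.
    """
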
    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()
        self.lexicon_df = pd.read_csv("NRC-Emotion-Lexicon.csv")
        self.emotion_lexicon = self.process_emotion_lexicon()
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.gemma2bdependencies = Gemma2BDependencies()

        self.additional_feature_columns = [
            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
            "surprise_emotion_proportions", "unique_words_ratio", "perplexity", "burstiness"
        ]
        self.features_normalized_text_length = [
            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
            "surprise_emotion_proportions", "unique_words_ratio"
        ]
        self.features_not_normalized = [
            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
            "perplexity", "burstiness"
        ]

        self.scaler_normalized_text_length = joblib.load(
            "scalers/scaler-normalized-text-length.joblib")
        self.scaler_not_normalized = joblib.load(
            "scalers/scaler-not-normalized.joblib")
    def process_emotion_lexicon(self):
        emotion_lexicon = {}
        for _, row in self.lexicon_df.iterrows():
            emotion_lexicon.setdefault(row["word"], []).append(row["emotion"])
        return emotion_lexicon
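
    # Assembles all 25 features for one text, scales each group with its
    # pre-fitted scaler, and returns a (1, 25) float32 array in the column
    # order expected downstream.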
    def calculate_features_dataframe(self, text: str) -> np.ndarray:
        normalized_text_length_features = self.calculate_normalized_text_length_features(text)
        not_normalized_features = self.calculate_not_normalized_features(text)
        all_features = normalized_text_length_features + not_normalized_features

        # The raw feature order is the length-normalized group followed by
        # the non-normalized group, so the column labels can be reused.
        features_df = pd.DataFrame(
            [all_features],
            columns=self.features_normalized_text_length + self.features_not_normalized)

        # Scale each feature group with its own pre-fitted scaler.
        features_df[self.features_normalized_text_length] = self.scaler_normalized_text_length.transform(
            features_df[self.features_normalized_text_length])
        features_df[self.features_not_normalized] = self.scaler_not_normalized.transform(
            features_df[self.features_not_normalized])

        # Reorder columns to match the order used during training.
        ordered_df = features_df[self.additional_feature_columns]
        return ordered_df.values.astype(np.float32).reshape(1, -1)
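
    # Length-normalized features: POS-tag ratios, emotion proportions, and
    # the unique-word ratio.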
    def calculate_normalized_text_length_features(self, text: str) -> List[float]:
        pos_features = self.extract_pos_features(text)
        emotion_features = self.calculate_emotion_proportions(text)
        unique_word_ratio = [self.measure_unique_word_ratio(text)]
        return pos_features + emotion_features + unique_word_ratio
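
    # Length-independent features: sentiment, readability, perplexity,
    # and burstiness.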
    def calculate_not_normalized_features(self, text: str) -> List[float]:
        sentiment_intensity = [self.measure_sentiment_intensity(text)]
        readability_scores = self.measure_readability(text)
        perplexity = [self.gemma2bdependencies.calculate_perplexity(text)]
        burstiness = [self.gemma2bdependencies.calculate_burstiness(text)]
        return sentiment_intensity + readability_scores + perplexity + burstiness
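
    # Frequency of eight selected POS tags relative to the total token count.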
    def extract_pos_features(self, text: str) -> List[float]:
        words = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(words)
        desired_tags = ["NN", "NNS", "JJ", "IN", "DT", "VB", "PRP", "RB"]
        pos_counts = {tag: 0 for tag in desired_tags}
        for _, pos in pos_tags:
            if pos in pos_counts:
                pos_counts[pos] += 1
        # Guard against empty input to avoid division by zero.
        total_words = len(words) or 1
        pos_ratios = [pos_counts[tag] / total_words for tag in desired_tags]
        return pos_ratios
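
    # VADER compound sentiment score in [-1, 1].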
    def measure_sentiment_intensity(self, text: str) -> float:
        sentiment = self.analyzer.polarity_scores(text)
        return sentiment["compound"]
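
    # Gunning Fog, SMOG, and Dale-Chall readability scores via textstat.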
    def measure_readability(self, text: str) -> List[float]:
        gunning_fog = textstat.gunning_fog(text)
        smog_index = textstat.smog_index(text)
        dale_chall_score = textstat.dale_chall_readability_score(text)
        return [gunning_fog, smog_index, dale_chall_score]
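
    # Maps a Penn Treebank tag (matched on its first two characters) to the
    # WordNet POS constant expected by WordNetLemmatizer.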
    def __penn2morphy(self, penntag: str) -> str:
        morphy_tag = {
            'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',  # Nouns
            'JJ': 'a', 'JJR': 'a', 'JJS': 'a',  # Adjectives
            'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',  # Verbs
            'RB': 'r', 'RBR': 'r', 'RBS': 'r',  # Adverbs
            # Pronouns, determiners, prepositions, modal verbs
            'PRP': 'n', 'PRP$': 'n', 'DT': 'n', 'IN': 'n', 'MD': 'v',
            # Others, treated as nouns unless a better fit is found
            'CC': 'n', 'CD': 'n', 'EX': 'n', 'FW': 'n', 'POS': 'n', 'TO': 'n',
            'WDT': 'n', 'WP': 'n', 'WP$': 'n', 'WRB': 'n', 'PDT': 'n'
        }
        return morphy_tag.get(penntag[:2], 'n')
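
    # Share of lemmas tagged with each of the ten NRC emotion categories.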
    def calculate_emotion_proportions(self, text: str) -> List[float]:
        tokens = nltk.word_tokenize(text)
        tagged_tokens = nltk.pos_tag(tokens)
        lemmas = [
            self.lemmatizer.lemmatize(token.lower(), pos=self.__penn2morphy(tag))
            for token, tag in tagged_tokens
        ]
        # Guard against empty input to avoid division by zero.
        total_lemmas = len(lemmas) or 1

        emotions = ["negative", "positive", "fear", "anger", "trust",
                    "sadness", "disgust", "anticipation", "joy", "surprise"]
        emotion_counts = {emotion: 0 for emotion in emotions}
        for lemma in lemmas:
            for emotion in self.emotion_lexicon.get(lemma, []):
                emotion_counts[emotion] += 1

        return [emotion_counts[emotion] / total_lemmas for emotion in emotions]
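
    # Type-token ratio: unique tokens over total tokens, punctuation excluded.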
    def measure_unique_word_ratio(self, text: str) -> float:
        tokens = nltk.word_tokenize(text.lower())
        tokens = [token for token in tokens if token not in punctuation]
        # Guard against empty input to avoid division by zero.
        total_words = len(tokens) or 1
        unique_words = len(set(tokens))
        return unique_words / total_words
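

# Minimal usage sketch (illustrative, not part of the original module). It
# assumes NRC-Emotion-Lexicon.csv, the scalers/ directory, the required NLTK
# data (punkt, averaged_perceptron_tagger, wordnet), and the local
# gemma2b_dependencies module are all available in the working environment.
if __name__ == "__main__":
    hypothesis = BaseModelHypothesis()
    sample_text = "The quick brown fox jumps over the lazy dog."
    features = hypothesis.calculate_features_dataframe(sample_text)
    print(features.shape)  # expected: (1, 25)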