import nltk
import joblib
import textstat
import pandas as pd
import numpy as np
from typing import List
from collections import defaultdict
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gemma2b_dependencies import Gemma2BDependencies
from string import punctuation
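
# NOTE: nltk.word_tokenize, nltk.pos_tag, and WordNetLemmatizer below rely on
# NLTK corpora that ship separately; fetch them once before first use:
#   nltk.download("punkt")
#   nltk.download("averaged_perceptron_tagger")
#   nltk.download("wordnet")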


class BaseModelHypothesis:
    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()
        self.lexicon_df = pd.read_csv("NRC-Emotion-Lexicon.csv")
        self.emotion_lexicon = self.process_emotion_lexicon()
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.gemma2bdependencies = Gemma2BDependencies()

        self.additional_feature_columns = [
            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
            "surprise_emotion_proportions", "unique_words_ratio", "perplexity", "burstiness"
        ]

        self.features_normalized_text_length = [
            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
            "surprise_emotion_proportions", "unique_words_ratio"
        ]

        self.features_not_normalized = [
            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
            "perplexity", "burstiness"
        ]

        self.scaler_normalized_text_length = joblib.load(
            "scalers/scaler-normalized-text-length.joblib")
        self.scaler_not_normalized = joblib.load(
            "scalers/scaler-not-normalized.joblib")

    def process_emotion_lexicon(self):
        # Build a word -> [emotions] mapping from the NRC lexicon dataframe.
        emotion_lexicon = defaultdict(list)
        for _, row in self.lexicon_df.iterrows():
            emotion_lexicon[row["word"]].append(row["emotion"])
        return emotion_lexicon

    def calculate_features_dataframe(self, text: str) -> np.ndarray:
        normalized_text_length_features = self.calculate_normalized_text_length_features(text)
        not_normalized_features = self.calculate_not_normalized_features(text)
        all_features = normalized_text_length_features + not_normalized_features
        features_df = pd.DataFrame(
            [all_features],
            columns=self.features_normalized_text_length + self.features_not_normalized)

        # Scale each feature group with the scaler it was fitted on.
        features_df[self.features_normalized_text_length] = self.scaler_normalized_text_length.transform(
            features_df[self.features_normalized_text_length])
        features_df[self.features_not_normalized] = self.scaler_not_normalized.transform(
            features_df[self.features_not_normalized])

        # Reorder columns to the order the downstream model expects.
        ordered_df = features_df[self.additional_feature_columns]

        return ordered_df.values.astype(np.float32).reshape(1, -1)

    def calculate_normalized_text_length_features(self, text: str) -> List[float]:
        pos_features = self.extract_pos_features(text)
        emotion_features = self.calculate_emotion_proportions(text)
        unique_word_ratio = [self.measure_unique_word_ratio(text)]
        features = pos_features + emotion_features + unique_word_ratio
        return features

    def calculate_not_normalized_features(self, text: str) -> List[float]:
        sentiment_intensity = [self.measure_sentiment_intensity(text)]
        readability_scores = self.measure_readability(text)
        perplexity = [self.gemma2bdependencies.calculate_perplexity(text)]
        burstiness = [self.gemma2bdependencies.calculate_burstiness(text)]
        features = sentiment_intensity + readability_scores + perplexity + burstiness
        return features

    def extract_pos_features(self, text: str):
        words = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(words)
        desired_tags = ["NN", "NNS", "JJ", "IN", "DT", "VB", "PRP", "RB"]
        pos_counts = {tag: 0 for tag in desired_tags}

        for _, pos in pos_tags:
            if pos in pos_counts:
                pos_counts[pos] += 1

        # Guard against empty input to avoid division by zero.
        total_words = max(len(words), 1)
        return [pos_counts[tag] / total_words for tag in desired_tags]

    def measure_sentiment_intensity(self, text: str):
        sentiment = self.analyzer.polarity_scores(text)
        return sentiment["compound"]

    def measure_readability(self, text: str):
        gunning_fog = textstat.gunning_fog(text)
        smog_index = textstat.smog_index(text)
        dale_chall_score = textstat.dale_chall_readability_score(text)

        return [gunning_fog, smog_index, dale_chall_score]

    def __penn2morphy(self, penntag):
        # Map a Penn Treebank tag to a WordNet POS for the lemmatizer, keyed
        # on the first two characters (so e.g. VBD, VBG, VBZ all match "VB");
        # anything unmapped is treated as a noun.
        morphy_tag = {
            "NN": "n",  # nouns
            "JJ": "a",  # adjectives
            "VB": "v",  # verbs
            "RB": "r",  # adverbs
            "MD": "v",  # modal verbs
        }
        return morphy_tag.get(penntag[:2], "n")

    def calculate_emotion_proportions(self, text: str):
        tokens = nltk.word_tokenize(text)
        tagged_tokens = nltk.pos_tag(tokens)

        lemmas = [self.lemmatizer.lemmatize(token.lower(), pos=self.__penn2morphy(tag))
                  for token, tag in tagged_tokens]

        # Guard against empty input to avoid division by zero.
        total_lemmas = max(len(lemmas), 1)

        # Order matters: it must match the emotion columns declared in __init__.
        emotions = ["negative", "positive", "fear", "anger", "trust",
                    "sadness", "disgust", "anticipation", "joy", "surprise"]
        emotion_counts = {emotion: 0 for emotion in emotions}

        for lemma in lemmas:
            for emotion in self.emotion_lexicon.get(lemma, []):
                emotion_counts[emotion] += 1

        return [emotion_counts[emotion] / total_lemmas for emotion in emotions]

    def measure_unique_word_ratio(self, text: str):
        tokens = nltk.word_tokenize(text.lower())

        # Drop punctuation tokens before counting.
        tokens = [token for token in tokens if token not in punctuation]

        # Guard against empty input to avoid division by zero.
        total_words = max(len(tokens), 1)
        return len(set(tokens)) / total_words
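

if __name__ == "__main__":
    # Minimal usage sketch; assumes NRC-Emotion-Lexicon.csv, the fitted
    # scalers under scalers/, and a working Gemma2BDependencies module are
    # all available locally. The sample text is illustrative only.
    hypothesis = BaseModelHypothesis()
    features = hypothesis.calculate_features_dataframe(
        "The quick brown fox jumps over the lazy dog.")
    print(features.shape)  # (1, 25): one row of 25 scaled features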