import nltk
import joblib
import textstat
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gemma2b_dependencies import Gemma2BDependencies


class BaseModelHypothesis:
    """Extract and scale hand-crafted text features.

    Two feature groups are produced, each passed through its own scaler
    loaded from ``scalers/``:

    * "normalized text length" features: POS-tag ratios, emotion
      proportions and the unique-word ratio (all divided by token count);
    * "not normalized" features: sentiment compound score, three
      readability scores, and the perplexity and burstiness values
      returned by ``Gemma2BDependencies``.

    The most recently computed raw (unscaled) feature lists are kept on
    ``features_normalized_text_length`` and ``features_not_normalized``.
    """

    # POS tags that are counted; the order here is the feature order.
    # Matching is exact, so e.g. "VBD" is not counted under "VB".
    _POS_TAGS = ("JJ", "VB", "RB", "PRP", "DT", "IN", "NN", "NNS")

    # Emotion categories; the order here is the feature order.
    _EMOTIONS = (
        "negative", "positive", "fear", "anger", "trust",
        "sadness", "disgust", "anticipation", "joy", "surprise",
    )

    def __init__(self):
        """Download NLTK resources and load the lexicon, model helper and scalers."""
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')

        self.analyzer = SentimentIntensityAnalyzer()
        self.lexicon_df = pd.read_csv(
            "https://storage.googleapis.com/ta-ai-detector/datasets/NRC-Emotion-Lexicon.csv")
        self.emotion_lexicon = self.process_emotion_lexicon()
        self.gemma2bdependencies = Gemma2BDependencies()

        self.features_normalized_text_length = []
        self.features_not_normalized = []

        # NOTE(review): presumably fitted scikit-learn scalers; only their
        # ``transform`` method is used here - confirm.
        self.scaler_normalized_text_length = joblib.load(
            "scalers/scaler-normalized-text-length.joblib")
        self.scaler_not_normalized = joblib.load(
            "scalers/scaler-not-normalized.joblib")

    @staticmethod
    def _tokenize(text: str):
        """Return the NLTK word tokens of ``text`` (an empty list for blank text)."""
        # Blank text has no tokens, so the tokenizer call is skipped.
        if not text.strip():
            return []
        return nltk.word_tokenize(text)

    def process_emotion_lexicon(self):
        """Build a ``word -> [emotion, ...]`` mapping from ``self.lexicon_df``.

        Reads the ``word`` and ``emotion`` columns; emotions are listed in
        row order, and a word appearing on several rows collects one entry
        per row.
        """
        emotion_lexicon = defaultdict(list)
        # Zipping the two columns avoids building a Series per row, which is
        # what ``iterrows`` does.
        for word, emotion in zip(self.lexicon_df["word"],
                                 self.lexicon_df["emotion"]):
            emotion_lexicon[word].append(emotion)
        # Return a plain dict so later lookups cannot insert keys by accident.
        return dict(emotion_lexicon)

    def calculate_normalized_text_length_features(self, text: str) -> np.ndarray:
        """Compute and scale the token-count-normalized features of ``text``.

        The raw feature list (POS ratios, emotion proportions, unique-word
        ratio, in that order) is stored on
        ``self.features_normalized_text_length`` and then passed, as a
        ``(1, n_features)`` float32 array, to the scaler's ``transform``.
        """
        features = list(self.extract_pos_features(text))
        features.extend(self.calculate_emotion_proportions(text))
        features.append(self.measure_unique_word_ratio(text))
        self.features_normalized_text_length = features

        return self.scaler_normalized_text_length.transform(
            np.array(features).astype(np.float32).reshape(1, -1))

    def calculate_not_normalized_features(self, text: str) -> np.ndarray:
        """Compute and scale the features of ``text`` not divided by token count.

        The raw feature list (sentiment compound, Gunning fog, SMOG,
        Dale-Chall, perplexity, burstiness, in that order) is stored on
        ``self.features_not_normalized`` and then passed, as a
        ``(1, n_features)`` float32 array, to the scaler's ``transform``.
        """
        # Start from a fresh list on every call: appending to the attribute
        # initialised in ``__init__`` made the vector grow on each call to
        # the same instance.
        features = [self.measure_sentiment_intensity(text)]
        features.extend(self.measure_readability(text))
        features.append(self.gemma2bdependencies.calculate_perplexity(text))
        features.append(self.gemma2bdependencies.calculate_burstiness(text))
        self.features_not_normalized = features

        return self.scaler_not_normalized.transform(
            np.array(features).astype(np.float32).reshape(1, -1))

    def extract_pos_features(self, text: str):
        """Return the fraction of tokens carrying each tag in ``_POS_TAGS``.

        Returns a list of floats in ``_POS_TAGS`` order; all zeros when the
        text yields no tokens.
        """
        words = self._tokenize(text)
        if not words:
            # No tokens: avoid dividing by zero.
            return [0.0] * len(self._POS_TAGS)

        tag_counts = Counter(tag for _, tag in nltk.pos_tag(words))
        total_words = len(words)
        # Counter yields 0 for tags that never occurred.
        return [tag_counts[tag] / total_words for tag in self._POS_TAGS]

    def measure_sentiment_intensity(self, text: str):
        """Return the ``compound`` entry of the analyzer's polarity scores."""
        sentiment = self.analyzer.polarity_scores(text)
        return sentiment["compound"]

    def measure_readability(self, text: str):
        """Return ``[gunning_fog, smog_index, dale_chall]`` as computed by textstat."""
        gunning_fog = textstat.gunning_fog(text)
        smog_index = textstat.smog_index(text)
        dale_chall_score = textstat.dale_chall_readability_score(text)

        return [gunning_fog, smog_index, dale_chall_score]

    def calculate_emotion_proportions(self, text: str):
        """Return per-emotion hit counts divided by the token count.

        Each token found in ``self.emotion_lexicon`` adds one hit to every
        emotion listed for it. The lookup is case-sensitive (tokens are not
        lower-cased). Returns a list of floats in ``_EMOTIONS`` order; all
        zeros when the text yields no tokens.
        """
        tokens = self._tokenize(text)
        if not tokens:
            # No tokens: avoid dividing by zero.
            return [0.0] * len(self._EMOTIONS)

        emotion_counts = dict.fromkeys(self._EMOTIONS, 0)
        for token in tokens:
            for emotion in self.emotion_lexicon.get(token, ()):
                # Lexicon labels outside the known categories have no
                # feature slot, so they are skipped instead of raising
                # KeyError.
                if emotion in emotion_counts:
                    emotion_counts[emotion] += 1

        total_tokens = len(tokens)
        return [emotion_counts[emotion] / total_tokens
                for emotion in self._EMOTIONS]

    def measure_unique_word_ratio(self, text: str):
        """Return distinct tokens divided by total tokens (0.0 if no tokens)."""
        tokens = self._tokenize(text)
        if not tokens:
            # No tokens: avoid dividing by zero.
            return 0.0

        return len(set(tokens)) / len(tokens)