Spaces:

panduwana
/

interview-ai-detector

Running

App Files Files Community

interview-ai-detector / hypothesis.py

panduwana

un-gcp-ize

933b7b6 3 months ago

raw

history blame contribute delete

8 kB

	import nltk
	import joblib
	import textstat
	import pandas as pd
	import numpy as np
	from typing import List
	from collections import defaultdict
	from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
	from gemma2b_dependencies import Gemma2BDependencies
	from string import punctuation
	import os
	import zipfile


	class BaseModelHypothesis:
	    """Compute the scaled hand-crafted feature vector for a piece of text.

	    The vector combines part-of-speech ratios, the VADER compound sentiment
	    score, textstat readability scores, emotion proportions looked up in the
	    loaded emotion lexicon, a unique-word ratio, and the perplexity and
	    burstiness values reported by ``Gemma2BDependencies``.

	    Constructing an instance reads ``NRC-Emotion-Lexicon.csv`` and the two
	    scaler files under ``scalers/`` through relative paths.
	    """

	    # Penn Treebank tags whose frequencies become the ``*_ratio`` features,
	    # listed in the order the ratios are emitted.
	    _POS_TAGS = ("NN", "NNS", "JJ", "IN", "DT", "VB", "PRP", "RB")

	    # Emotion labels, listed in the order their proportions are emitted.
	    _EMOTIONS = (
	        "negative", "positive", "fear", "anger", "trust",
	        "sadness", "disgust", "anticipation", "joy", "surprise",
	    )

	    # WordNet POS letter keyed by the first two characters of a Penn tag.
	    # Only two-character prefixes are ever looked up, and every prefix that
	    # is not listed here is lemmatized as a noun ('n').
	    _MORPHY_BY_PREFIX = {"JJ": "a", "VB": "v", "MD": "v", "RB": "r"}

	    def __init__(self):
	        """Load the sentiment analyzer, lexicon, lemmatizer, LM helper and scalers."""
	        self.analyzer = SentimentIntensityAnalyzer()
	        self.lexicon_df = pd.read_csv("NRC-Emotion-Lexicon.csv")
	        self.emotion_lexicon = self.process_emotion_lexicon()
	        self.lemmatizer = nltk.stem.WordNetLemmatizer()
	        self.gemma2bdependencies = Gemma2BDependencies()

	        # Column order of the array returned by calculate_features_dataframe.
	        self.additional_feature_columns = [
	            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
	            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
	            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
	            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
	            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
	            "surprise_emotion_proportions", "unique_words_ratio", "perplexity", "burstiness"
	        ]

	        # Features that are ratios over the token count; order matches the
	        # output of calculate_normalized_text_length_features.
	        self.features_normalized_text_length = [
	            "nn_ratio", "nns_ratio", "jj_ratio", "in_ratio", "dt_ratio", "vb_ratio", "prp_ratio", "rb_ratio",
	            "negative_emotion_proportions", "positive_emotion_proportions", "fear_emotion_proportions",
	            "anger_emotion_proportions", "trust_emotion_proportions", "sadness_emotion_proportions",
	            "disgust_emotion_proportions", "anticipation_emotion_proportions", "joy_emotion_proportions",
	            "surprise_emotion_proportions", "unique_words_ratio"
	        ]

	        # Remaining features; order matches the output of
	        # calculate_not_normalized_features.
	        self.features_not_normalized = [
	            "compound_score", "gunning_fog", "smog_index", "dale_chall_score",
	            "perplexity", "burstiness"
	        ]

	        self.scaler_normalized_text_length = joblib.load(
	            "scalers/scaler-normalized-text-length.joblib")
	        self.scaler_not_normalized = joblib.load(
	            "scalers/scaler-not-normalized.joblib")

	    @staticmethod
	    def _safe_ratio(count, total) -> float:
	        """Return ``count / total``, or 0.0 when ``total`` is zero.

	        Empty or punctuation-only text yields zero tokens; returning 0.0
	        keeps the feature vector well-defined instead of raising
	        ``ZeroDivisionError``.
	        """
	        return count / total if total else 0.0

	    def process_emotion_lexicon(self) -> dict:
	        """Group the lexicon rows into a ``word -> [emotion, ...]`` mapping.

	        Reads the ``word`` and ``emotion`` columns of ``self.lexicon_df``;
	        emotions are kept in row order and duplicates are not removed.
	        """
	        emotion_lexicon = defaultdict(list)
	        # Iterate the two columns directly rather than row by row; iterrows()
	        # builds a Series for every row, which is needlessly slow here.
	        for word, emotion in zip(self.lexicon_df["word"], self.lexicon_df["emotion"]):
	            emotion_lexicon[word].append(emotion)
	        return dict(emotion_lexicon)

	    def calculate_features_dataframe(self, text: str) -> np.ndarray:
	        """Return the scaled features for ``text`` as a ``(1, n)`` float32 array.

	        Columns follow ``self.additional_feature_columns``. Each feature group
	        is transformed by its own pre-fitted scaler before reordering.
	        """
	        all_features = (
	            self.calculate_normalized_text_length_features(text)
	            + self.calculate_not_normalized_features(text)
	        )
	        # The raw values arrive as "normalized" features followed by
	        # "not normalized" ones, so the column labels are the same two lists
	        # concatenated in that order.
	        columns = self.features_normalized_text_length + self.features_not_normalized
	        features_df = pd.DataFrame([all_features], columns=columns)

	        # Scaling features
	        features_df[self.features_normalized_text_length] = self.scaler_normalized_text_length.transform(
	            features_df[self.features_normalized_text_length])
	        features_df[self.features_not_normalized] = self.scaler_not_normalized.transform(
	            features_df[self.features_not_normalized])

	        ordered_df = features_df[self.additional_feature_columns]

	        return ordered_df.values.astype(np.float32).reshape(1, -1)

	    def calculate_normalized_text_length_features(self, text: str) -> List[float]:
	        """Return POS ratios, emotion proportions and the unique-word ratio, in that order."""
	        pos_features = self.extract_pos_features(text)
	        emotion_features = self.calculate_emotion_proportions(text)
	        unique_word_ratio = [self.measure_unique_word_ratio(text)]
	        return pos_features + emotion_features + unique_word_ratio

	    def calculate_not_normalized_features(self, text: str) -> List[float]:
	        """Return compound sentiment, readability scores, perplexity and burstiness, in that order."""
	        sentiment_intensity = [self.measure_sentiment_intensity(text)]
	        readability_scores = self.measure_readability(text)
	        perplexity = [self.gemma2bdependencies.calculate_perplexity(text)]
	        burstiness = [self.gemma2bdependencies.calculate_burstiness(text)]
	        return sentiment_intensity + readability_scores + perplexity + burstiness

	    def extract_pos_features(self, text: str) -> List[float]:
	        """Return the share of tokens carrying each tag in ``_POS_TAGS``.

	        The denominator is the number of tokens produced by
	        ``nltk.word_tokenize``. All ratios are 0.0 when there are no tokens.
	        """
	        words = nltk.word_tokenize(text)
	        pos_counts = dict.fromkeys(self._POS_TAGS, 0)

	        for _, pos in nltk.pos_tag(words):
	            if pos in pos_counts:
	                pos_counts[pos] += 1

	        total_words = len(words)
	        return [self._safe_ratio(pos_counts[tag], total_words) for tag in self._POS_TAGS]

	    def measure_sentiment_intensity(self, text: str) -> float:
	        """Return the ``compound`` entry of the analyzer's polarity scores."""
	        sentiment = self.analyzer.polarity_scores(text)
	        return sentiment["compound"]

	    def measure_readability(self, text: str) -> List[float]:
	        """Return ``[gunning_fog, smog_index, dale_chall_score]`` from textstat."""
	        gunning_fog = textstat.gunning_fog(text)
	        smog_index = textstat.smog_index(text)
	        dale_chall_score = textstat.dale_chall_readability_score(text)

	        return [gunning_fog, smog_index, dale_chall_score]

	    def __penn2morphy(self, penntag):
	        """Map a Penn Treebank tag to a WordNet POS letter.

	        Only the first two characters of the tag are inspected: adjectives
	        give 'a', verbs and modals 'v', adverbs 'r', and everything else 'n'.
	        """
	        return self._MORPHY_BY_PREFIX.get(penntag[:2], 'n')

	    def calculate_emotion_proportions(self, text: str) -> List[float]:
	        """Return, per emotion in ``_EMOTIONS``, lexicon hits divided by lemma count.

	        Tokens are lower-cased and lemmatized with a POS hint before the
	        lexicon lookup. A lemma listed under several emotions counts once for
	        each. All proportions are 0.0 when the text yields no tokens.
	        """
	        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

	        lemmas = [
	            self.lemmatizer.lemmatize(token.lower(), pos=self.__penn2morphy(tag))
	            for token, tag in tagged_tokens
	        ]

	        emotion_counts = dict.fromkeys(self._EMOTIONS, 0)
	        for lemma in lemmas:
	            for emotion in self.emotion_lexicon.get(lemma, ()):
	                emotion_counts[emotion] += 1

	        total_lemmas = len(lemmas)
	        return [
	            self._safe_ratio(emotion_counts[emotion], total_lemmas)
	            for emotion in self._EMOTIONS
	        ]

	    def measure_unique_word_ratio(self, text: str) -> float:
	        """Return distinct tokens divided by total tokens, ignoring punctuation.

	        The text is lower-cased before tokenizing. Returns 0.0 when no tokens
	        remain after the punctuation filter.
	        """
	        # NOTE: `punctuation` is a str, so this is a substring test rather
	        # than a per-character set lookup. It is kept as-is because changing
	        # the filter would shift this feature relative to the fitted scalers
	        # (presumably trained with the same filter - confirm before changing).
	        tokens = [
	            token for token in nltk.word_tokenize(text.lower())
	            if token not in punctuation
	        ]

	        return self._safe_ratio(len(set(tokens)), len(tokens))