Commit ae51d62
Parent(s): 07ba2c0
feat: 3 feature classes + main endpoint
- .gitignore +1 -0
- gemma2b.py +62 -0
- hypothesis.py +117 -0
- prediction.py +63 -0
- randomforest.py +40 -0
.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__
gemma2b.py
ADDED
@@ -0,0 +1,62 @@
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.nn.functional import cosine_similarity
from collections import Counter
import numpy as np


class Gemma2BDependencies:
    def __init__(self, question: str, answer: str):
        self.question = question
        self.answer = answer
        self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
        self.model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")
        # Fall back to CPU when CUDA is unavailable
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def calculate_perplexity(self):
        inputs = self.tokenizer(self.answer, return_tensors="pt",
                                truncation=True, max_length=1024)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Calculate the model's loss on the answer and exponentiate it
        with torch.no_grad():
            outputs = self.model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss
            perplexity = torch.exp(loss)

        return perplexity.item()

    def calculate_burstiness(self):
        # Tokenize the answer with the Gemma tokenizer
        tokens = self.tokenizer.tokenize(self.answer)

        # Count token frequencies
        frequency_counts = list(Counter(tokens).values())

        # Calculate variance and mean of frequencies
        variance = np.var(frequency_counts)
        mean = np.mean(frequency_counts)

        # Compute Variance-to-Mean Ratio (VMR) for burstiness
        vmr = variance / mean if mean > 0 else 0
        return vmr

    def get_embedding(self, text: str):
        inputs = self.tokenizer(text, return_tensors="pt",
                                truncation=True, max_length=1024)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        last_hidden_states = outputs.hidden_states[-1]
        # Average the token embeddings to get a sentence-level embedding
        embedding = torch.mean(last_hidden_states, dim=1)
        return embedding

    def calculate_cosine_similarity(self):
        embedding1 = self.get_embedding(self.question)
        embedding2 = self.get_embedding(self.answer)
        # Embeddings are (1, hidden_size), the shape cosine_similarity expects
        return cosine_similarity(embedding1, embedding2).item()
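A minimal usage sketch of Gemma2BDependencies (the question/answer strings below are made up for illustration, and the first call downloads the google/gemma-2b weights):

deps = Gemma2BDependencies(
    question="What is burstiness in text?",
    answer="Burstiness measures how unevenly tokens repeat across a passage.")

print(deps.calculate_perplexity())         # perplexity of the answer under Gemma-2B
print(deps.calculate_burstiness())         # variance-to-mean ratio of token frequencies
print(deps.calculate_cosine_similarity())  # question-answer embedding similarity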
hypothesis.py
ADDED
@@ -0,0 +1,117 @@
import nltk
import joblib
import textstat
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gemma2b import Gemma2BDependencies


class BaseModelHypothesis:
    def __init__(self, question: str, answer: str):
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger')

        self.question = question
        self.answer = answer

        self.analyzer = SentimentIntensityAnalyzer()
        self.lexicon_df = pd.read_csv(
            "https://storage.googleapis.com/ta-ai-detector/datasets/NRC-Emotion-Lexicon.csv")
        self.emotion_lexicon = self.process_emotion_lexicon()
        self.gemma2bdependencies = Gemma2BDependencies(
            self.question, self.answer)

        self.features_normalized_text_length = []
        self.features_not_normalized = []

        self.scaler_normalized_text_length = joblib.load(
            "scaler-normalized-text-length.joblib")
        self.scaler_not_normalized = joblib.load(
            "scaler-not-normalized.joblib")

    def process_emotion_lexicon(self):
        emotion_lexicon = {}
        for _, row in self.lexicon_df.iterrows():
            if row["word"] not in emotion_lexicon:
                emotion_lexicon[row["word"]] = []
            emotion_lexicon[row["word"]].append(row["emotion"])
        return emotion_lexicon

    def calculate_normalized_text_length_features(self):
        # The helpers below read self.answer directly, so they take no arguments
        self.features_normalized_text_length = self.extract_pos_features()
        self.features_normalized_text_length = self.features_normalized_text_length + \
            self.calculate_emotion_proportions()
        self.features_normalized_text_length.append(
            self.measure_unique_word_ratio())

        return self.scaler_normalized_text_length.transform(
            np.array(self.features_normalized_text_length).astype(np.float32).reshape(1, -1))

    def calculate_not_normalized_features(self):
        self.features_not_normalized.append(
            self.measure_sentiment_intensity())
        self.features_not_normalized = self.features_not_normalized + \
            self.measure_readability()
        self.features_not_normalized.append(
            self.gemma2bdependencies.calculate_perplexity())
        self.features_not_normalized.append(
            self.gemma2bdependencies.calculate_burstiness())

        return self.scaler_not_normalized.transform(
            np.array(self.features_not_normalized).astype(np.float32).reshape(1, -1))

    def extract_pos_features(self):
        words = nltk.word_tokenize(self.answer)
        pos_tags = nltk.pos_tag(words)
        desired_tags = ["JJ", "VB", "RB", "PRP", "DT", "IN", "NN", "NNS"]
        pos_counts = defaultdict(int, {tag: 0 for tag in desired_tags})

        for _, pos in pos_tags:
            if pos in pos_counts:
                pos_counts[pos] += 1

        total_words = len(words)
        pos_ratios = [pos_counts[tag] / total_words for tag in desired_tags]

        return pos_ratios

    def measure_sentiment_intensity(self):
        sentiment = self.analyzer.polarity_scores(self.answer)
        return sentiment["compound"]

    def measure_readability(self):
        gunning_fog = textstat.gunning_fog(self.answer)
        smog_index = textstat.smog_index(self.answer)
        dale_chall_score = textstat.dale_chall_readability_score(self.answer)

        return [gunning_fog, smog_index, dale_chall_score]

    def calculate_emotion_proportions(self):
        tokens = nltk.word_tokenize(self.answer)

        total_tokens = len(tokens)

        emotion_counts = {emotion: 0 for emotion in [
            "negative", "positive", "fear", "anger", "trust",
            "sadness", "disgust", "anticipation", "joy", "surprise"]}

        for token in tokens:
            if token in self.emotion_lexicon:
                for emotion in self.emotion_lexicon[token]:
                    emotion_counts[emotion] += 1

        proportions = {emotion: count / total_tokens for emotion,
                       count in emotion_counts.items()}

        return [
            proportions["negative"], proportions["positive"], proportions["fear"],
            proportions["anger"], proportions["trust"], proportions["sadness"],
            proportions["disgust"], proportions["anticipation"], proportions["joy"],
            proportions["surprise"]
        ]

    def measure_unique_word_ratio(self):
        tokens = nltk.word_tokenize(self.answer)
        total_words = len(tokens)

        unique_words = len(Counter(tokens).keys())

        return unique_words / total_words
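A sketch of how the two feature vectors are produced (strings are illustrative; the scaler .joblib files and the lexicon CSV must be reachable). Their widths add up to the 25 extra inputs expected by the classifier head in prediction.py:

hypothesis = BaseModelHypothesis(
    question="Explain photosynthesis.",
    answer="Plants convert light energy into chemical energy stored as glucose.")

normalized = hypothesis.calculate_normalized_text_length_features()
# shape (1, 19): 8 POS ratios + 10 emotion proportions + 1 unique-word ratio
not_normalized = hypothesis.calculate_not_normalized_features()
# shape (1, 6): sentiment + 3 readability scores + perplexity + burstiness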
prediction.py
ADDED
@@ -0,0 +1,63 @@
from fastapi import FastAPI
from pydantic import BaseModel
from hypothesis import BaseModelHypothesis
from randomforest import RandomForestDependencies
import torch.nn as nn
import torch


class AlbertCustomClassificationHead(nn.Module):
    def __init__(self, albert_model, dropout_rate=0.1):
        super(AlbertCustomClassificationHead, self).__init__()
        self.albert_model = albert_model
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(1024 + 25, 1)

    def forward(self, input_ids, attention_mask, additional_features, labels=None):
        albert_output = self.albert_model(
            input_ids=input_ids, attention_mask=attention_mask).pooler_output

        combined_features = torch.cat(
            [albert_output, additional_features], dim=1)

        dropout_output = self.dropout(combined_features)

        logits = self.classifier(dropout_output)

        if labels is not None:
            loss_fn = nn.BCEWithLogitsLoss()
            labels = labels.unsqueeze(1)
            loss = loss_fn(logits, labels.float())
            return logits, loss
        else:
            return logits


app = FastAPI()


class PredictRequest(BaseModel):
    question: str
    answer: str
    backspace_count: int
    typing_duration: int
    letter_click_counts: dict[str, int]


@app.post("/predict")
async def predict(request: PredictRequest):
    request_dict = request.model_dump()

    question = request_dict.get("question")
    answer = request_dict.get("answer")
    backspace_count = request_dict.get("backspace_count")
    typing_duration = request_dict.get("typing_duration")
    letter_click_counts = request_dict.get("letter_click_counts")

    hypothesis = BaseModelHypothesis(question, answer)
    features_normalized_text_length = hypothesis.calculate_normalized_text_length_features()
    features_not_normalized = hypothesis.calculate_not_normalized_features()

    # Placeholder response: the classifier is not wired up yet in this commit
    return backspace_count
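One way to exercise the endpoint locally is FastAPI's TestClient; the payload values below are illustrative, and constructing BaseModelHypothesis inside the handler loads Gemma-2B, so the first request is slow:

from fastapi.testclient import TestClient
from prediction import app

client = TestClient(app)
response = client.post("/predict", json={
    "question": "Describe your favourite book.",
    "answer": "My favourite book is a mystery novel about a lighthouse keeper.",
    "backspace_count": 12,
    "typing_duration": 95,
    "letter_click_counts": {"a": 10, "e": 14, "o": 7},
})
print(response.json())  # currently just echoes backspace_count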
randomforest.py
ADDED
@@ -0,0 +1,40 @@
from gemma2b import Gemma2BDependencies
from collections import Counter


class RandomForestDependencies:
    def __init__(self, question: str, answer: str):
        self.question = question
        self.answer = answer

        self.gemma2bdependencies = Gemma2BDependencies(
            self.question, self.answer)
        self.random_forest_features = []

    def calculate_features(self, probability: float, backspace_count: int,
                           typing_duration: int, letter_click_counts: dict[str, int]):
        # Gemma2BDependencies already holds the question and answer
        cosine_similarity = self.gemma2bdependencies.calculate_cosine_similarity()
        backspace_count_normalized = backspace_count / len(self.answer)
        typing_duration_normalized = typing_duration / len(self.answer)
        letter_discrepancy = self.calculate_letter_discrepancy(letter_click_counts)

        self.random_forest_features = [
            cosine_similarity, probability, backspace_count_normalized,
            typing_duration_normalized, letter_discrepancy
        ]

    def calculate_letter_discrepancy(self, letter_click_counts: dict[str, int]):
        # Calculate letter frequencies in the text
        text_letter_counts = Counter(self.answer.lower())

        # Calculate the ratio of click counts to text counts for each letter,
        # adjusting for letters not in the text
        ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
                  for letter in "abcdefghijklmnopqrstuvwxyz"]

        # Average the ratios and normalize by the length of the text
        average_ratio = sum(ratios) / len(ratios)
        discrepancy_ratio_normalized = average_ratio / \
            (len(self.answer) if len(self.answer) > 0 else 1)

        return discrepancy_ratio_normalized
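A sketch of how the random-forest feature vector might be assembled; the probability argument is assumed to come from the ALBERT classification head in prediction.py, and all values here are illustrative:

rf = RandomForestDependencies(
    question="What motivates you?",
    answer="I am motivated by solving hard problems with small teams.")

rf.calculate_features(
    probability=0.73,
    backspace_count=8,
    typing_duration=120,
    letter_click_counts={"a": 5, "e": 9, "i": 4})

print(rf.random_forest_features)
# [cosine_similarity, probability, backspaces/len(answer), duration/len(answer), letter_discrepancy]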