Merge pull request #10 from YakobusIP/main

Files changed:
- .gitignore +2 -1
- cloudbuild-endpoint.yaml +0 -42
- core-model-prediction/models/random_forest.joblib +0 -0
- core-model-prediction/models/rf_weights.joblib +0 -0
- core-model-prediction/prediction.py +5 -8
- core-model-prediction/random_forest_dependencies.py +0 -30
- core-model-prediction/random_forest_model.py +4 -3
- core-model-prediction/requirements.txt +4 -0
- core-model-prediction/scalers/{rf_scaler.joblib → secondary_scaler.joblib} +0 -0
- core-model-prediction/secondary_model_dependencies.py +91 -0
.gitignore
CHANGED
@@ -1,3 +1,4 @@
 __pycache__
 .env
-*.json
+*.json
+*.ipynb
cloudbuild-endpoint.yaml
DELETED
@@ -1,42 +0,0 @@
-steps:
-  - name: "gcr.io/google.com/cloudsdktool/cloud-sdk"
-    entrypoint: "bash"
-    id: upload-model
-    args:
-      - "-c"
-      - |
-        gcloud ai models upload \
-          --region="us-central1" \
-          --container-ports=8080 \
-          --container-image-uri="us-central1-docker.pkg.dev/${PROJECT_ID}/interview-ai-detector/model-prediction:latest" \
-          --container-predict-route="/predict" \
-          --container-health-route="/health" \
-          --display-name="interview-ai-detector-model"
-
-  - name: "gcr.io/google.com/cloudsdktool/cloud-sdk"
-    entrypoint: "bash"
-    id: create-endpoint
-    waitFor: ["upload-model"]
-    args:
-      - "-c"
-      - |
-        gcloud ai endpoints create \
-          --region="us-central1" \
-          --display-name="interview-ai-detector-endpoint" \
-          --format="value(name)"
-
-  - name: "gcr.io/google.com/cloudsdktool/cloud-sdk"
-    entrypoint: "bash"
-    waitFor: ["create-endpoint"]
-    args:
-      - "-c"
-      - |
-        _MODEL_ID=$(gcloud ai models list --region=us-central1 --format="value(name)" | head -n 1) \
-        _ENDPOINT_ID=$(gcloud ai endpoints list --region=us-central1 --format="value(name)" | head -n 1) \
-        gcloud ai endpoints deploy-model $_ENDPOINT_ID \
-          --region="us-central1" \
-          --model=$_MODEL_ID \
-          --display-name="interview-ai-detector-deployment" \
-          --machine-type="n1-standard-4" \
-          --accelerator="count=1,type=nvidia-tesla-t4" \
-          --service-account="vertex-ai-user-managed-sa@steady-climate-416810.iam.gserviceaccount.com"
core-model-prediction/models/random_forest.joblib
DELETED
Binary file (96.1 kB)

core-model-prediction/models/rf_weights.joblib
ADDED
Binary file (228 kB)
core-model-prediction/prediction.py
CHANGED
@@ -1,7 +1,7 @@
 from fastapi import FastAPI, Response, status
 from pydantic import BaseModel
 from hypothesis import BaseModelHypothesis
-from random_forest_dependencies import RandomForestDependencies
+from secondary_model_dependencies import SecondaryModelDependencies
 from random_forest_model import RandomForestModel
 from main_model import PredictMainModel
 import numpy as np
@@ -50,9 +50,9 @@ def process_instance(data: PredictRequest):
         answer, additional_features)
 
     # Data preparation for 2nd model
-    random_forest_dependencies = RandomForestDependencies()
-    secondary_model_features = random_forest_dependencies.calculate_features(
-        answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
+    secondary_model_dependencies = SecondaryModelDependencies()
+    secondary_model_features = secondary_model_dependencies.calculate_features(
+        question, answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
 
     # 2nd model prediction
     secondary_model = RandomForestModel()
@@ -61,8 +61,5 @@
 
     return {
         "predicted_class": "AI" if secondary_model_prediction == 1 else "HUMAN",
-        "
-        "main_model_probability": str(main_model_probability),
-        "final_prediction": secondary_model_prediction
-        }
+        "main_model_probability": str(main_model_probability)
     }
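Taken together, the three hunks rewire the second-model path: the question is now threaded into feature calculation so GPT-3.5 and GPT-4 reference answers can be generated and scored against the candidate's answer, and the response payload is flattened to a predicted_class plus main_model_probability. A minimal sketch of the new flow, assuming hypothetical input values (the real handler reads these fields from the PredictRequest body):

# Sketch of the reworked second-model flow; all input values below are
# hypothetical stand-ins for fields of the PredictRequest body.
from secondary_model_dependencies import SecondaryModelDependencies
from random_forest_model import RandomForestModel

question = "Tell me about a challenging bug you fixed."          # hypothetical
answer = "I once tracked down a race condition in a job queue."  # hypothetical
main_model_probability = 0.42              # output of the main (1st) model
backspace_count, typing_duration = 12, 95  # keystroke telemetry
letter_click_counts = {"a": 4, "e": 7, "i": 3}

# The question is passed through so the two LLM reference answers can be
# generated and compared (cosine similarity) against the candidate answer.
secondary_model_dependencies = SecondaryModelDependencies()
secondary_model_features = secondary_model_dependencies.calculate_features(
    question, answer, main_model_probability,
    backspace_count, typing_duration, letter_click_counts)

secondary_model = RandomForestModel()
scaled_features = secondary_model.preprocess_input(secondary_model_features)

Note that calculate_features now makes two live OpenAI calls per request, so prediction latency includes those round-trips.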
core-model-prediction/random_forest_dependencies.py
DELETED
@@ -1,30 +0,0 @@
-from gemma2b_dependencies import Gemma2BDependencies
-from collections import Counter
-
-
-class RandomForestDependencies:
-    def calculate_features(self, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
-        backspace_count_normalized = backspace_count / len(answer)
-        typing_duration_normalized = typing_duration / len(answer)
-        letter_discrepancy = self.calculate_letter_discrepancy(
-            answer, letter_click_counts)
-
-        return [
-            probability, backspace_count_normalized,
-            typing_duration_normalized, letter_discrepancy
-        ]
-
-    def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]):
-        # Calculate letter frequencies in the text
-        text_letter_counts = Counter(text.lower())
-
-        # Calculate the ratio of click counts to text counts for each letter, adjusting for letters not in text
-        ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
-                  for letter in "abcdefghijklmnopqrstuvwxyz"]
-
-        # Average the ratios and normalize by the length of the text
-        average_ratio = sum(ratios) / len(ratios)
-        discrepancy_ratio_normalized = average_ratio / \
-            (len(text) if len(text) > 0 else 1)
-
-        return discrepancy_ratio_normalized
core-model-prediction/random_forest_model.py
CHANGED
@@ -6,10 +6,11 @@ from typing import List
 
 class RandomForestModel:
     def __init__(self):
-        self.scaler = joblib.load("scalers/rf_scaler.joblib")
-        self.model = joblib.load("models/random_forest.joblib")
+        self.scaler = joblib.load("scalers/secondary_scaler.joblib")
+        self.model = joblib.load("models/rf_weights.joblib")
         self.secondary_model_features = [
-            "machine_probability", "backspace_count_normalized", "typing_duration_normalized", "letter_discrepancy_normalized"
+            "machine_probability", "backspace_count_normalized", "typing_duration_normalized",
+            "letter_discrepancy_normalized", "cosine_sim_gpt35", "cosine_sim_gpt4"
         ]
 
     def preprocess_input(self, secondary_model_features: List[float]) -> np.ndarray:
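The feature vector grows from four to six columns, with the two cosine-similarity features appended, and the serialized artifacts pick up their new names (secondary_scaler.joblib, rf_weights.joblib). Column order matters: the scaler was presumably fit on features in exactly this order, so callers must supply them the same way. A hypothetical row for illustration:

# Hypothetical six-feature row in the order declared by
# secondary_model_features; all values are made up for illustration.
import numpy as np

features = [
    0.42,    # machine_probability (main model output)
    0.08,    # backspace_count_normalized
    0.63,    # typing_duration_normalized
    0.002,   # letter_discrepancy_normalized
    0.71,    # cosine_sim_gpt35
    0.69,    # cosine_sim_gpt4
]
row = np.asarray(features, dtype=float).reshape(1, -1)  # shape (1, 6)
# Assumption: preprocess_input scales this row with the loaded scaler,
# e.g. self.scaler.transform(row) for a scikit-learn-style scaler.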
core-model-prediction/requirements.txt
CHANGED
@@ -4,6 +4,10 @@ pandas
 textstat==0.7.3
 scikit-learn==1.2.2
 transformers==4.38.2
+sentence-transformers==2.7.0
+langchain
+openai
+langchain-openai
 fastapi
 uvicorn
 google-cloud-secret-manager
core-model-prediction/scalers/{rf_scaler.joblib → secondary_scaler.joblib}
RENAMED
Binary files a/core-model-prediction/scalers/rf_scaler.joblib and b/core-model-prediction/scalers/secondary_scaler.joblib differ
core-model-prediction/secondary_model_dependencies.py
ADDED
@@ -0,0 +1,91 @@
+from sentence_transformers import SentenceTransformer, util
+from collections import Counter
+from langchain_openai import ChatOpenAI
+from langchain_core.messages import HumanMessage, SystemMessage
+from google.cloud import secretmanager
+
+
+class SecondaryModelDependencies:
+    def __init__(self):
+        self.text_similarity_model = SentenceTransformer(
+            'sentence-transformers/all-mpnet-base-v2')
+        api_key = self.access_openai_api_key()
+        self.llm_gpt35 = ChatOpenAI(
+            api_key=api_key, model="gpt-3.5-turbo")
+        self.llm_gpt4 = ChatOpenAI(
+            api_key=api_key, model="gpt-4-turbo")
+
+    def access_openai_api_key(self):
+        client = secretmanager.SecretManagerServiceClient()
+        name = "projects/steady-climate-416810/secrets/OPENAI_API_KEY/versions/1"
+        response = client.access_secret_version(request={"name": name})
+        return response.payload.data.decode('UTF-8')
+
+    def calculate_features(self, question: str, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
+        backspace_count_normalized = backspace_count / len(answer)
+        typing_duration_normalized = typing_duration / len(answer)
+        letter_discrepancy = self.calculate_letter_discrepancy(
+            answer, letter_click_counts)
+
+        gpt35_answer = self.generate_gpt35_answer(question)
+        gpt4_answer = self.generate_gpt4_answer(question)
+
+        cosine_sim_gpt35 = self.calculate_similarity_gpt35(
+            answer, gpt35_answer)
+        cosine_sim_gpt4 = self.calculate_similarity_gpt4(answer, gpt4_answer)
+
+        return [
+            probability, backspace_count_normalized, typing_duration_normalized,
+            letter_discrepancy, cosine_sim_gpt35, cosine_sim_gpt4
+        ]
+
+    def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]):
+        # Calculate letter frequencies in the text
+        text_letter_counts = Counter(text.lower())
+
+        # Calculate the ratio of click counts to text counts for each letter, adjusting for letters not in text
+        ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
+                  for letter in "abcdefghijklmnopqrstuvwxyz"]
+
+        # Average the ratios and normalize by the length of the text
+        average_ratio = sum(ratios) / len(ratios)
+        discrepancy_ratio_normalized = average_ratio / \
+            (len(text) if len(text) > 0 else 1)
+
+        return discrepancy_ratio_normalized
+
+    def generate_gpt35_answer(self, question: str):
+        messages = [
+            SystemMessage(
+                content="Please answer the following question based solely on your internal knowledge, without external references. Assume you are the human."),
+            HumanMessage(question)
+        ]
+
+        gpt35_answer = self.llm_gpt35.invoke(messages)
+        return gpt35_answer.content
+
+    def generate_gpt4_answer(self, question: str):
+        messages = [
+            SystemMessage(
+                content="Please answer the following question based solely on your internal knowledge, without external references. Assume you are the human."),
+            HumanMessage(question)
+        ]
+
+        gpt4_answer = self.llm_gpt4.invoke(messages)
+        return gpt4_answer.content
+
+    def calculate_similarity_gpt35(self, answer: str, gpt35_answer: str) -> float:
+        embedding1 = self.text_similarity_model.encode(
+            [answer], convert_to_tensor=True)
+        embedding2 = self.text_similarity_model.encode(
+            [gpt35_answer], convert_to_tensor=True)
+        cosine_scores = util.cos_sim(embedding1, embedding2)
+        return cosine_scores.item()
+
+    def calculate_similarity_gpt4(self, answer: str, gpt4_answer: str) -> float:
+        embedding1 = self.text_similarity_model.encode(
+            [answer], convert_to_tensor=True)
+        embedding2 = self.text_similarity_model.encode(
+            [gpt4_answer], convert_to_tensor=True)
+        cosine_scores = util.cos_sim(embedding1, embedding2)
+        return cosine_scores.item()
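A short usage sketch of the new class. The constructor implies two runtime requirements: Google Cloud credentials able to read the OPENAI_API_KEY secret, and network access for the ChatOpenAI calls issued inside calculate_features. Inputs below are hypothetical:

# Hypothetical inputs; each calculate_features call triggers two OpenAI
# round-trips (one GPT-3.5 answer, one GPT-4 answer) plus local embeddings.
deps = SecondaryModelDependencies()
features = deps.calculate_features(
    question="What does a mutex protect against?",
    answer="It serializes access to shared state between threads.",
    probability=0.42,
    backspace_count=3,
    typing_duration=41,
    letter_click_counts={"t": 8, "e": 6, "s": 5},
)
# Returns, in order: [probability, backspace_count_normalized,
#   typing_duration_normalized, letter_discrepancy,
#   cosine_sim_gpt35, cosine_sim_gpt4]

The GPT-3.5 and GPT-4 answer generators are identical except for the bound model, as are the two similarity methods, so a single parameterized helper could serve both pairs.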