Merge pull request #10 from YakobusIP/main

Files changed:
- .gitignore +2 -1
- cloudbuild-endpoint.yaml +0 -42
- core-model-prediction/models/random_forest.joblib +0 -0
- core-model-prediction/models/rf_weights.joblib +0 -0
- core-model-prediction/prediction.py +5 -8
- core-model-prediction/random_forest_dependencies.py +0 -30
- core-model-prediction/random_forest_model.py +4 -3
- core-model-prediction/requirements.txt +4 -0
- core-model-prediction/scalers/{rf_scaler.joblib → secondary_scaler.joblib} +0 -0
- core-model-prediction/secondary_model_dependencies.py +91 -0
.gitignore
CHANGED
@@ -1,3 +1,4 @@
 __pycache__
 .env
-*.json
+*.json
+*.ipynb
cloudbuild-endpoint.yaml
DELETED
@@ -1,42 +0,0 @@
-steps:
-  - name: "gcr.io/google.com/cloudsdktool/cloud-sdk"
-    entrypoint: "bash"
-    id: upload-model
-    args:
-      - "-c"
-      - |
-        gcloud ai models upload \
-          --region="us-central1" \
-          --container-ports=8080 \
-          --container-image-uri="us-central1-docker.pkg.dev/${PROJECT_ID}/interview-ai-detector/model-prediction:latest" \
-          --container-predict-route="/predict" \
-          --container-health-route="/health" \
-          --display-name="interview-ai-detector-model"
-
-  - name: "gcr.io/google.com/cloudsdktool/cloud-sdk"
-    entrypoint: "bash"
-    id: create-endpoint
-    waitFor: ["upload-model"]
-    args:
-      - "-c"
-      - |
-        gcloud ai endpoints create \
-          --region="us-central1" \
-          --display-name="interview-ai-detector-endpoint" \
-          --format="value(name)"
-
-  - name: "gcr.io/google.com/cloudsdktool/cloud-sdk"
-    entrypoint: "bash"
-    waitFor: ["create-endpoint"]
-    args:
-      - "-c"
-      - |
-        _MODEL_ID=$(gcloud ai models list --region=us-central1 --format="value(name)" | head -n 1) \
-        _ENDPOINT_ID=$(gcloud ai endpoints list --region=us-central1 --format="value(name)" | head -n 1) \
-        gcloud ai endpoints deploy-model $_ENDPOINT_ID \
-          --region="us-central1" \
-          --model=$_MODEL_ID \
-          --display-name="interview-ai-detector-deployment" \
-          --machine-type="n1-standard-4" \
-          --accelerator="count=1,type=nvidia-tesla-t4" \
-          --service-account="vertex-ai-user-managed-sa@steady-climate-416810.iam.gserviceaccount.com"
core-model-prediction/models/random_forest.joblib
DELETED
Binary file (96.1 kB)

core-model-prediction/models/rf_weights.joblib
ADDED
Binary file (228 kB)
core-model-prediction/prediction.py
CHANGED
@@ -1,7 +1,7 @@
 from fastapi import FastAPI, Response, status
 from pydantic import BaseModel
 from hypothesis import BaseModelHypothesis
-from random_forest_dependencies import RandomForestDependencies
+from secondary_model_dependencies import SecondaryModelDependencies
 from random_forest_model import RandomForestModel
 from main_model import PredictMainModel
 import numpy as np
@@ -50,9 +50,9 @@ def process_instance(data: PredictRequest):
         answer, additional_features)
 
     # Data preparation for 2nd model
-    random_forest_dependencies = RandomForestDependencies()
-    secondary_model_features = random_forest_dependencies.calculate_features(
-        answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
+    secondary_model_dependencies = SecondaryModelDependencies()
+    secondary_model_features = secondary_model_dependencies.calculate_features(
+        question, answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
 
     # 2nd model prediction
     secondary_model = RandomForestModel()
@@ -61,8 +61,5 @@
 
     return {
         "predicted_class": "AI" if secondary_model_prediction == 1 else "HUMAN",
-        "
-        "main_model_probability": str(main_model_probability),
-        "final_prediction": secondary_model_prediction
-        }
+        "main_model_probability": str(main_model_probability)
     }
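Taken together, the three hunks rewire the second-model path: the question is now threaded into feature calculation so GPT-3.5 and GPT-4 reference answers can be generated and scored against the candidate's answer, and the response payload is flattened to a predicted_class plus main_model_probability. A minimal sketch of the new flow, assuming hypothetical input values (the real handler reads these fields from the PredictRequest body):

# Sketch of the reworked second-model flow; all input values below are
# hypothetical stand-ins for fields of the PredictRequest body.
from secondary_model_dependencies import SecondaryModelDependencies
from random_forest_model import RandomForestModel

question = "Tell me about a challenging bug you fixed."          # hypothetical
answer = "I once tracked down a race condition in a job queue."  # hypothetical
main_model_probability = 0.42              # output of the main (1st) model
backspace_count, typing_duration = 12, 95  # keystroke telemetry
letter_click_counts = {"a": 4, "e": 7, "i": 3}

# The question is passed through so the two LLM reference answers can be
# generated and compared (cosine similarity) against the candidate answer.
secondary_model_dependencies = SecondaryModelDependencies()
secondary_model_features = secondary_model_dependencies.calculate_features(
    question, answer, main_model_probability,
    backspace_count, typing_duration, letter_click_counts)

secondary_model = RandomForestModel()
scaled_features = secondary_model.preprocess_input(secondary_model_features)

Note that calculate_features now makes two live OpenAI calls per request, so prediction latency includes those round-trips.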
core-model-prediction/random_forest_dependencies.py
DELETED
@@ -1,30 +0,0 @@
-from gemma2b_dependencies import Gemma2BDependencies
-from collections import Counter
-
-
-class RandomForestDependencies:
-    def calculate_features(self, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
-        backspace_count_normalized = backspace_count / len(answer)
-        typing_duration_normalized = typing_duration / len(answer)
-        letter_discrepancy = self.calculate_letter_discrepancy(
-            answer, letter_click_counts)
-
-        return [
-            probability, backspace_count_normalized,
-            typing_duration_normalized, letter_discrepancy
-        ]
-
-    def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]):
-        # Calculate letter frequencies in the text
-        text_letter_counts = Counter(text.lower())
-
-        # Calculate the ratio of click counts to text counts for each letter, adjusting for letters not in text
-        ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
-                  for letter in "abcdefghijklmnopqrstuvwxyz"]
-
-        # Average the ratios and normalize by the length of the text
-        average_ratio = sum(ratios) / len(ratios)
-        discrepancy_ratio_normalized = average_ratio / \
-            (len(text) if len(text) > 0 else 1)
-
-        return discrepancy_ratio_normalized
core-model-prediction/random_forest_model.py
CHANGED
@@ -6,10 +6,11 @@ from typing import List
 
 class RandomForestModel:
     def __init__(self):
-        self.scaler = joblib.load("scalers/rf_scaler.joblib")
-        self.model = joblib.load("models/random_forest.joblib")
+        self.scaler = joblib.load("scalers/secondary_scaler.joblib")
+        self.model = joblib.load("models/rf_weights.joblib")
         self.secondary_model_features = [
-            "machine_probability", "backspace_count_normalized", "typing_duration_normalized", "letter_discrepancy_normalized"
+            "machine_probability", "backspace_count_normalized", "typing_duration_normalized",
+            "letter_discrepancy_normalized", "cosine_sim_gpt35", "cosine_sim_gpt4"
         ]
 
     def preprocess_input(self, secondary_model_features: List[float]) -> np.ndarray:
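The feature vector grows from four to six columns, with the two cosine-similarity features appended, and the serialized artifacts pick up their new names (secondary_scaler.joblib, rf_weights.joblib). Column order matters: the scaler was presumably fit on features in exactly this order, so callers must supply them the same way. A hypothetical row for illustration:

# Hypothetical six-feature row in the order declared by
# secondary_model_features; all values are made up for illustration.
import numpy as np

features = [
    0.42,    # machine_probability (main model output)
    0.08,    # backspace_count_normalized
    0.63,    # typing_duration_normalized
    0.002,   # letter_discrepancy_normalized
    0.71,    # cosine_sim_gpt35
    0.69,    # cosine_sim_gpt4
]
row = np.asarray(features, dtype=float).reshape(1, -1)  # shape (1, 6)
# Assumption: preprocess_input scales this row with the loaded scaler,
# e.g. self.scaler.transform(row) for a scikit-learn-style scaler.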
core-model-prediction/requirements.txt
CHANGED
@@ -4,6 +4,10 @@ pandas
 textstat==0.7.3
 scikit-learn==1.2.2
 transformers==4.38.2
+sentence-transformers==2.7.0
+langchain
+openai
+langchain-openai
 fastapi
 uvicorn
 google-cloud-secret-manager
core-model-prediction/scalers/{rf_scaler.joblib → secondary_scaler.joblib}
RENAMED
Binary files a/core-model-prediction/scalers/rf_scaler.joblib and b/core-model-prediction/scalers/secondary_scaler.joblib differ
core-model-prediction/secondary_model_dependencies.py
ADDED
@@ -0,0 +1,91 @@
+from sentence_transformers import SentenceTransformer, util
+from collections import Counter
+from langchain_openai import ChatOpenAI
+from langchain_core.messages import HumanMessage, SystemMessage
+from google.cloud import secretmanager
+
+
+class SecondaryModelDependencies:
+    def __init__(self):
+        self.text_similarity_model = SentenceTransformer(
+            'sentence-transformers/all-mpnet-base-v2')
+        api_key = self.access_openai_api_key()
+        self.llm_gpt35 = ChatOpenAI(
+            api_key=api_key, model="gpt-3.5-turbo")
+        self.llm_gpt4 = ChatOpenAI(
+            api_key=api_key, model="gpt-4-turbo")
+
+    def access_openai_api_key(self):
+        client = secretmanager.SecretManagerServiceClient()
+        name = "projects/steady-climate-416810/secrets/OPENAI_API_KEY/versions/1"
+        response = client.access_secret_version(request={"name": name})
+        return response.payload.data.decode('UTF-8')
+
+    def calculate_features(self, question: str, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
+        backspace_count_normalized = backspace_count / len(answer)
+        typing_duration_normalized = typing_duration / len(answer)
+        letter_discrepancy = self.calculate_letter_discrepancy(
+            answer, letter_click_counts)
+
+        gpt35_answer = self.generate_gpt35_answer(question)
+        gpt4_answer = self.generate_gpt4_answer(question)
+
+        cosine_sim_gpt35 = self.calculate_similarity_gpt35(
+            answer, gpt35_answer)
+        cosine_sim_gpt4 = self.calculate_similarity_gpt4(answer, gpt4_answer)
+
+        return [
+            probability, backspace_count_normalized, typing_duration_normalized,
+            letter_discrepancy, cosine_sim_gpt35, cosine_sim_gpt4
+        ]
+
+    def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]):
+        # Calculate letter frequencies in the text
+        text_letter_counts = Counter(text.lower())
+
+        # Calculate the ratio of click counts to text counts for each letter, adjusting for letters not in text
+        ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
+                  for letter in "abcdefghijklmnopqrstuvwxyz"]
+
+        # Average the ratios and normalize by the length of the text
+        average_ratio = sum(ratios) / len(ratios)
+        discrepancy_ratio_normalized = average_ratio / \
+            (len(text) if len(text) > 0 else 1)
+
+        return discrepancy_ratio_normalized
+
+    def generate_gpt35_answer(self, question: str):
+        messages = [
+            SystemMessage(
+                content="Please answer the following question based solely on your internal knowledge, without external references. Assume you are the human."),
+            HumanMessage(question)
+        ]
+
+        gpt35_answer = self.llm_gpt35.invoke(messages)
+        return gpt35_answer.content
+
+    def generate_gpt4_answer(self, question: str):
+        messages = [
+            SystemMessage(
+                content="Please answer the following question based solely on your internal knowledge, without external references. Assume you are the human."),
+            HumanMessage(question)
+        ]
+
+        gpt4_answer = self.llm_gpt4.invoke(messages)
+        return gpt4_answer.content
+
+    def calculate_similarity_gpt35(self, answer: str, gpt35_answer: str) -> float:
+        embedding1 = self.text_similarity_model.encode(
+            [answer], convert_to_tensor=True)
+        embedding2 = self.text_similarity_model.encode(
+            [gpt35_answer], convert_to_tensor=True)
+        cosine_scores = util.cos_sim(embedding1, embedding2)
+        return cosine_scores.item()
+
+    def calculate_similarity_gpt4(self, answer: str, gpt4_answer: str) -> float:
+        embedding1 = self.text_similarity_model.encode(
+            [answer], convert_to_tensor=True)
+        embedding2 = self.text_similarity_model.encode(
+            [gpt4_answer], convert_to_tensor=True)
+        cosine_scores = util.cos_sim(embedding1, embedding2)
+        return cosine_scores.item()
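A short usage sketch of the new class. The constructor implies two runtime requirements: Google Cloud credentials able to read the OPENAI_API_KEY secret, and network access for the ChatOpenAI calls issued inside calculate_features. Inputs below are hypothetical:

# Hypothetical inputs; each calculate_features call triggers two OpenAI
# round-trips (one GPT-3.5 answer, one GPT-4 answer) plus local embeddings.
deps = SecondaryModelDependencies()
features = deps.calculate_features(
    question="What does a mutex protect against?",
    answer="It serializes access to shared state between threads.",
    probability=0.42,
    backspace_count=3,
    typing_duration=41,
    letter_click_counts={"t": 8, "e": 6, "s": 5},
)
# Returns, in order: [probability, backspace_count_normalized,
#   typing_duration_normalized, letter_discrepancy,
#   cosine_sim_gpt35, cosine_sim_gpt4]

The GPT-3.5 and GPT-4 answer generators are identical except for the bound model, as are the two similarity methods, so a single parameterized helper could serve both pairs.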