Yakobus Iryanto Prasethio committed on
Commit
d290137
·
unverified ·
2 Parent(s): df00cec 84f0cff

Merge pull request #10 from YakobusIP/main

Browse files
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  __pycache__
2
  .env
3
- *.json
 
 
1
  __pycache__
2
  .env
3
+ *.json
4
+ *.ipynb
cloudbuild-endpoint.yaml DELETED
@@ -1,42 +0,0 @@
1
- steps:
2
- - name: "gcr.io/google.com/cloudsdktool/cloud-sdk"
3
- entrypoint: "bash"
4
- id: upload-model
5
- args:
6
- - "-c"
7
- - |
8
- gcloud ai models upload \
9
- --region="us-central1" \
10
- --container-ports=8080 \
11
- --container-image-uri="us-central1-docker.pkg.dev/${PROJECT_ID}/interview-ai-detector/model-prediction:latest" \
12
- --container-predict-route="/predict" \
13
- --container-health-route="/health" \
14
- --display-name="interview-ai-detector-model"
15
-
16
- - name: "gcr.io/google.com/cloudsdktool/cloud-sdk"
17
- entrypoint: "bash"
18
- id: create-endpoint
19
- waitFor: ["upload-model"]
20
- args:
21
- - "-c"
22
- - |
23
- gcloud ai endpoints create \
24
- --region="us-central1" \
25
- --display-name="interview-ai-detector-endpoint" \
26
- --format="value(name)"
27
-
28
- - name: "gcr.io/google.com/cloudsdktool/cloud-sdk"
29
- entrypoint: "bash"
30
- waitFor: ["create-endpoint"]
31
- args:
32
- - "-c"
33
- - |
34
- _MODEL_ID=$(gcloud ai models list --region=us-central1 --format="value(name)" | head -n 1) \
35
- _ENDPOINT_ID=$(gcloud ai endpoints list --region=us-central1 --format="value(name)" | head -n 1) \
36
- gcloud ai endpoints deploy-model $_ENDPOINT_ID \
37
- --region="us-central1" \
38
- --model=$_MODEL_ID \
39
- --display-name="interview-ai-detector-deployment" \
40
- --machine-type="n1-standard-4" \
41
- --accelerator="count=1,type=nvidia-tesla-t4" \
42
- --service-account="vertex-ai-user-managed-sa@steady-climate-416810.iam.gserviceaccount.com"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core-model-prediction/models/random_forest.joblib DELETED
Binary file (96.1 kB)
 
core-model-prediction/models/rf_weights.joblib ADDED
Binary file (228 kB). View file
 
core-model-prediction/prediction.py CHANGED
@@ -1,7 +1,7 @@
1
  from fastapi import FastAPI, Response, status
2
  from pydantic import BaseModel
3
  from hypothesis import BaseModelHypothesis
4
- from random_forest_dependencies import RandomForestDependencies
5
  from random_forest_model import RandomForestModel
6
  from main_model import PredictMainModel
7
  import numpy as np
@@ -50,9 +50,9 @@ def process_instance(data: PredictRequest):
50
  answer, additional_features)
51
 
52
  # Data preparation for 2nd model
53
- random_forest_features = RandomForestDependencies()
54
- secondary_model_features = random_forest_features.calculate_features(
55
- answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
56
 
57
  # 2nd model prediction
58
  secondary_model = RandomForestModel()
@@ -61,8 +61,5 @@ def process_instance(data: PredictRequest):
61
 
62
  return {
63
  "predicted_class": "AI" if secondary_model_prediction == 1 else "HUMAN",
64
- "details": {
65
- "main_model_probability": str(main_model_probability),
66
- "final_prediction": secondary_model_prediction
67
- }
68
  }
 
1
  from fastapi import FastAPI, Response, status
2
  from pydantic import BaseModel
3
  from hypothesis import BaseModelHypothesis
4
+ from secondary_model_dependencies import SecondaryModelDependencies
5
  from random_forest_model import RandomForestModel
6
  from main_model import PredictMainModel
7
  import numpy as np
 
50
  answer, additional_features)
51
 
52
  # Data preparation for 2nd model
53
+ secondary_model_dependencies = SecondaryModelDependencies()
54
+ secondary_model_features = secondary_model_dependencies.calculate_features(
55
+ question, answer, main_model_probability, backspace_count, typing_duration, letter_click_counts)
56
 
57
  # 2nd model prediction
58
  secondary_model = RandomForestModel()
 
61
 
62
  return {
63
  "predicted_class": "AI" if secondary_model_prediction == 1 else "HUMAN",
64
+ "main_model_probability": str(main_model_probability)
 
 
 
65
  }
core-model-prediction/random_forest_dependencies.py DELETED
@@ -1,30 +0,0 @@
1
- from gemma2b_dependencies import Gemma2BDependencies
2
- from collections import Counter
3
-
4
-
5
- class RandomForestDependencies:
6
- def calculate_features(self, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]):
7
- backspace_count_normalized = backspace_count / len(answer)
8
- typing_duration_normalized = typing_duration / len(answer)
9
- letter_discrepancy = self.calculate_letter_discrepancy(
10
- answer, letter_click_counts)
11
-
12
- return [
13
- probability, backspace_count_normalized,
14
- typing_duration_normalized, letter_discrepancy
15
- ]
16
-
17
- def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]):
18
- # Calculate letter frequencies in the text
19
- text_letter_counts = Counter(text.lower())
20
-
21
- # Calculate the ratio of click counts to text counts for each letter, adjusting for letters not in text
22
- ratios = [letter_click_counts.get(letter, 0) / (text_letter_counts.get(letter, 0) + 1)
23
- for letter in "abcdefghijklmnopqrstuvwxyz"]
24
-
25
- # Average the ratios and normalize by the length of the text
26
- average_ratio = sum(ratios) / len(ratios)
27
- discrepancy_ratio_normalized = average_ratio / \
28
- (len(text) if len(text) > 0 else 1)
29
-
30
- return discrepancy_ratio_normalized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
core-model-prediction/random_forest_model.py CHANGED
@@ -6,10 +6,11 @@ from typing import List
6
 
7
  class RandomForestModel:
8
  def __init__(self):
9
- self.scaler = joblib.load("scalers/rf_scaler.joblib")
10
- self.model = joblib.load("models/random_forest.joblib")
11
  self.secondary_model_features = [
12
- "machine_probability", "backspace_count_normalized", "typing_duration_normalized", "letter_discrepancy_normalized"
 
13
  ]
14
 
15
  def preprocess_input(self, secondary_model_features: List[float]) -> np.ndarray:
 
6
 
7
  class RandomForestModel:
8
  def __init__(self):
9
+ self.scaler = joblib.load("scalers/secondary_scaler.joblib")
10
+ self.model = joblib.load("models/rf_weights.joblib")
11
  self.secondary_model_features = [
12
+ "machine_probability", "backspace_count_normalized", "typing_duration_normalized",
13
+ "letter_discrepancy_normalized", "cosine_sim_gpt35", "cosine_sim_gpt4"
14
  ]
15
 
16
  def preprocess_input(self, secondary_model_features: List[float]) -> np.ndarray:
core-model-prediction/requirements.txt CHANGED
@@ -4,6 +4,10 @@ pandas
4
  textstat==0.7.3
5
  scikit-learn==1.2.2
6
  transformers==4.38.2
 
 
 
 
7
  fastapi
8
  uvicorn
9
  google-cloud-secret-manager
 
4
  textstat==0.7.3
5
  scikit-learn==1.2.2
6
  transformers==4.38.2
7
+ sentence-transformers==2.7.0
8
+ langchain
9
+ openai
10
+ langchain-openai
11
  fastapi
12
  uvicorn
13
  google-cloud-secret-manager
core-model-prediction/scalers/{rf_scaler.joblib → secondary_scaler.joblib} RENAMED
Binary files a/core-model-prediction/scalers/rf_scaler.joblib and b/core-model-prediction/scalers/secondary_scaler.joblib differ
 
core-model-prediction/secondary_model_dependencies.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer, util
2
+ from collections import Counter
3
+ from langchain_openai import ChatOpenAI
4
+ from langchain_core.messages import HumanMessage, SystemMessage
5
+ from google.cloud import secretmanager
6
+
7
+
8
class SecondaryModelDependencies:
    """Compute the input features for the secondary (random forest) model.

    The feature vector combines the main model's probability, typing
    telemetry normalized by answer length, and the cosine similarity between
    the candidate's answer and answers generated by GPT-3.5 and GPT-4 for the
    same question.

    Constructing an instance loads the sentence-embedding model and fetches
    the OpenAI API key from Google Secret Manager, so it performs I/O.
    """

    # Secret Manager resource name of the OpenAI API key.
    OPENAI_SECRET_NAME = "projects/steady-climate-416810/secrets/OPENAI_API_KEY/versions/1"

    # System prompt sent to both GPT models before the interview question.
    SYSTEM_PROMPT = "Please answer the following question based solely on your internal knowledge, without external references. Assume you are the human."

    # Letters considered when comparing key presses against the final text.
    ALPHABET = "abcdefghijklmnopqrstuvwxyz"

    def __init__(self):
        self.text_similarity_model = SentenceTransformer(
            'sentence-transformers/all-mpnet-base-v2')
        api_key = self.access_openai_api_key()
        self.llm_gpt35 = ChatOpenAI(
            api_key=api_key, model="gpt-3.5-turbo")
        self.llm_gpt4 = ChatOpenAI(
            api_key=api_key, model="gpt-4-turbo")

    def access_openai_api_key(self) -> str:
        """Fetch the OpenAI API key from Google Secret Manager.

        Returns:
            The secret payload decoded as UTF-8 text.
        """
        client = secretmanager.SecretManagerServiceClient()
        response = client.access_secret_version(
            request={"name": self.OPENAI_SECRET_NAME})
        return response.payload.data.decode('UTF-8')

    @staticmethod
    def _normalize_by_length(value: float, text: str) -> float:
        """Divide ``value`` by the length of ``text``, treating empty text as length 1."""
        # Same empty-text guard as calculate_letter_discrepancy, so an empty
        # answer no longer raises ZeroDivisionError.
        return value / max(len(text), 1)

    def calculate_features(self, question: str, answer: str, probability: float, backspace_count: int, typing_duration: int, letter_click_counts: dict[str, int]) -> list[float]:
        """Build the feature list for the secondary model.

        Args:
            question: Question that was asked; sent to both GPT models.
            answer: The candidate's answer text.
            probability: Probability produced by the main model.
            backspace_count: Number of backspace presses recorded.
            typing_duration: Recorded typing duration.
            letter_click_counts: Recorded key-press count per letter.

        Returns:
            ``[probability, backspace_count_normalized,
            typing_duration_normalized, letter_discrepancy,
            cosine_sim_gpt35, cosine_sim_gpt4]`` -- the same order as the
            feature names listed in ``RandomForestModel``.
        """
        backspace_count_normalized = self._normalize_by_length(
            backspace_count, answer)
        typing_duration_normalized = self._normalize_by_length(
            typing_duration, answer)
        letter_discrepancy = self.calculate_letter_discrepancy(
            answer, letter_click_counts)

        gpt35_answer = self.generate_gpt35_answer(question)
        gpt4_answer = self.generate_gpt4_answer(question)

        cosine_sim_gpt35 = self.calculate_similarity_gpt35(
            answer, gpt35_answer)
        cosine_sim_gpt4 = self.calculate_similarity_gpt4(answer, gpt4_answer)

        return [
            probability, backspace_count_normalized, typing_duration_normalized,
            letter_discrepancy, cosine_sim_gpt35, cosine_sim_gpt4
        ]

    def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]) -> float:
        """Measure how far the recorded key presses deviate from the final text.

        For each lowercase ASCII letter, the click count is divided by that
        letter's frequency in ``text`` plus one (so letters absent from the
        text do not divide by zero). The ratios are averaged and then
        normalized by the text length, with empty text treated as length 1.

        Args:
            text: The final answer text.
            letter_click_counts: Recorded key-press count per letter.

        Returns:
            The length-normalized average click-to-occurrence ratio.
        """
        # Letter frequencies in the text, case-insensitive.
        text_letter_counts = Counter(text.lower())

        ratios = [
            letter_click_counts.get(letter, 0)
            / (text_letter_counts.get(letter, 0) + 1)
            for letter in self.ALPHABET
        ]

        average_ratio = sum(ratios) / len(ratios)
        return self._normalize_by_length(average_ratio, text)

    def _generate_answer(self, llm, question: str):
        """Ask ``llm`` the question under the shared system prompt and return the reply content."""
        messages = [
            SystemMessage(content=self.SYSTEM_PROMPT),
            HumanMessage(content=question),
        ]
        return llm.invoke(messages).content

    def generate_gpt35_answer(self, question: str):
        """Return GPT-3.5's answer to ``question``."""
        return self._generate_answer(self.llm_gpt35, question)

    def generate_gpt4_answer(self, question: str):
        """Return GPT-4's answer to ``question``."""
        return self._generate_answer(self.llm_gpt4, question)

    def _calculate_similarity(self, answer: str, reference_answer: str) -> float:
        """Return the cosine similarity between the embeddings of two texts."""
        answer_embedding = self.text_similarity_model.encode(
            [answer], convert_to_tensor=True)
        reference_embedding = self.text_similarity_model.encode(
            [reference_answer], convert_to_tensor=True)
        # One text on each side, so the score matrix holds a single value.
        cosine_scores = util.cos_sim(answer_embedding, reference_embedding)
        return cosine_scores.item()

    def calculate_similarity_gpt35(self, answer: str, gpt35_answer: str) -> float:
        """Return the cosine similarity between ``answer`` and the GPT-3.5 answer."""
        return self._calculate_similarity(answer, gpt35_answer)

    def calculate_similarity_gpt4(self, answer: str, gpt4_answer: str) -> float:
        """Return the cosine similarity between ``answer`` and the GPT-4 answer."""
        return self._calculate_similarity(answer, gpt4_answer)