Yakobus Iryanto Prasethio committed on
Commit
caa781d
·
unverified ·
2 Parent(s): 57db935 6b1f3cb

Merge pull request #13 from Sistem-Cerdas-Recruitment/production

Browse files
core-model-prediction/models/rf_weights.joblib DELETED
Binary file (228 kB)
 
core-model-prediction/models/secondary_weights.joblib ADDED
Binary file (38.5 kB). View file
 
core-model-prediction/prediction.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, Response, status
2
  from pydantic import BaseModel
3
  from hypothesis import BaseModelHypothesis
4
  from secondary_model_dependencies import SecondaryModelDependencies
5
- from random_forest_model import RandomForestModel
6
  from main_model import PredictMainModel
7
  import numpy as np
8
  from typing import List
@@ -11,13 +11,10 @@ app = FastAPI()
11
 
12
 
13
  class PredictRequest(BaseModel):
14
- question: str
15
  answer: str
16
  backspace_count: int
17
- typing_duration: int
18
  letter_click_counts: dict[str, int]
19
- gpt35_answer: str
20
- gpt4_answer: str
21
 
22
 
23
  class RequestModel(BaseModel):
@@ -36,13 +33,10 @@ async def predict(request: RequestModel):
36
 
37
 
38
  def process_instance(data: PredictRequest):
39
- question = data.question
40
  answer = data.answer
41
  backspace_count = data.backspace_count
42
- typing_duration = data.typing_duration
43
  letter_click_counts = data.letter_click_counts
44
- gpt35_answer = data.gpt35_answer
45
- gpt4_answer = data.gpt4_answer
46
 
47
  # Data preparation for 1st model
48
  hypothesis = BaseModelHypothesis()
@@ -56,26 +50,28 @@ def process_instance(data: PredictRequest):
56
  # Data preparation for 2nd model
57
  secondary_model_dependencies = SecondaryModelDependencies()
58
  secondary_model_features = secondary_model_dependencies.calculate_features(
59
- question, answer, main_model_probability, backspace_count, typing_duration,
60
- letter_click_counts, gpt35_answer, gpt4_answer)
61
 
62
  # 2nd model prediction
63
- secondary_model = RandomForestModel()
64
- secondary_model_prediction = secondary_model.predict(
65
  secondary_model_features)
66
 
 
 
67
  return {
68
- "predicted_class": "AI" if secondary_model_prediction == 1 else "HUMAN",
69
  "main_model_probability": str(main_model_probability),
70
- "secondary_model_prediction": secondary_model_prediction,
71
- "confidence": get_confidence(main_model_probability, secondary_model_prediction)
72
  }
73
 
74
 
75
- def get_confidence(main_model_output: float, secondary_model_output: int):
76
- if (main_model_output >= 0.8 and secondary_model_output == 1) or (main_model_output <= 0.2 and secondary_model_output == 0):
77
  return 'High Confidence'
78
- elif (0.5 < main_model_output < 0.8 and secondary_model_output == 1) or (0.2 < main_model_output <= 0.5 and secondary_model_output == 0):
79
  return 'Partially Confident'
80
  else:
81
  return 'Low Confidence'
 
2
  from pydantic import BaseModel
3
  from hypothesis import BaseModelHypothesis
4
  from secondary_model_dependencies import SecondaryModelDependencies
5
+ from secondary_model import SecondaryModel
6
  from main_model import PredictMainModel
7
  import numpy as np
8
  from typing import List
 
11
 
12
 
13
class PredictRequest(BaseModel):
    """Schema for one answer instance submitted for AI/HUMAN prediction."""
    # The candidate's typed answer text.
    answer: str
    # Number of backspace presses recorded while typing the answer
    # (presumably captured client-side — verify against caller).
    backspace_count: int
    # Per-letter click counts for the answer, keyed by letter.
    letter_click_counts: dict[str, int]
    # Reference answer generated by GPT-4o, used for cosine-similarity features.
    gpt4o_answer: str
 
18
 
19
 
20
  class RequestModel(BaseModel):
 
33
 
34
 
35
  def process_instance(data: PredictRequest):
 
36
  answer = data.answer
37
  backspace_count = data.backspace_count
 
38
  letter_click_counts = data.letter_click_counts
39
+ gpt4o_answer = data.gpt4o_answer
 
40
 
41
  # Data preparation for 1st model
42
  hypothesis = BaseModelHypothesis()
 
50
  # Data preparation for 2nd model
51
  secondary_model_dependencies = SecondaryModelDependencies()
52
  secondary_model_features = secondary_model_dependencies.calculate_features(
53
+ answer, main_model_probability, backspace_count,
54
+ letter_click_counts, gpt4o_answer)
55
 
56
  # 2nd model prediction
57
+ secondary_model = SecondaryModel()
58
+ secondary_model_probability = secondary_model.predict(
59
  secondary_model_features)
60
 
61
+ second_model_threshold = 0.54
62
+
63
  return {
64
+ "predicted_class": "AI" if secondary_model_probability > second_model_threshold else "HUMAN",
65
  "main_model_probability": str(main_model_probability),
66
+ "secondary_model_probability": str(secondary_model_probability),
67
+ "confidence": get_confidence(main_model_probability, secondary_model_probability, second_model_threshold)
68
  }
69
 
70
 
71
def get_confidence(main_model_output: float, secondary_model_output: float, threshold: float):
    """Map the two model outputs to a qualitative confidence label.

    Args:
        main_model_output: Probability from the main model, in [0, 1].
        secondary_model_output: Probability from the secondary model, in [0, 1].
            NOTE(review): this was annotated ``int``, but callers pass the float
            returned by ``SecondaryModel.predict``; annotation fixed to ``float``.
        threshold: Decision threshold applied to the secondary model output.

    Returns:
        'High Confidence' when both models strongly agree, 'Partially
        Confident' when they agree but the main model is less decisive,
        otherwise 'Low Confidence'.
    """
    # Both models firmly agree: AI (both high) or HUMAN (main low, secondary
    # at or below the mirrored threshold 1 - threshold).
    if (main_model_output >= 0.8 and secondary_model_output >= threshold) or (main_model_output <= 0.2 and secondary_model_output <= 1 - threshold):
        return 'High Confidence'
    # Same direction, but the main model probability is only moderately decisive.
    elif (0.5 < main_model_output < 0.8 and secondary_model_output >= threshold) or (0.2 < main_model_output <= 0.5 and secondary_model_output < threshold):
        return 'Partially Confident'
    else:
        return 'Low Confidence'
core-model-prediction/scalers/secondary_scaler.joblib CHANGED
Binary files a/core-model-prediction/scalers/secondary_scaler.joblib and b/core-model-prediction/scalers/secondary_scaler.joblib differ
 
core-model-prediction/{random_forest_model.py → secondary_model.py} RENAMED
@@ -4,21 +4,21 @@ import pandas as pd
4
  from typing import List
5
 
6
 
7
- class RandomForestModel:
8
  def __init__(self):
9
  self.scaler = joblib.load("scalers/secondary_scaler.joblib")
10
- self.model = joblib.load("models/rf_weights.joblib")
11
  self.secondary_model_features = [
12
- "machine_probability", "backspace_count_normalized", "typing_duration_normalized",
13
- "letter_discrepancy_normalized", "cosine_sim_gpt35", "cosine_sim_gpt4"
14
  ]
15
 
16
- def preprocess_input(self, secondary_model_features: List[float]) -> np.ndarray:
17
  features_df = pd.DataFrame(
18
  [secondary_model_features], columns=self.secondary_model_features)
19
  features_df[self.secondary_model_features] = self.scaler.transform(
20
  features_df[self.secondary_model_features])
21
- return features_df.values.astype(np.float32).reshape(1, -1)
22
 
23
- def predict(self, secondary_model_features: List[float]):
24
- return int(self.model.predict(self.preprocess_input(secondary_model_features))[0])
 
4
  from typing import List
5
 
6
 
7
class SecondaryModel:
    """Wraps the persisted secondary classifier and its feature scaler.

    Loads the fitted scaler and model weights from disk, scales an incoming
    feature vector, and exposes the positive-class probability.
    """

    def __init__(self):
        # Fitted artifacts are loaded relative to the process working directory.
        self.scaler = joblib.load("scalers/secondary_scaler.joblib")
        self.model = joblib.load("models/secondary_weights.joblib")
        # Column order must match the order used when the scaler/model were fit.
        self.secondary_model_features = [
            "machine_probability", "backspace_count_normalized",
            "letter_discrepancy_normalized", "cosine_sim_gpt4o"
        ]

    def preprocess_input(self, secondary_model_features: List[float]) -> pd.DataFrame:
        """Return a single-row DataFrame with the feature columns scaled."""
        columns = self.secondary_model_features
        frame = pd.DataFrame([secondary_model_features], columns=columns)
        frame[columns] = self.scaler.transform(frame[columns])
        return frame

    def predict(self, secondary_model_features: List[float]) -> float:
        """Probability of the positive (last) class for one feature vector."""
        prepared = self.preprocess_input(secondary_model_features)
        probabilities = self.model.predict_proba(prepared)
        # predict_proba yields shape (1, n_classes); take the last class column.
        return probabilities[:, -1][0]
core-model-prediction/secondary_model_dependencies.py CHANGED
@@ -7,20 +7,16 @@ class SecondaryModelDependencies:
7
  self.text_similarity_model = SentenceTransformer(
8
  'sentence-transformers/all-mpnet-base-v2')
9
 
10
- def calculate_features(self, question: str, answer: str, probability: float, backspace_count: int, typing_duration: int,
11
- letter_click_counts: dict[str, int], gpt35_answer: str, gpt4_answer: str):
12
  backspace_count_normalized = backspace_count / len(answer)
13
- typing_duration_normalized = typing_duration / len(answer)
14
  letter_discrepancy = self.calculate_letter_discrepancy(
15
  answer, letter_click_counts)
16
-
17
- cosine_sim_gpt35 = self.calculate_similarity_gpt35(
18
- answer, gpt35_answer)
19
- cosine_sim_gpt4 = self.calculate_similarity_gpt4(answer, gpt4_answer)
20
 
21
  return [
22
- probability, backspace_count_normalized, typing_duration_normalized,
23
- letter_discrepancy, cosine_sim_gpt35, cosine_sim_gpt4
24
  ]
25
 
26
  def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]):
@@ -38,18 +34,10 @@ class SecondaryModelDependencies:
38
 
39
  return discrepancy_ratio_normalized
40
 
41
- def calculate_similarity_gpt35(self, answer: str, gpt35_answer: str) -> float:
42
- embedding1 = self.text_similarity_model.encode(
43
- [answer], convert_to_tensor=True)
44
- embedding2 = self.text_similarity_model.encode(
45
- [gpt35_answer], convert_to_tensor=True)
46
- cosine_scores = util.cos_sim(embedding1, embedding2)
47
- return cosine_scores.item()
48
-
49
- def calculate_similarity_gpt4(self, answer: str, gpt4_answer: str) -> float:
50
  embedding1 = self.text_similarity_model.encode(
51
  [answer], convert_to_tensor=True)
52
  embedding2 = self.text_similarity_model.encode(
53
- [gpt4_answer], convert_to_tensor=True)
54
  cosine_scores = util.cos_sim(embedding1, embedding2)
55
  return cosine_scores.item()
 
7
  self.text_similarity_model = SentenceTransformer(
8
  'sentence-transformers/all-mpnet-base-v2')
9
 
10
def calculate_features(self, answer: str, probability: float, backspace_count: int,
                       letter_click_counts: dict[str, int], gpt4o_answer: str):
    """Assemble the secondary model's feature vector for one answer.

    Returns ``[probability, backspace_count_normalized, letter_discrepancy,
    cosine_sim_gpt4o]`` in the column order the secondary model expects.
    """
    # Normalize backspace usage by answer length so long answers compare fairly.
    normalized_backspaces = backspace_count / len(answer)
    discrepancy = self.calculate_letter_discrepancy(answer, letter_click_counts)
    similarity = self.calculate_similarity_gpt4o(answer, gpt4o_answer)

    return [probability, normalized_backspaces, discrepancy, similarity]
21
 
22
  def calculate_letter_discrepancy(self, text: str, letter_click_counts: dict[str, int]):
 
34
 
35
  return discrepancy_ratio_normalized
36
 
37
def calculate_similarity_gpt4o(self, answer: str, gpt4o_answer: str) -> float:
    """Cosine similarity between the answer and the GPT-4o reference answer."""
    encode = self.text_similarity_model.encode
    answer_embedding = encode([answer], convert_to_tensor=True)
    reference_embedding = encode([gpt4o_answer], convert_to_tensor=True)
    # util.cos_sim returns a 1x1 tensor; .item() unwraps the Python float.
    return util.cos_sim(answer_embedding, reference_embedding).item()
  return cosine_scores.item()