Muhammad541 committed
Commit e069344 · verified · 1 Parent(s): 6cba13b

Update app.py

Files changed (1)
  1. app.py +51 -50
app.py CHANGED
@@ -11,7 +11,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer
  from flask import Flask, request, jsonify
  import logging
  from pymongo import MongoClient
- import requests
 
  # Set up logging
  logging.basicConfig(level=logging.INFO)
@@ -23,7 +22,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
  # MongoDB connection
  MONGO_URI = "mongodb://muhammadbinimran1001:[email protected]:27017,dsm-shard-00-01.inrzs.mongodb.net:27017,dsm-shard-00-02.inrzs.mongodb.net:27017/?ssl=true&replicaSet=atlas-nbg4er-shard-0&authSource=admin&retryWrites=true&w=majority"
  client = MongoClient(MONGO_URI)
- db = client.get_database("test") # Adjust the database name as needed
+ db = client.get_database("test")
  users_collection = db["users"]
  courses_collection = db["courses"]
  jobs_collection = db["jobs"]
@@ -62,7 +61,9 @@ course_similarity = None
  job_similarity = None
 
  # Improved dataset loading with fallback
- def load_dataset(file_path, required_columns=[], additional_columns=['popularity', 'completion_rate'], fallback_data=None):
+ def load_dataset(file_path, required_columns=None, additional_columns=None, fallback_data=None):
+     required_columns = required_columns or ["Skill", "Question", "Answer"]
+     additional_columns = additional_columns or ['popularity', 'completion_rate']
      try:
          df = pd.read_csv(file_path)
          missing_required = [col for col in required_columns if col not in df.columns]
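The new signature replaces the mutable list defaults with None and rebuilds the lists inside the function, avoiding Python's shared-default pitfall. A minimal illustrative sketch of that pitfall (names invented, not from app.py):

def broken(columns=[]):
    # The default list is built once at definition time and shared by every call.
    columns.append("Skill")
    return columns

def fixed(columns=None):
    # A new list is built on each call, mirroring the pattern now used in load_dataset.
    columns = columns or []
    columns.append("Skill")
    return columns

print(broken())  # ['Skill']
print(broken())  # ['Skill', 'Skill'] -- the shared default has leaked state
print(fixed())   # ['Skill']
print(fixed())   # ['Skill'] -- each call starts from a fresh list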
@@ -76,35 +77,21 @@ def load_dataset(file_path, required_columns=[], additional_columns=['popularity
          if missing_additional:
              logger.warning(f"Additional columns {missing_additional} missing in {file_path}. Adding default values.")
              for col in missing_additional:
-                 if col == 'popularity':
-                     df[col] = 0.8
-                 elif col == 'completion_rate':
-                     df[col] = 0.7
-                 else:
-                     df[col] = 0.0
+                 df[col] = 0.8 if col == 'popularity' else 0.7 if col == 'completion_rate' else 0.0
 
-         if 'level' in df.columns:
-             df['level'] = df['level'].apply(lambda x: 'Intermediate' if pd.isna(x) or x.strip() == "" else x)
-         else:
+         if 'level' not in df.columns:
              logger.warning(f"'level' column missing in {file_path}. Adding default 'Intermediate'.")
              df['level'] = 'Intermediate'
+         else:
+             df['level'] = df['level'].fillna('Intermediate')
 
          return df
-     except ValueError as ve:
-         logger.error(f"ValueError loading {file_path}: {ve}. Using fallback data.")
-         if fallback_data is not None:
-             logger.info(f"Using fallback data for {file_path}")
-             return pd.DataFrame(fallback_data)
-         return None
      except Exception as e:
          logger.error(f"Error loading {file_path}: {e}. Using fallback data.")
-         if fallback_data is not None:
-             logger.info(f"Using fallback data for {file_path}")
-             return pd.DataFrame(fallback_data)
-         return None
+         return pd.DataFrame(fallback_data) if fallback_data is not None else None
 
  # Load datasets with fallbacks
- questions_df = load_dataset("Generated_Skill-Based_Questions.csv", ["Skill", "Question", "Answer"], [], {
+ questions_df = load_dataset("Generated_Skill-Based_Questions.csv", fallback_data={
      'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
      'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
                   'Intermediate Python question', 'Basic Kubernetes question'],
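Note that the new fillna('Intermediate') only fills genuine NaN values, while the removed lambda also normalised blank strings. A small illustration of the difference, on invented sample data:

import pandas as pd

df = pd.DataFrame({"level": ["Advanced", None, ""]})

# Behaviour after this commit: only missing values are filled.
print(df["level"].fillna("Intermediate").tolist())
# ['Advanced', 'Intermediate', '']

# Behaviour before this commit: blank strings were normalised as well.
print(df["level"].apply(
    lambda x: "Intermediate" if pd.isna(x) or str(x).strip() == "" else x
).tolist())
# ['Advanced', 'Intermediate', 'Intermediate']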
@@ -127,28 +114,28 @@ def load_universal_model():
          if os.path.exists(UNIVERSAL_MODEL_PATH):
              logger.info(f"Loading universal model from {UNIVERSAL_MODEL_PATH}")
              return SentenceTransformer(UNIVERSAL_MODEL_PATH)
-         else:
-             logger.info(f"Loading universal model: {default_model}")
-             model = SentenceTransformer(default_model)
-             model.save(UNIVERSAL_MODEL_PATH)
-             return model
+         logger.info(f"Loading universal model: {default_model}")
+         model = SentenceTransformer(default_model)
+         model.save(UNIVERSAL_MODEL_PATH)
+         return model
      except Exception as e:
          logger.error(f"Failed to load universal model {default_model}: {e}. Exiting.")
          exit(1)
 
  universal_model = load_universal_model()
 
- if os.path.exists(DETECTOR_MODEL_PATH):
-     detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
-     detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH)
- else:
-     detector_tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
-     detector_model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
+ try:
+     detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH if os.path.exists(DETECTOR_MODEL_PATH) else "roberta-base-openai-detector")
+     detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH if os.path.exists(DETECTOR_MODEL_PATH) else "roberta-base-openai-detector")
+ except Exception as e:
+     logger.error(f"Failed to load detector model: {e}. Exiting.")
+     exit(1)
 
  # Load Precomputed Resources
  def load_precomputed_resources():
      global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
-     if all(os.path.exists(p) for p in [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH, ANSWER_EMBEDDINGS_PATH, COURSE_SIMILARITY_PATH, JOB_SIMILARITY_PATH]):
+     paths = [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH, ANSWER_EMBEDDINGS_PATH, COURSE_SIMILARITY_PATH, JOB_SIMILARITY_PATH]
+     if all(os.path.exists(p) for p in paths):
          try:
              with open(TFIDF_PATH, 'rb') as f: tfidf_vectorizer = pickle.load(f)
              with open(SKILL_TFIDF_PATH, 'rb') as f: skill_tfidf = pickle.load(f)
@@ -180,6 +167,10 @@ def precompute_resources():
      faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
      faiss_index.add(answer_embeddings)
 
+     # Initialize course_similarity and job_similarity as empty dicts if not available
+     course_similarity = course_similarity or {}
+     job_similarity = job_similarity or {}
+
      with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
      with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
      with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
@@ -200,13 +191,13 @@ def evaluate_response(args):
      if not user_answer:
          return skill, 0.0, False
 
-     inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
+     inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512, padding=True)
      with torch.no_grad():
          logits = detector_model(**inputs).logits
      probs = scipy.special.softmax(logits, axis=1).tolist()[0]
      is_ai = probs[1] > 0.5
 
-     user_embedding = universal_model.encode([user_answer], batch_size=128, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")[0]
+     user_embedding = universal_model.encode([user_answer], batch_size=1, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")[0]
      expected_embedding = torch.tensor(answer_embeddings[question_idx])
      score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
 
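For context, this scoring step compares a fresh embedding of the user's answer against a precomputed answer embedding and rescales the cosine similarity to a percentage. A standalone sketch of the same idea (model name and texts are placeholders, not taken from the commit):

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder; app.py reuses its cached universal_model

user_answer = "git rebase replays commits on top of another base branch."
expected_answer = "Rebasing moves a series of commits to a new base commit."

user_emb = model.encode(user_answer, convert_to_tensor=True)
expected_emb = model.encode(expected_answer, convert_to_tensor=True)

score = util.pytorch_cos_sim(user_emb, expected_emb).item() * 100
print(f"{score:.2f}%")  # cosine similarity rescaled to a 0-100 style score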
 
@@ -224,7 +215,8 @@
  def get_questions_for_skills(skills):
      user_questions = []
      for skill in skills:
-         skill_questions = questions_df[questions_df['Skill'] == skill]
+         skill = skill.strip().capitalize()  # Standardize skill format
+         skill_questions = questions_df[questions_df['Skill'].str.capitalize() == skill]
          if not skill_questions.empty:
              user_questions.append(skill_questions.sample(1).iloc[0].to_dict())
          else:
@@ -246,7 +238,7 @@ def recommend_courses_from_mongo(skills_to_improve, user_level, upgrade=False):
              "skills": {"$in": skills_to_improve},
              "category": {"$regex": target_level, "$options": "i"}
          }
-         courses = courses_collection.find(query).limit(3)
+         courses = list(courses_collection.find(query).limit(3))
          return [{"title": course["title"], "provider": course.get("provider", "Unknown")} for course in courses]
      except Exception as e:
          logger.error(f"Course recommendation error: {e}")
@@ -262,7 +254,7 @@ def recommend_jobs_from_mongo(user_skills, user_level):
              "skills": {"$in": user_skills},
              "status": "active"
          }
-         jobs = jobs_collection.find(query).limit(5)
+         jobs = list(jobs_collection.find(query).limit(5))
          return [{"jobTitle": job["jobTitle"], "companyName": job["companyName"], "location": job.get("location", "Remote")} for job in jobs]
      except Exception as e:
          logger.error(f"Job recommendation error: {e}")
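Wrapping the query result in list(...) here and in the course helper matters because a PyMongo cursor is lazy and exhausted after one pass; materialising it yields a reusable plain list. A small sketch with an illustrative URI and data:

from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")  # placeholder URI
jobs_collection = client["test"]["jobs"]

cursor = jobs_collection.find({"status": "active"}).limit(5)
first = [job["jobTitle"] for job in cursor]
second = [job["jobTitle"] for job in cursor]   # empty: the cursor is already exhausted

jobs = list(jobs_collection.find({"status": "active"}).limit(5))
print(len(first), len(second), len(jobs))      # the materialised list can be reused safely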
@@ -283,6 +275,9 @@ def get_questions():
          return jsonify({"error": "Missing skills field"}), 400
 
      user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
+     if not user_skills:
+         return jsonify({"error": "No valid skills provided"}), 400
+
      load_precomputed_resources()
      questions = get_questions_for_skills(user_skills)
      return jsonify({"questions": questions})
@@ -302,7 +297,7 @@ def assess_skills():
      answers = [a.strip() for a in data['answers'] if isinstance(a, str)]
      user_level = data.get('user_level', 'Intermediate').strip()
 
-     if len(answers) != len(user_skills):
+     if not user_skills or len(answers) != len(user_skills):
          return jsonify({"error": "Answers count must match skills count"}), 400
 
      load_precomputed_resources()
@@ -315,8 +310,12 @@ def assess_skills():
          if not answer or answer.lower() == 'skip':
              user_responses.append((row['Skill'], None, None))
          else:
-             question_idx = questions_df.index[questions_df['Question'] == row['Question']][0]
-             user_responses.append((row['Skill'], answer, question_idx))
+             question_idx = questions_df.index[questions_df['Question'] == row['Question']].tolist()
+             if not question_idx:
+                 logger.warning(f"Question not found in dataset: {row['Question']}")
+                 user_responses.append((row['Skill'], None, None))
+                 continue
+             user_responses.append((row['Skill'], answer, question_idx[0]))
 
      results = [evaluate_response(response) for response in user_responses]
 
@@ -331,18 +330,20 @@ def assess_skills():
          else:
              user_scores[skill] = score
              ai_flags[skill] = is_ai
-             scores_list.append(score)
+             if score > 0:
+                 scores_list.append(score)
 
      # Update user profile with scores
      skill_scores = [{"skill": skill, "score": score} for skill, score, _ in results if score > 0]
      users_collection.update_one(
          {"_id": user_id},
-         {"$set": {"skillScores": skill_scores}}
+         {"$set": {"skillScores": skill_scores}},
+         upsert=True
      )
 
      mean_score = np.mean(scores_list) if scores_list else 50
      dynamic_threshold = max(40, mean_score)
-     weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
+     weak_skills = [skill for skill, score in user_scores.items() if score > 0 and score < dynamic_threshold]
 
      courses = recommend_courses_from_mongo(weak_skills or user_skills, user_level, upgrade=not weak_skills)
      jobs = recommend_jobs_from_mongo(user_skills, user_level)
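A worked example of the adjusted threshold logic, with invented scores, showing why skipped answers (score 0) no longer count as weak skills:

import numpy as np

user_scores = {"Git": 72.5, "Linux": 38.0, "Python": 55.0, "Kubernetes": 0.0}  # 0.0 = skipped
scores_list = [s for s in user_scores.values() if s > 0]

mean_score = np.mean(scores_list) if scores_list else 50   # about 55.17
dynamic_threshold = max(40, mean_score)                     # about 55.17

weak_skills = [skill for skill, score in user_scores.items()
               if score > 0 and score < dynamic_threshold]
print(weak_skills)  # ['Linux', 'Python'] -- Kubernetes is treated as skipped, not weak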
@@ -353,12 +354,12 @@ def assess_skills():
              {
                  "skill": skill,
                  "progress": f"{'■' * int(score//10)}{'-' * (10 - int(score//10))}",
-                 "score": f"{score:.2f} %",
+                 "score": f"{score:.2f}%",
                  "origin": "AI-Generated" if is_ai else "Human-Written"
              } for skill, score, is_ai in results
          ],
-         "mean_score": mean_score,
-         "dynamic_threshold": dynamic_threshold,
+         "mean_score": round(mean_score, 2),
+         "dynamic_threshold": round(dynamic_threshold, 2),
          "weak_skills": weak_skills,
          "skipped_questions": skipped_questions
      },
@@ -370,4 +371,4 @@ def assess_skills():
          return jsonify({"error": "Internal server error"}), 500
 
  if __name__ == '__main__':
-     app.run(host='0.0.0.0', port=7860, threaded=True)
+     app.run(host='0.0.0.0', port=7860)
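The entry point still binds to port 7860; only threaded=True is dropped. Below is a hypothetical client call against the assessment endpoint: the route path and the user_id field are guesses, since the @app.route decorators and request parsing sit outside the hunks shown here, while skills, answers and user_level mirror what assess_skills() reads.

import requests  # client-side only; the unused top-level import was removed from app.py

payload = {
    "user_id": "demo-user",                       # hypothetical; assess_skills updates users by _id
    "skills": ["Git", "Linux"],
    "answers": ["Use git rebase to rewrite history.", "skip"],
    "user_level": "Intermediate",
}
resp = requests.post("http://localhost:7860/assess", json=payload, timeout=30)  # route path assumed
print(resp.status_code, resp.json())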
 