Update app.py
app.py CHANGED
@@ -11,7 +11,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from flask import Flask, request, jsonify
 import logging
 from pymongo import MongoClient
-import requests
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -23,7 +22,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 # MongoDB connection
 MONGO_URI = "mongodb://muhammadbinimran1001:[email protected]:27017,dsm-shard-00-01.inrzs.mongodb.net:27017,dsm-shard-00-02.inrzs.mongodb.net:27017/?ssl=true&replicaSet=atlas-nbg4er-shard-0&authSource=admin&retryWrites=true&w=majority"
 client = MongoClient(MONGO_URI)
-db = client.get_database("test")
+db = client.get_database("test")
 users_collection = db["users"]
 courses_collection = db["courses"]
 jobs_collection = db["jobs"]
@@ -62,7 +61,9 @@ course_similarity = None
 job_similarity = None
 
 # Improved dataset loading with fallback
-def load_dataset(file_path, required_columns=[], additional_columns=['popularity', 'completion_rate'], fallback_data=None):
+def load_dataset(file_path, required_columns=None, additional_columns=None, fallback_data=None):
+    required_columns = required_columns or ["Skill", "Question", "Answer"]
+    additional_columns = additional_columns or ['popularity', 'completion_rate']
     try:
         df = pd.read_csv(file_path)
         missing_required = [col for col in required_columns if col not in df.columns]
@@ -76,35 +77,21 @@ def load_dataset(file_path, required_columns=[], additional_columns=['popularity', 'completion_rate'], fallback_data=None):
         if missing_additional:
             logger.warning(f"Additional columns {missing_additional} missing in {file_path}. Adding default values.")
             for col in missing_additional:
-                if col == 'popularity':
-                    df[col] = 0.8
-                elif col == 'completion_rate':
-                    df[col] = 0.7
-                else:
-                    df[col] = 0.0
+                df[col] = 0.8 if col == 'popularity' else 0.7 if col == 'completion_rate' else 0.0
 
-        if 'level' in df.columns:
-            df['level'] = df['level'].apply(lambda x: 'Intermediate' if pd.isna(x) or x.strip() == "" else x)
-        else:
+        if 'level' not in df.columns:
             logger.warning(f"'level' column missing in {file_path}. Adding default 'Intermediate'.")
             df['level'] = 'Intermediate'
+        else:
+            df['level'] = df['level'].fillna('Intermediate')
 
         return df
-    except ValueError as ve:
-        logger.error(f"ValueError loading {file_path}: {ve}. Using fallback data.")
-        if fallback_data is not None:
-            logger.info(f"Using fallback data for {file_path}")
-            return pd.DataFrame(fallback_data)
-        return None
     except Exception as e:
         logger.error(f"Error loading {file_path}: {e}. Using fallback data.")
-        if fallback_data is not None:
-            logger.info(f"Using fallback data for {file_path}")
-            return pd.DataFrame(fallback_data)
-        return None
+        return pd.DataFrame(fallback_data) if fallback_data is not None else None
 
 # Load datasets with fallbacks
-questions_df = load_dataset("Generated_Skill-Based_Questions.csv", required_columns=["Skill", "Question", "Answer"], fallback_data={
+questions_df = load_dataset("Generated_Skill-Based_Questions.csv", fallback_data={
     'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
     'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
                  'Intermediate Python question', 'Basic Kubernetes question'],
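Note: the rewritten load_dataset collapses the per-column branching into one conditional expression and funnels every failure into a single fallback return. One behavioral nuance: fillna only replaces NaN, while the old apply also replaced whitespace-only strings, so blank but non-null levels now pass through unchanged. A minimal sketch of the pattern, with illustrative file and column names rather than the app's exact code:

    import logging

    import pandas as pd

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def load_with_fallback(path, fallback_rows=None):
        """Load a CSV, defaulting missing levels; fall back to in-memory rows on any error."""
        try:
            df = pd.read_csv(path)
            if 'level' in df.columns:
                df['level'] = df['level'].fillna('Intermediate')  # same trick as the diff
            else:
                df['level'] = 'Intermediate'
            return df
        except Exception as e:
            logger.error(f"Error loading {path}: {e}. Using fallback data.")
            return pd.DataFrame(fallback_rows) if fallback_rows is not None else None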
@@ -127,28 +114,28 @@ def load_universal_model():
         if os.path.exists(UNIVERSAL_MODEL_PATH):
             logger.info(f"Loading universal model from {UNIVERSAL_MODEL_PATH}")
             return SentenceTransformer(UNIVERSAL_MODEL_PATH)
-
-
-
-
-        return model
+        logger.info(f"Loading universal model: {default_model}")
+        model = SentenceTransformer(default_model)
+        model.save(UNIVERSAL_MODEL_PATH)
+        return model
     except Exception as e:
         logger.error(f"Failed to load universal model {default_model}: {e}. Exiting.")
         exit(1)
 
 universal_model = load_universal_model()
 
-
-detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH)
-detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH)
-
-
-
+try:
+    detector_tokenizer = AutoTokenizer.from_pretrained(DETECTOR_MODEL_PATH if os.path.exists(DETECTOR_MODEL_PATH) else "roberta-base-openai-detector")
+    detector_model = AutoModelForSequenceClassification.from_pretrained(DETECTOR_MODEL_PATH if os.path.exists(DETECTOR_MODEL_PATH) else "roberta-base-openai-detector")
+except Exception as e:
+    logger.error(f"Failed to load detector model: {e}. Exiting.")
+    exit(1)
 
 # Load Precomputed Resources
 def load_precomputed_resources():
     global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
-
+    paths = [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH, ANSWER_EMBEDDINGS_PATH, COURSE_SIMILARITY_PATH, JOB_SIMILARITY_PATH]
+    if all(os.path.exists(p) for p in paths):
     try:
         with open(TFIDF_PATH, 'rb') as f: tfidf_vectorizer = pickle.load(f)
         with open(SKILL_TFIDF_PATH, 'rb') as f: skill_tfidf = pickle.load(f)
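Note: the universal model is now cached locally after the first download, and the detector falls back to the roberta-base-openai-detector Hub id when no local copy exists. A minimal sketch of the cache-or-download pattern, assuming sentence-transformers; "all-MiniLM-L6-v2" is an illustrative default, since the diff does not show the app's default_model:

    import os

    from sentence_transformers import SentenceTransformer

    def load_cached_model(cache_path, default_model="all-MiniLM-L6-v2"):
        # Reuse the local copy when present; otherwise download once and save it,
        # so later Space restarts skip the network fetch.
        if os.path.exists(cache_path):
            return SentenceTransformer(cache_path)
        model = SentenceTransformer(default_model)
        model.save(cache_path)
        return model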
@@ -180,6 +167,10 @@ def precompute_resources():
     faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
     faiss_index.add(answer_embeddings)
 
+    # Initialize course_similarity and job_similarity as empty dicts if not available
+    course_similarity = course_similarity or {}
+    job_similarity = job_similarity or {}
+
     with open(TFIDF_PATH, 'wb') as f: pickle.dump(tfidf_vectorizer, f)
     with open(SKILL_TFIDF_PATH, 'wb') as f: pickle.dump(skill_tfidf, f)
     with open(QUESTION_ANSWER_PATH, 'wb') as f: pickle.dump(question_to_answer, f)
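For reference, IndexFlatL2 is an exact brute-force index, so vectors can be added without a training step. A toy sketch with random vectors standing in for the answer embeddings:

    import faiss
    import numpy as np

    embeddings = np.random.rand(10, 384).astype("float32")  # stand-in for answer_embeddings
    index = faiss.IndexFlatL2(embeddings.shape[1])           # exact L2 search over 384 dims
    index.add(embeddings)                                    # flat indexes need no training
    distances, ids = index.search(embeddings[:1], 3)         # 3 nearest stored answers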
@@ -200,13 +191,13 @@ def evaluate_response(args):
     if not user_answer:
         return skill, 0.0, False
 
-    inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512)
+    inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512, padding=True)
     with torch.no_grad():
         logits = detector_model(**inputs).logits
     probs = scipy.special.softmax(logits, axis=1).tolist()[0]
     is_ai = probs[1] > 0.5
 
-    user_embedding = universal_model.encode([user_answer], batch_size=1)
+    user_embedding = universal_model.encode([user_answer], batch_size=1, convert_to_tensor=True, device="cuda" if torch.cuda.is_available() else "cpu")[0]
     expected_embedding = torch.tensor(answer_embeddings[question_idx])
     score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
 
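With convert_to_tensor=True, encode returns a tensor on the selected device, which is what lets util.pytorch_cos_sim consume it directly. A standalone sketch of the scoring step, assuming sentence-transformers (the model id is illustrative):

    import torch
    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("all-MiniLM-L6-v2")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    user_emb = model.encode(["my answer"], convert_to_tensor=True, device=device)[0]
    expected_emb = model.encode(["reference answer"], convert_to_tensor=True, device=device)[0]
    score = util.pytorch_cos_sim(user_emb, expected_emb).item() * 100  # 0-100 scale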
@@ -224,7 +215,8 @@ def evaluate_response(args):
 def get_questions_for_skills(skills):
     user_questions = []
     for skill in skills:
-        skill_questions = questions_df[questions_df['Skill'] == skill]
+        skill = skill.strip().capitalize()  # Standardize skill format
+        skill_questions = questions_df[questions_df['Skill'].str.capitalize() == skill]
         if not skill_questions.empty:
             user_questions.append(skill_questions.sample(1).iloc[0].to_dict())
         else:
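Both sides of the comparison are capitalized, so "python", "PYTHON", and "Python" select the same rows; str.capitalize() lowercases everything after the first character, which is harmless here because it is applied symmetrically. A quick illustration with toy rows:

    import pandas as pd

    questions_df = pd.DataFrame({"Skill": ["Python", "Git"], "Question": ["Q1", "Q2"]})
    skill = "  pYTHON ".strip().capitalize()                  # -> "Python"
    matches = questions_df[questions_df["Skill"].str.capitalize() == skill]
    print(matches.sample(1).iloc[0].to_dict())                # one random question for the skill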
@@ -246,7 +238,7 @@ def recommend_courses_from_mongo(skills_to_improve, user_level, upgrade=False):
             "skills": {"$in": skills_to_improve},
             "category": {"$regex": target_level, "$options": "i"}
         }
-        courses = courses_collection.find(query).limit(3)
+        courses = list(courses_collection.find(query).limit(3))
         return [{"title": course["title"], "provider": course.get("provider", "Unknown")} for course in courses]
     except Exception as e:
         logger.error(f"Course recommendation error: {e}")
@@ -262,7 +254,7 @@ def recommend_jobs_from_mongo(user_skills, user_level):
             "skills": {"$in": user_skills},
             "status": "active"
         }
-        jobs = jobs_collection.find(query).limit(5)
+        jobs = list(jobs_collection.find(query).limit(5))
         return [{"jobTitle": job["jobTitle"], "companyName": job["companyName"], "location": job.get("location", "Remote")} for job in jobs]
     except Exception as e:
         logger.error(f"Job recommendation error: {e}")
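In both recommenders, wrapping find(...) in list(...) materializes the lazy PyMongo cursor inside the try block, so query errors are caught there and the results can be iterated more than once. A sketch against a local server rather than the Atlas cluster above:

    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    courses = list(client["test"]["courses"].find({"skills": {"$in": ["Python"]}}).limit(3))
    titles = [c.get("title", "Untitled") for c in courses]  # safe even when courses is empty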
@@ -283,6 +275,9 @@ def get_questions():
         return jsonify({"error": "Missing skills field"}), 400
 
     user_skills = [s.strip() for s in data['skills'] if isinstance(s, str)]
+    if not user_skills:
+        return jsonify({"error": "No valid skills provided"}), 400
+
     load_precomputed_resources()
     questions = get_questions_for_skills(user_skills)
     return jsonify({"questions": questions})
@@ -302,7 +297,7 @@ def assess_skills():
         answers = [a.strip() for a in data['answers'] if isinstance(a, str)]
         user_level = data.get('user_level', 'Intermediate').strip()
 
-        if len(answers) != len(user_skills):
+        if not user_skills or len(answers) != len(user_skills):
             return jsonify({"error": "Answers count must match skills count"}), 400
 
         load_precomputed_resources()
@@ -315,8 +310,12 @@ def assess_skills():
             if not answer or answer.lower() == 'skip':
                 user_responses.append((row['Skill'], None, None))
             else:
-                question_idx = questions_df.index[questions_df['Question'] == row['Question']]
-                user_responses.append((row['Skill'], answer, question_idx[0]))
+                question_idx = questions_df.index[questions_df['Question'] == row['Question']].tolist()
+                if not question_idx:
+                    logger.warning(f"Question not found in dataset: {row['Question']}")
+                    user_responses.append((row['Skill'], None, None))
+                    continue
+                user_responses.append((row['Skill'], answer, question_idx[0]))
 
         results = [evaluate_response(response) for response in user_responses]
 
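df.index[...] returns an Index object; .tolist() converts it to a plain list, so a question missing from the dataset yields [] instead of raising when question_idx[0] is read. A minimal illustration:

    import pandas as pd

    df = pd.DataFrame({"Question": ["Q1", "Q2"]})
    idx = df.index[df["Question"] == "Q3"].tolist()  # [] -- question not in the dataset
    if not idx:
        print("Question not found")                  # mirrors the logger.warning branch
    else:
        print(df.loc[idx[0]])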
@@ -331,18 +330,20 @@ def assess_skills():
             else:
                 user_scores[skill] = score
                 ai_flags[skill] = is_ai
-                scores_list.append(score)
+                if score > 0:
+                    scores_list.append(score)
 
         # Update user profile with scores
         skill_scores = [{"skill": skill, "score": score} for skill, score, _ in results if score > 0]
         users_collection.update_one(
             {"_id": user_id},
-            {"$set": {"skillScores": skill_scores}}
+            {"$set": {"skillScores": skill_scores}},
+            upsert=True
         )
 
         mean_score = np.mean(scores_list) if scores_list else 50
         dynamic_threshold = max(40, mean_score)
-        weak_skills = [skill for skill, score in user_scores.items() if score < dynamic_threshold]
+        weak_skills = [skill for skill, score in user_scores.items() if score > 0 and score < dynamic_threshold]
 
         courses = recommend_courses_from_mongo(weak_skills or user_skills, user_level, upgrade=not weak_skills)
         jobs = recommend_jobs_from_mongo(user_skills, user_level)
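upsert=True makes update_one insert the profile document when no user matches _id; the old call would silently match nothing. A sketch with an illustrative id against a local server:

    from pymongo import MongoClient

    users = MongoClient("mongodb://localhost:27017")["test"]["users"]
    users.update_one(
        {"_id": "user-123"},                                             # illustrative id
        {"$set": {"skillScores": [{"skill": "Python", "score": 72.5}]}},
        upsert=True,                                                     # insert if absent
    )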
@@ -353,12 +354,12 @@ def assess_skills():
                 {
                     "skill": skill,
                     "progress": f"{'■' * int(score//10)}{'-' * (10 - int(score//10))}",
-                    "score": f"{score:.2f}",
+                    "score": f"{score:.2f}%",
                     "origin": "AI-Generated" if is_ai else "Human-Written"
                 } for skill, score, is_ai in results
             ],
-            "mean_score": mean_score,
-            "dynamic_threshold": dynamic_threshold,
+            "mean_score": round(mean_score, 2),
+            "dynamic_threshold": round(dynamic_threshold, 2),
             "weak_skills": weak_skills,
             "skipped_questions": skipped_questions
         },
@@ -370,4 +371,4 @@ def assess_skills():
         return jsonify({"error": "Internal server error"}), 500
 
 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=7860
+    app.run(host='0.0.0.0', port=7860)
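The restored closing parenthesis fixes the truncated app.run call; 7860 is the port Hugging Face Spaces expects. A smoke-test sketch; the route paths are assumptions, since the @app.route decorators fall outside these hunks:

    import requests

    BASE = "http://localhost:7860"
    r = requests.post(f"{BASE}/get_questions", json={"skills": ["Python", "Git"]})
    print(r.status_code, r.json())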