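# NOTE(review): "Spaces: Runtime error" - page-status text that was captured
# together with this file; it does not appear to be part of the program.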
import logging
import os
import pickle
import re
import sys

# Third-party imports are kept in their original relative order.
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy.special
from sklearn.feature_extraction.text import TfidfVectorizer
from flask import Flask, request, jsonify
from pymongo import MongoClient
# Logging: configure the root logger at INFO level and create the logger
# used by everything else in this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Tokenizers parallelism is switched off to avoid fork-related deadlocks.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# MongoDB connection
# SECURITY: the default below embeds credentials directly in the source file.
# Supply the connection string through the MONGO_URI environment variable
# instead; the literal is kept only as a backward-compatible default and the
# credentials it contains should be rotated and removed from the code.
# NOTE(review): the "[email protected]" fragment looks like a redaction
# placeholder rather than a real password/host pair - confirm.
MONGO_URI = os.environ.get(
    "MONGO_URI",
    "mongodb://muhammadbinimran1001:[email protected]:27017,dsm-shard-00-01.inrzs.mongodb.net:27017,dsm-shard-00-02.inrzs.mongodb.net:27017/?ssl=true&replicaSet=atlas-nbg4er-shard-0&authSource=admin&retryWrites=true&w=majority",
)
client = MongoClient(MONGO_URI)
db = client.get_database("test")

# Collections used by the recommendation and assessment code.
users_collection = db["users"]
courses_collection = db["courses"]
jobs_collection = db["jobs"]
# Paths for saving artifacts. The primary directory is tried first; when it
# cannot be created the /tmp fallback is created and used instead.
MODEL_DIR = "./saved_models"
FALLBACK_MODEL_DIR = "/tmp/saved_models"

try:
    os.makedirs(MODEL_DIR, exist_ok=True)
    logger.info(f"Using model directory: {MODEL_DIR}")
    chosen_model_dir = MODEL_DIR
except Exception as e:
    logger.warning(f"Failed to create {MODEL_DIR}: {e}. Using fallback directory.")
    os.makedirs(FALLBACK_MODEL_DIR, exist_ok=True)
    chosen_model_dir = FALLBACK_MODEL_DIR


def _artifact_path(name):
    """Return the location of artifact *name* inside ``chosen_model_dir``."""
    return os.path.join(chosen_model_dir, name)


# Every artifact lives directly under the directory chosen above.
UNIVERSAL_MODEL_PATH = _artifact_path("universal_model")
DETECTOR_MODEL_PATH = _artifact_path("detector_model")
TFIDF_PATH = _artifact_path("tfidf_vectorizer.pkl")
SKILL_TFIDF_PATH = _artifact_path("skill_tfidf.pkl")
QUESTION_ANSWER_PATH = _artifact_path("question_to_answer.pkl")
FAISS_INDEX_PATH = _artifact_path("faiss_index.index")
ANSWER_EMBEDDINGS_PATH = _artifact_path("answer_embeddings.pkl")
COURSE_SIMILARITY_PATH = _artifact_path("course_similarity.pkl")
JOB_SIMILARITY_PATH = _artifact_path("job_similarity.pkl")
# Module-level caches for precomputed data. All start as None and are filled
# by load_precomputed_resources() / precompute_resources():
#   tfidf_vectorizer   - TfidfVectorizer fitted on the question and answer texts
#   skill_tfidf        - dict: lower-cased skill name -> TF-IDF vector
#   question_to_answer - dict: question text -> reference answer
#   faiss_index        - FAISS L2 index built over answer_embeddings
#   answer_embeddings  - numpy array of encoded reference answers
#   course_similarity  - loaded from disk, otherwise an empty dict
#   job_similarity     - loaded from disk, otherwise an empty dict
tfidf_vectorizer = skill_tfidf = question_to_answer = None
faiss_index = answer_embeddings = None
course_similarity = job_similarity = None
# Improved dataset loading with fallback
def _apply_column_defaults(df, required_columns, additional_columns, source):
    """Make sure *df* has the required, additional and 'level' columns.

    Missing required columns are added as empty strings, missing additional
    columns get a numeric default (0.8 for 'popularity', 0.7 for
    'completion_rate', 0.0 otherwise) and 'level' defaults to 'Intermediate'
    (also used to fill missing values in an existing 'level' column).
    *source* is only used in the log messages.
    """
    numeric_defaults = {'popularity': 0.8, 'completion_rate': 0.7}
    missing_required = [col for col in required_columns if col not in df.columns]
    missing_additional = [col for col in additional_columns if col not in df.columns]
    if missing_required:
        logger.warning(f"Required columns {missing_required} missing in {source}. Adding empty values.")
        for col in missing_required:
            df[col] = ""
    if missing_additional:
        logger.warning(f"Additional columns {missing_additional} missing in {source}. Adding default values.")
        for col in missing_additional:
            df[col] = numeric_defaults.get(col, 0.0)
    if 'level' not in df.columns:
        logger.warning(f"'level' column missing in {source}. Adding default 'Intermediate'.")
        df['level'] = 'Intermediate'
    else:
        df['level'] = df['level'].fillna('Intermediate')
    return df


def load_dataset(file_path, required_columns=None, additional_columns=None, fallback_data=None):
    """Load a CSV dataset, falling back to in-memory data when that fails.

    Args:
        file_path: path of the CSV file to read.
        required_columns: columns that must exist; defaults to
            ["Skill", "Question", "Answer"] when None. An explicit empty
            list is honoured (nothing is required).
        additional_columns: optional numeric columns; defaults to
            ['popularity', 'completion_rate'] when None. An explicit empty
            list is honoured (nothing is added).
        fallback_data: data accepted by ``pd.DataFrame`` that is used when
            the CSV cannot be loaded.

    Returns:
        A DataFrame with the column defaults applied, built from the CSV or,
        on any loading error, from *fallback_data*. Returns None when loading
        fails and no fallback data was supplied.
    """
    # ``is None`` rather than ``or``: an explicit [] must not be replaced by
    # the defaults.
    if required_columns is None:
        required_columns = ["Skill", "Question", "Answer"]
    if additional_columns is None:
        additional_columns = ['popularity', 'completion_rate']
    try:
        df = pd.read_csv(file_path)
        return _apply_column_defaults(df, required_columns, additional_columns, file_path)
    except Exception as e:
        logger.error(f"Error loading {file_path}: {e}. Using fallback data.")
    if fallback_data is None:
        return None
    # Apply the same defaults to the fallback so callers see one schema
    # regardless of which source was used.
    return _apply_column_defaults(
        pd.DataFrame(fallback_data),
        required_columns,
        additional_columns,
        f"fallback data for {file_path}",
    )
# Load datasets with fallbacks
questions_df = load_dataset(
    "Generated_Skill-Based_Questions.csv",
    fallback_data={
        'Skill': ['Linux', 'Git', 'Node.js', 'Python', 'Kubernetes'],
        'Question': ['Advanced Linux question', 'Advanced Git question', 'Basic Node.js question',
                     'Intermediate Python question', 'Basic Kubernetes question'],
        'Answer': ['Linux answer', 'Git answer', 'Node.js answer', 'Python answer', 'Kubernetes answer'],
    },
)

# Validate questions_df. The service cannot work without questions, so the
# process terminates with status 1. sys.exit is used because the bare
# exit() helper is injected by the site module for interactive use and is
# not guaranteed to exist in every runtime.
if questions_df is None or questions_df.empty:
    logger.error("questions_df is empty or could not be loaded. Exiting.")
    sys.exit(1)
if not {"Skill", "Question", "Answer"}.issubset(questions_df.columns):
    logger.error("questions_df is missing required columns. Exiting.")
    sys.exit(1)
logger.info(f"questions_df loaded with {len(questions_df)} rows. Skills available: {list(questions_df['Skill'].unique())}")
# Load or Initialize Models with Fallback
def load_universal_model():
    """Return the sentence-embedding model used for answer similarity.

    A copy saved under UNIVERSAL_MODEL_PATH is preferred. If that copy is
    missing or cannot be loaded, the default "all-MiniLM-L6-v2" model is
    loaded and a save to UNIVERSAL_MODEL_PATH is attempted; a failed save is
    only logged because the in-memory model is still usable.

    Returns:
        A SentenceTransformer instance.

    Raises:
        SystemExit: with status 1 when the default model cannot be loaded.
    """
    default_model = "all-MiniLM-L6-v2"
    if os.path.exists(UNIVERSAL_MODEL_PATH):
        try:
            logger.info(f"Loading universal model from {UNIVERSAL_MODEL_PATH}")
            return SentenceTransformer(UNIVERSAL_MODEL_PATH)
        except Exception as e:
            # A broken local copy should not be fatal while the default
            # model can still be loaded.
            logger.warning(f"Failed to load universal model from {UNIVERSAL_MODEL_PATH}: {e}. Falling back to {default_model}.")
    try:
        logger.info(f"Loading universal model: {default_model}")
        model = SentenceTransformer(default_model)
    except Exception as e:
        logger.error(f"Failed to load universal model {default_model}: {e}. Exiting.")
        sys.exit(1)
    try:
        model.save(UNIVERSAL_MODEL_PATH)
    except Exception as e:
        logger.warning(f"Could not save universal model to {UNIVERSAL_MODEL_PATH}: {e}")
    return model
universal_model = load_universal_model()

# Resolve the detector source once so the tokenizer and the model are always
# loaded from the same place: the local directory when it exists, otherwise
# the "roberta-base-openai-detector" identifier.
_detector_source = DETECTOR_MODEL_PATH if os.path.exists(DETECTOR_MODEL_PATH) else "roberta-base-openai-detector"
try:
    detector_tokenizer = AutoTokenizer.from_pretrained(_detector_source)
    detector_model = AutoModelForSequenceClassification.from_pretrained(_detector_source)
except Exception as e:
    logger.error(f"Failed to load detector model: {e}. Exiting.")
    # sys.exit instead of the interactive-only exit() helper.
    sys.exit(1)
# Load Precomputed Resources
def load_precomputed_resources():
    """Populate the module-level caches, from memory, disk or by recomputing.

    Order of preference:
      1. If every cache is already populated in this process, do nothing.
         The request handlers call this function on every request, so
         re-reading all artifacts from disk each time is avoided.
      2. If all artifact files exist, load them. The loaded answer
         embeddings must have one row per row of ``questions_df``;
         otherwise the artifacts are treated as stale.
      3. On a missing file, a load error or stale artifacts, call
         precompute_resources() to rebuild and persist everything.

    Raises:
        Whatever precompute_resources() raises when a rebuild fails.
    """
    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity
    cached = (tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index,
              answer_embeddings, course_similarity, job_similarity)
    if all(item is not None for item in cached):
        return
    paths = [TFIDF_PATH, SKILL_TFIDF_PATH, QUESTION_ANSWER_PATH, FAISS_INDEX_PATH, ANSWER_EMBEDDINGS_PATH, COURSE_SIMILARITY_PATH, JOB_SIMILARITY_PATH]
    if not all(os.path.exists(p) for p in paths):
        precompute_resources()
        return
    try:
        # NOTE: pickle.load can execute arbitrary code. These files must only
        # ever be the artifacts written by precompute_resources().
        with open(TFIDF_PATH, 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
        with open(SKILL_TFIDF_PATH, 'rb') as f:
            skill_tfidf = pickle.load(f)
        with open(QUESTION_ANSWER_PATH, 'rb') as f:
            question_to_answer = pickle.load(f)
        faiss_index = faiss.read_index(FAISS_INDEX_PATH)
        with open(ANSWER_EMBEDDINGS_PATH, 'rb') as f:
            answer_embeddings = pickle.load(f)
        with open(COURSE_SIMILARITY_PATH, 'rb') as f:
            course_similarity = pickle.load(f)
        with open(JOB_SIMILARITY_PATH, 'rb') as f:
            job_similarity = pickle.load(f)
        # evaluate_response() indexes answer_embeddings by questions_df row,
        # so a size mismatch means the files were built from other data.
        if len(answer_embeddings) != len(questions_df):
            raise ValueError("cached answer embeddings do not match the number of rows in questions_df")
        logger.info("Loaded precomputed resources successfully")
    except Exception as e:
        logger.error(f"Error loading precomputed resources: {e}")
        precompute_resources()
# Precompute Resources Offline
def precompute_resources():
    """Rebuild every derived artifact from ``questions_df`` and persist it.

    Fits the TF-IDF vectorizer on all answers and questions, derives one
    TF-IDF vector per skill, maps questions to answers, encodes the answers
    with ``universal_model`` and indexes them in a FAISS L2 index. The
    results are stored in the module-level caches and written to the
    artifact paths; the universal model is saved as well.

    Raises:
        Any exception raised while computing or writing; it is logged and
        re-raised.
    """
    global tfidf_vectorizer, skill_tfidf, question_to_answer, faiss_index, answer_embeddings, course_similarity, job_similarity

    def _dump(obj, path):
        # Serialise one artifact with pickle.
        with open(path, 'wb') as f:
            pickle.dump(obj, f)

    logger.info("Precomputing resources offline")
    try:
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        answer_texts = questions_df['Answer'].tolist()
        tfidf_vectorizer.fit(answer_texts + questions_df['Question'].tolist())

        # One TF-IDF vector per distinct skill, keyed by the lower-cased name.
        vectors = {}
        for skill in questions_df['Skill'].unique():
            vectors[skill.lower()] = tfidf_vectorizer.transform([skill]).toarray()[0]
        skill_tfidf = vectors

        question_to_answer = dict(zip(questions_df['Question'], questions_df['Answer']))

        device = "cuda" if torch.cuda.is_available() else "cpu"
        encoded = universal_model.encode(answer_texts, batch_size=128, convert_to_tensor=True, device=device)
        answer_embeddings = encoded.cpu().numpy()

        faiss_index = faiss.IndexFlatL2(answer_embeddings.shape[1])
        faiss_index.add(answer_embeddings)

        # Initialize course_similarity and job_similarity as empty dicts if not available
        if not course_similarity:
            course_similarity = {}
        if not job_similarity:
            job_similarity = {}

        _dump(tfidf_vectorizer, TFIDF_PATH)
        _dump(skill_tfidf, SKILL_TFIDF_PATH)
        _dump(question_to_answer, QUESTION_ANSWER_PATH)
        faiss.write_index(faiss_index, FAISS_INDEX_PATH)
        _dump(answer_embeddings, ANSWER_EMBEDDINGS_PATH)
        _dump(course_similarity, COURSE_SIMILARITY_PATH)
        _dump(job_similarity, JOB_SIMILARITY_PATH)
        universal_model.save(UNIVERSAL_MODEL_PATH)
        logger.info(f"Precomputed resources saved to {chosen_model_dir}")
    except Exception as e:
        logger.error(f"Error during precomputation: {e}")
        raise
# Evaluation with precomputed data
def evaluate_response(args):
    """Score one user answer against the reference answer of a question.

    Args:
        args: a ``(skill, user_answer, question_idx)`` tuple. ``question_idx``
            selects the row of ``answer_embeddings`` holding the reference
            answer. A falsy ``user_answer`` short-circuits to a zero score.

    Returns:
        ``(skill, score, is_ai)`` where ``score`` is the cosine similarity
        between the user and reference embeddings times 100, multiplied by a
        TF-IDF relevance factor clamped to [0.5, 1.0], floored at 0 and
        rounded to two decimals; ``is_ai`` is the detector verdict. Any
        error during evaluation is logged and yields ``(skill, 0.0, False)``.

    Raises:
        TypeError / ValueError: if *args* is not a 3-item iterable.
    """
    # Unpack before the try block: the error handler needs ``skill``, and a
    # malformed tuple should raise a clear error rather than an unbound-name
    # failure inside the handler.
    skill, user_answer, question_idx = args
    try:
        if not user_answer:
            return skill, 0.0, False
        inputs = detector_tokenizer(user_answer, return_tensors="pt", truncation=True, max_length=512, padding=True)
        with torch.no_grad():
            logits = detector_model(**inputs).logits
        # Hand scipy a plain numpy array rather than a torch tensor.
        probs = scipy.special.softmax(logits.detach().cpu().numpy(), axis=1).tolist()[0]
        # NOTE(review): class index 1 is treated as "AI-generated"; verify
        # this against detector_model.config.id2label.
        is_ai = probs[1] > 0.5
        device = "cuda" if torch.cuda.is_available() else "cpu"
        user_embedding = universal_model.encode([user_answer], batch_size=1, convert_to_tensor=True, device=device)[0]
        # The cached embeddings are numpy (CPU); move the reference vector to
        # the user embedding's device so the comparison also works on GPU.
        expected_embedding = torch.tensor(answer_embeddings[question_idx]).to(user_embedding.device)
        score = util.pytorch_cos_sim(user_embedding, expected_embedding).item() * 100
        user_tfidf = tfidf_vectorizer.transform([user_answer]).toarray()[0]
        skill_vec = skill_tfidf.get(skill.lower(), np.zeros_like(user_tfidf))
        # Cosine similarity between the answer and the skill name in TF-IDF
        # space; the epsilon guards against division by zero.
        relevance = np.dot(user_tfidf, skill_vec) / (np.linalg.norm(user_tfidf) * np.linalg.norm(skill_vec) + 1e-10)
        score *= max(0.5, min(1.0, relevance))
        return skill, round(max(0, score), 2), is_ai
    except Exception as e:
        logger.error(f"Evaluation error for {skill}: {e}")
        return skill, 0.0, False
# Fetch questions for given skills
def get_questions_for_skills(skills):
    """Pick one question per requested skill.

    Each skill is stripped and capitalised, then compared with the
    capitalised 'Skill' column of ``questions_df``. A random matching row is
    returned as a dict; when nothing matches, a generic best-practices
    question/answer pair is generated for that skill.

    Args:
        skills: iterable of skill names (strings).

    Returns:
        A list with one dict per input skill, in input order.
    """
    picked = []
    for raw_skill in skills:
        skill = raw_skill.strip().capitalize()  # Standardize skill format
        matches = questions_df[questions_df['Skill'].str.capitalize() == skill]
        if matches.empty:
            picked.append({
                'Skill': skill,
                'Question': f"What are the best practices for using {skill} in a production environment?",
                'Answer': f"Best practices for {skill} include proper documentation, monitoring, and security measures."
            })
            continue
        picked.append(matches.sample(1).iloc[0].to_dict())
    return picked
# Recommend courses from MongoDB
def recommend_courses_from_mongo(skills_to_improve, user_level, upgrade=False):
    """Return up to three courses covering the given skills.

    Args:
        skills_to_improve: skills to match against each course's "skills".
            An empty/None value yields an empty result without a query.
        user_level: level matched case-insensitively against the course
            "category"; ignored when *upgrade* is true.
        upgrade: when true, 'Advanced' courses are targeted instead of
            *user_level*.

    Returns:
        A list of ``{"title", "provider"}`` dicts ("provider" defaults to
        "Unknown"). Any error is logged and produces an empty list.
    """
    try:
        if not skills_to_improve:
            return []
        target_level = 'Advanced' if upgrade else user_level
        query = {
            "skills": {"$in": skills_to_improve},
            # The level can come from the request payload, so it is escaped
            # and matched literally instead of being interpreted as a regex.
            "category": {"$regex": re.escape(target_level), "$options": "i"}
        }
        courses = list(courses_collection.find(query).limit(3))
        return [{"title": course["title"], "provider": course.get("provider", "Unknown")} for course in courses]
    except Exception as e:
        logger.error(f"Course recommendation error: {e}")
        return []
# Recommend jobs from MongoDB
def recommend_jobs_from_mongo(user_skills, user_level):
    """Return up to five active jobs that list any of the user's skills.

    Args:
        user_skills: skills matched against each job's "skills" field. An
            empty/None value yields an empty result without a query.
        user_level: accepted but not used by the query.

    Returns:
        A list of ``{"jobTitle", "companyName", "location"}`` dicts
        ("location" defaults to "Remote"). Any error is logged and produces
        an empty list.
    """
    try:
        if not user_skills:
            return []
        active_jobs_with_skill = {
            "skills": {"$in": user_skills},
            "status": "active"
        }
        recommendations = []
        for job in list(jobs_collection.find(active_jobs_with_skill).limit(5)):
            recommendations.append({
                "jobTitle": job["jobTitle"],
                "companyName": job["companyName"],
                "location": job.get("location", "Remote"),
            })
        return recommendations
    except Exception as e:
        logger.error(f"Job recommendation error: {e}")
        return []
# Flask application setup: the WSGI application object that is started by
# the __main__ guard at the bottom of the file.
app = Flask(__name__)
def health_check():
    """Return a JSON payload with the service status and model directory.

    NOTE(review): no route registration for this view is visible in the
    file as captured (no decorator, no add_url_rule call) - confirm how it
    is exposed.
    """
    payload = {"status": "active", "model_dir": chosen_model_dir}
    return jsonify(payload)
def get_questions():
    """Return one question per skill listed in the JSON request body.

    Expects a JSON object with a "skills" list of strings. Non-string and
    blank entries are ignored.

    Returns:
        ``{"questions": [...]}`` on success; a 400 response when the body is
        not a JSON object, has no "skills" field or contains no usable
        skill; a 500 response on unexpected errors.

    NOTE(review): no route registration for this view is visible in the
    file as captured - confirm how it is exposed.
    """
    try:
        # silent=True: a missing or malformed JSON body yields None and is
        # reported as a client error (400) instead of raising into the
        # generic 500 handler below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or 'skills' not in data:
            return jsonify({"error": "Missing skills field"}), 400
        raw_skills = data['skills']
        if not isinstance(raw_skills, list):
            # A bare string would otherwise be iterated character by character.
            raw_skills = []
        user_skills = [s.strip() for s in raw_skills if isinstance(s, str) and s.strip()]
        if not user_skills:
            return jsonify({"error": "No valid skills provided"}), 400
        load_precomputed_resources()
        questions = get_questions_for_skills(user_skills)
        return jsonify({"questions": questions})
    except Exception as e:
        logger.error(f"Get questions error: {e}")
        return jsonify({"error": "Internal server error"}), 500
def assess_skills():
    """Score the submitted answers and return results with recommendations.

    Expects a JSON object with "skills" (list of strings), "answers" (list
    of strings, one per skill), "userId" and an optional "user_level"
    (defaults to 'Intermediate').

    For each skill a question is obtained from get_questions_for_skills()
    and the corresponding answer is scored with evaluate_response(). Empty
    answers, the literal answer "skip" and questions that are not present in
    ``questions_df`` are reported as skipped and score 0. Positive scores
    are stored on the user document, skills scoring below
    ``max(40, mean score)`` are reported as weak, and course/job
    recommendations are attached.

    Returns:
        The assessment JSON on success; a 400 response for a malformed body
        or mismatched skill/answer counts; a 500 response on unexpected
        errors.

    NOTE(review): no route registration for this view is visible in the
    file as captured - confirm how it is exposed.
    NOTE(review): questions are sampled again here rather than taken from
    the request, so they may differ from the ones the user was shown -
    confirm this is intended.
    """
    try:
        # silent=True: a malformed body becomes a 400 below, not a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or any(key not in data for key in ('skills', 'answers', 'userId')):
            return jsonify({"error": "Missing required fields"}), 400
        raw_skills, raw_answers = data['skills'], data['answers']
        if not isinstance(raw_skills, list) or not isinstance(raw_answers, list):
            return jsonify({"error": "Missing required fields"}), 400
        user_id = data['userId']
        user_skills = [s.strip() for s in raw_skills if isinstance(s, str)]
        answers = [a.strip() for a in raw_answers if isinstance(a, str)]
        # A null or non-string level must not crash on .strip().
        raw_level = data.get('user_level', 'Intermediate')
        user_level = raw_level.strip() if isinstance(raw_level, str) else 'Intermediate'
        if not user_skills or len(answers) != len(user_skills):
            return jsonify({"error": "Answers count must match skills count"}), 400

        load_precomputed_resources()
        user_questions = get_questions_for_skills(user_skills)

        user_responses = []
        skipped_questions = []
        for question_row, answer in zip(user_questions, answers):
            skill, question = question_row['Skill'], question_row['Question']
            dataset_idx = None
            if answer and answer.lower() != 'skip':
                matches = questions_df.index[questions_df['Question'] == question].tolist()
                if matches:
                    dataset_idx = matches[0]
                else:
                    logger.warning(f"Question not found in dataset: {question}")
            if dataset_idx is None:
                # Skipped or unknown question: evaluated as an empty answer.
                # The question text is recorded here, where it is in scope.
                user_responses.append((skill, None, None))
                skipped_questions.append(f"{skill} ({question})")
            else:
                user_responses.append((skill, answer, dataset_idx))

        results = [evaluate_response(response) for response in user_responses]

        # Keep the best score per skill; a skill is flagged as AI-written if
        # any of its answers was flagged.
        user_scores = {}
        ai_flags = {}
        scores_list = []
        for skill, score, is_ai in results:
            if skill in user_scores:
                user_scores[skill] = max(user_scores[skill], score)
                ai_flags[skill] = ai_flags[skill] or is_ai
            else:
                user_scores[skill] = score
                ai_flags[skill] = is_ai
            if score > 0:
                scores_list.append(score)

        # Update user profile with scores
        # NOTE(review): userId is used as _id exactly as received; if user
        # documents are keyed by ObjectId, this upsert creates a new document
        # instead of updating the user - verify against the users schema.
        skill_scores = [{"skill": skill, "score": score} for skill, score, _ in results if score > 0]
        users_collection.update_one(
            {"_id": user_id},
            {"$set": {"skillScores": skill_scores}},
            upsert=True
        )

        mean_score = float(np.mean(scores_list)) if scores_list else 50
        dynamic_threshold = max(40, mean_score)
        weak_skills = [skill for skill, score in user_scores.items() if score > 0 and score < dynamic_threshold]
        # With no weak skills, recommend 'Advanced' courses for all skills.
        courses = recommend_courses_from_mongo(weak_skills or user_skills, user_level, upgrade=not weak_skills)
        jobs = recommend_jobs_from_mongo(user_skills, user_level)
        return jsonify({
            "assessment_results": {
                "skills": [
                    {
                        "skill": skill,
                        # Ten-cell bar: one filled cell per full 10 points.
                        "progress": f"{'■' * int(score//10)}{'-' * (10 - int(score//10))}",
                        "score": f"{score:.2f}%",
                        "origin": "AI-Generated" if is_ai else "Human-Written"
                    } for skill, score, is_ai in results
                ],
                "mean_score": round(mean_score, 2),
                "dynamic_threshold": round(dynamic_threshold, 2),
                "weak_skills": weak_skills,
                "skipped_questions": skipped_questions
            },
            "recommended_courses": courses,
            "recommended_jobs": jobs
        })
    except Exception as e:
        logger.error(f"Assessment error: {e}")
        return jsonify({"error": "Internal server error"}), 500
# Script entry point: start the Flask server on all interfaces, port 7860.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)