Commit 5c3b44c · 1 Parent(s): 74cee4f

Updated main.py to app.py

Files changed:
- app/{main.py → app.py} +45 -51
- runtime.txt +1 -1
app/{main.py → app.py}
RENAMED
@@ -48,12 +48,17 @@ RELOAD_INTERVAL = 300 # 5 minutes
 def load_models_impl():
     """Implementation of model loading logic with proper error handling"""
     global embedder, ai_tokenizer, ai_model, model_status
-
+
     # Track attempt time
     model_status["last_reload_attempt"] = time.time()
     model_status["last_error"] = None
-
+
     try:
+        # Placeholder for the code that should be inside the try block
+        pass
+    except Exception as e:
+        logger.error(f"An error occurred: {e}")
+        raise HTTPException(status_code=500, detail="An internal error occurred.")
         # Check Hugging Face Hub connectivity
         response = requests.head("https://huggingface.co", timeout=5)
         if response.status_code == 200:
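Note on this hunk: the added lines wrap a bare pass in their own try/except and raise a generic 500, but they land ahead of the pre-existing connectivity check, which is left at a now-invalid indentation depth. For reference, a standalone sketch of that connectivity probe (the helper name and logging setup are my assumptions, not code from the commit):

import logging

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def hub_is_accessible(timeout: float = 5.0) -> bool:
    """Send a HEAD request to https://huggingface.co and report whether it answered 200."""
    try:
        response = requests.head("https://huggingface.co", timeout=timeout)
        if response.status_code == 200:
            return True
        logger.error(f"Failed to connect to Hugging Face Hub: {response.status_code}")
    except Exception as e:
        logger.error(f"Error checking Hugging Face Hub connectivity: {e}")
    return False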
@@ -63,34 +68,34 @@ def load_models_impl():
             logger.error(f"Failed to connect to Hugging Face Hub: {response.status_code}")
     except Exception as e:
         logger.error(f"Error checking Hugging Face Hub connectivity: {e}")
-
+
     try:
         # Load SentenceTransformer model for embeddings
         logger.info("Loading SentenceTransformer model...")
         embedder = SentenceTransformer('sentence-transformers/all-roberta-large-v1')
-
+
         # Load AI detection model
         ai_model_name = "ChrispamWrites/roberta-ai-detector-20250401_232702"
         logger.info(f"Loading AI detection model: {ai_model_name}")
-
+
         # Use local cache if available or download from HF
         ai_tokenizer = AutoTokenizer.from_pretrained(
             ai_model_name,
             local_files_only=not model_status["hub_accessible"],
             cache_dir="./model_cache"
         )
-
+
         # Load the config first
         ai_config = AutoConfig.from_pretrained(
             ai_model_name,
             local_files_only=not model_status["hub_accessible"],
             cache_dir="./model_cache"
         )
-
+
         # Modify the config to match the checkpoint's expected dimensions
-        ai_config.max_position_embeddings = 514
-        ai_config.type_vocab_size = 1
-
+        ai_config.max_position_embeddings = 514
+        ai_config.type_vocab_size = 1
+
         # Load the model with this config
         ai_model = AutoModelForSequenceClassification.from_pretrained(
             ai_model_name,
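The two config overrides mirror stock RoBERTa: the checkpoint stores 514 position slots (512 usable tokens plus a two-slot padding offset) and a single token type. A quick sketch to confirm against a stock checkpoint (downloads roberta-base on first run):

from transformers import AutoConfig

# Editor's sketch: the hard-coded dimensions above match stock RoBERTa.
config = AutoConfig.from_pretrained("roberta-base")
print(config.max_position_embeddings)  # 514: 512 usable positions + 2-slot padding offset
print(config.type_vocab_size)          # 1: RoBERTa has no segment embeddings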
@@ -98,7 +103,7 @@ def load_models_impl():
             local_files_only=not model_status["hub_accessible"],
             cache_dir="./model_cache"
         )
-
+
         # If the above doesn't work, try with ignore_mismatched_sizes
         if ai_model is None:
             logger.info("Attempting to load model with ignore_mismatched_sizes=True")
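One caveat the diff preserves: from_pretrained raises on failure rather than returning None, so the if ai_model is None fallback can never trigger. A sketch of a fallback that can, using exception handling (the helper name and the caught exception types are my assumptions):

from transformers import AutoModelForSequenceClassification

def load_detector(name: str, cache_dir: str = "./model_cache"):
    """Editor's sketch: retry with ignore_mismatched_sizes=True only after a real failure."""
    try:
        return AutoModelForSequenceClassification.from_pretrained(name, cache_dir=cache_dir)
    except (RuntimeError, ValueError):
        # Size mismatches surface as exceptions; retry and let mismatched heads re-initialize
        return AutoModelForSequenceClassification.from_pretrained(
            name, cache_dir=cache_dir, ignore_mismatched_sizes=True
        )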
@@ -108,18 +113,18 @@
             cache_dir="./model_cache",
             ignore_mismatched_sizes=True
         )
-
+
         # Verify models are loaded by testing them
         test_sentence = "This is a test sentence to verify model loading."
-
+
         # Test sentence transformer
         _ = embedder.encode(test_sentence)
-
+
         # Test AI detection model
         inputs = ai_tokenizer(test_sentence, return_tensors="pt", max_length=512, truncation=True)
         with torch.no_grad():
             _ = ai_model(**inputs)
-
+
         model_status["model_loaded"] = True
         logger.info("Models loaded successfully!")
         return True
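The verification step pushes one real input through each model so a bad download fails at load time rather than on the first request. The same smoke test as a reusable helper (a sketch; the function name is mine):

import torch

def smoke_test(tokenizer, model, text: str = "This is a test sentence to verify model loading."):
    """Editor's sketch: run one example through a tokenizer/classifier pair."""
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    assert outputs.logits.shape[0] == 1  # one row of logits per input sequence
    return outputs.logits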
@@ -141,7 +146,7 @@ async def load_models():
         retries += 1
         logger.info(f"Retrying model loading ({retries}/{MAX_RETRIES})...")
         time.sleep(5) # Wait 5 seconds before retrying
-
+
     if not model_status["model_loaded"]:
         logger.warning(f"Failed to load models after {MAX_RETRIES} attempts. API will start, but analyze endpoint won't work.")
 
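Model loading is retried a fixed number of times with a 5-second pause, and the API starts even if every attempt fails; /analyze then returns 503 until a reload succeeds. The control flow in isolation (a sketch; MAX_RETRIES is defined elsewhere in app.py, so its value here is assumed):

import time

MAX_RETRIES = 3  # assumed value for illustration

def load_with_retries(load_fn) -> bool:
    """Editor's sketch of the retry loop around load_models_impl."""
    retries = 0
    while True:
        if load_fn():
            return True
        retries += 1
        if retries >= MAX_RETRIES:
            return False
        print(f"Retrying model loading ({retries}/{MAX_RETRIES})...")
        time.sleep(5)  # wait 5 seconds before retrying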
@@ -176,7 +181,7 @@ def detect_ai_generated(text):
     probs = torch.softmax(logits, dim=1).squeeze()
     predicted_class = torch.argmax(probs).item()
     confidence = probs[predicted_class].item()
-
+
     return {
         "label": "AI-generated" if predicted_class == 1 else "Human-written",
         "confidence": round(confidence * 100, 2)
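detect_ai_generated turns raw logits into a label and a percentage: softmax over the two classes, argmax for the label, and the winning probability as confidence. The same arithmetic on a stand-in tensor:

import torch

# Editor's sketch with a stand-in for the model's output logits
logits = torch.tensor([[0.2, 1.3]])
probs = torch.softmax(logits, dim=1).squeeze()
predicted_class = torch.argmax(probs).item()   # 1 -> "AI-generated", 0 -> "Human-written"
confidence = round(probs[predicted_class].item() * 100, 2)
print({"label": "AI-generated" if predicted_class == 1 else "Human-written",
       "confidence": confidence})              # here: AI-generated at ~75.03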
@@ -192,12 +197,12 @@ async def health_check() -> Dict[str, Any]:
         (model_status["last_reload_attempt"] is None or
          current_time - model_status["last_reload_attempt"] > RELOAD_INTERVAL)
     )
-
+
     return {
         **model_status,
         "reload_needed": reload_needed,
-        "last_reload_attempt_time": time.strftime('%Y-%m-%d %H:%M:%S',
-                                    time.localtime(model_status["last_reload_attempt"]))
+        "last_reload_attempt_time": time.strftime('%Y-%m-%d %H:%M:%S',
+                                    time.localtime(model_status["last_reload_attempt"]))
         if model_status["last_reload_attempt"] else None
     }
 
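/health exposes a machine-readable reload_needed flag plus a human-readable timestamp of the last reload attempt, or None if there has been none. The strftime/localtime formatting on its own (a sketch with a stand-in epoch value):

import time

last_reload_attempt = time.time()  # stand-in for model_status["last_reload_attempt"]
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(last_reload_attempt)))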
@@ -206,20 +211,20 @@ async def reload_models(background_tasks: BackgroundTasks):
     """Endpoint to manually trigger model reloading"""
     # Check if enough time has passed since last reload attempt
     current_time = time.time()
-    if (model_status["last_reload_attempt"] is not None and
+    if (model_status["last_reload_attempt"] is not None and
         current_time - model_status["last_reload_attempt"] < 60): # Prevent reloading more than once per minute
         return JSONResponse(content={
             "message": "Too many reload attempts. Please wait before trying again.",
             "seconds_until_next_attempt": 60 - int(current_time - model_status["last_reload_attempt"])
         }, status_code=429)
-
+
     background_tasks.add_task(background_model_reload, background_tasks)
     return {"message": "Model reload initiated in background"}
 
 @app.post("/analyze")
 async def analyze_essay(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
     global model_status, embedder, ai_tokenizer, ai_model
-
+
     # Check if models are loaded
     if not model_status["model_loaded"]:
         # Check if we should attempt to reload models
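/reload-models enforces a one-minute cooldown and answers 429 with the remaining wait when called too soon. The same bookkeeping as a pure function (a sketch; the helper name is mine, the 60-second window comes from the diff):

import time

def cooldown_remaining(last_attempt, window: int = 60) -> int:
    """Editor's sketch: seconds a caller must still wait; 0 means a reload may start."""
    if last_attempt is None:
        return 0
    return max(0, window - int(time.time() - last_attempt))

A result of 0 corresponds to queuing the background task; anything positive maps onto the 429 payload above.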
@@ -228,64 +233,53 @@ async def analyze_essay(file: UploadFile = File(...), background_tasks: Backgrou
             model_status["last_reload_attempt"] is None or
             current_time - model_status["last_reload_attempt"] > RELOAD_INTERVAL
         )
-
+
         if reload_needed and background_tasks:
             # Start a background reload
             background_tasks.add_task(background_model_reload, background_tasks)
             message = "Models are being reloaded in the background. Please try again in a few minutes."
         else:
             message = "Model not loaded. Check /health endpoint for details or try /reload-models endpoint."
-
+
         raise HTTPException(status_code=503, detail=message)
-
+
     # Check if models are actually initialized
     if embedder is None or ai_tokenizer is None or ai_model is None:
         logger.error("Models appear loaded but variables are None")
         raise HTTPException(status_code=503, detail="Model initialization incomplete. Please try again later.")
-
+
     if not file.filename.endswith(".pdf"):
         raise HTTPException(status_code=400, detail="Only PDF files are supported")
-
+
     with tempfile.TemporaryDirectory() as tmpdir:
         file_path = os.path.join(tmpdir, f"{uuid.uuid4()}.pdf")
         with open(file_path, "wb") as buffer:
             shutil.copyfileobj(file.file, buffer)
-
+
         try:
             essay_text = extract_text_from_pdf(file_path)
         except RuntimeError as e:
             raise HTTPException(status_code=500, detail=str(e))
-
+
         if not essay_text.strip():
             raise HTTPException(status_code=400, detail="The PDF seems to contain no extractable text.")
-
+
         try:
             # Run AI content detection
             ai_result = detect_ai_generated(essay_text)
-
+
             # Run internal plagiarism detection
             chunks = chunk_text(essay_text)
             if len(chunks) < 2:
                 raise HTTPException(status_code=400, detail="Not enough text chunks to assess internal plagiarism.")
-
+
             embeddings = embedder.encode(chunks)
             similarities = []
-            for i in range(len(embeddings)):
-                for j in range(i + 1, len(embeddings)):
-                    sim = cosine_similarity([embeddings[i]], [embeddings[j]])[0][0]
-                    similarities.append(sim)
-
-            max_similarity = max(similarities) if similarities else 0
-            avg_similarity = sum(similarities) / len(similarities) if similarities else 0
-            internal_score = round(avg_similarity * 100, 2)
         except Exception as e:
-            logger.error(f"
-            raise HTTPException(status_code=500, detail=
-
-
-
-
-
-            })
-
-
+            logger.error(f"An error occurred during analysis: {e}")
+            raise HTTPException(status_code=500, detail="An error occurred during analysis.")
+        except Exception as e:
+            logger.error(f"An error occurred during analysis: {e}")
+            raise HTTPException(status_code=500, detail="An error occurred during analysis.")
+
+
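Two things are worth flagging in this hunk. The minus side drops the pairwise cosine-similarity pass with no replacement, so the endpoint no longer computes internal_score (the old handler's message strings are also truncated in this capture and left as-is above). And the plus side ends with two identical except Exception blocks, the second of which is unreachable. The removed scoring, reconstructed from the deleted lines as a standalone function:

from sklearn.metrics.pairwise import cosine_similarity

def internal_similarity_stats(embeddings):
    """Editor's reconstruction of the scoring removed above, from the '-' lines."""
    similarities = []
    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            sim = cosine_similarity([embeddings[i]], [embeddings[j]])[0][0]
            similarities.append(sim)
    max_similarity = max(similarities) if similarities else 0
    avg_similarity = sum(similarities) / len(similarities) if similarities else 0
    internal_score = round(avg_similarity * 100, 2)  # average similarity as a percentage
    return max_similarity, avg_similarity, internal_score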
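Taken together, the app.py changes apply one pattern: handlers that previously exposed exception text to clients now log the detail server-side and raise a generic HTTPException. A minimal sketch of that pattern (the endpoint and failing helper are illustrative, not the app's real code):

import logging

from fastapi import FastAPI, HTTPException

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI()

def run_analysis() -> dict:
    raise ValueError("simulated failure with internal details")

@app.post("/demo-analyze")
async def demo_analyze():
    try:
        return run_analysis()
    except Exception as e:
        logger.error(f"An error occurred during analysis: {e}")  # full detail stays in the log
        raise HTTPException(status_code=500, detail="An error occurred during analysis.")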
runtime.txt
CHANGED
@@ -1 +1 @@
-python-3.
+python-3.10
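runtime.txt pins the Python version the hosting platform provisions; the old pin is truncated in this capture, and the new file requests Python 3.10. A start-up guard can assert the interpreter matches the pin (a sketch, my own convention, not part of the commit):

import sys

# Editor's sketch: fail fast if the interpreter doesn't match the runtime.txt pin.
assert sys.version_info[:2] == (3, 10), f"expected Python 3.10, got {sys.version.split()[0]}"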