Spaces:

Essay-Grader
/

Detection_and_Plagiarism_Check

Running

File size: 18,266 Bytes

# app.py: AI Detection and Plagiarism Check API 


from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
from fastapi.responses import JSONResponse
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from PyPDF2 import PdfReader
from sklearn.metrics.pairwise import cosine_similarity
import torch
import os
import numpy as np
import shutil
import uuid
import tempfile
import logging
import time
from typing import Dict, Any

# Configure logging once at import time: INFO level and above, with each
# record prefixed by timestamp, logger name and severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by the functions in this file.
logger = logging.getLogger(__name__)

# FastAPI application object; the startup hook and the route decorators in
# this file attach to it. Interactive docs are served at /docs, and
# redoc_url=None turns the ReDoc page off.
app = FastAPI(
    title="Essay Analysis API",
    version="1.0.0",
    docs_url="/docs",
    redoc_url=None
)

# Configuration
# NOTE(review): CACHE_DIR is not referenced by any active code visible in
# this file (only by the commented-out legacy version) — confirm it is
# still needed.
CACHE_DIR = "/tmp/cache"
# Cosine-similarity cutoff: two text chunks whose similarity exceeds this
# value are counted as a "similar pair" by calculate_plagiarism().
PLAGIARISM_THRESHOLD = 0.82
# Passed as max_length to the AI-detector tokenizer in analyze_content();
# input is truncated/padded to this many tokens.
MAX_TEXT_LENGTH = 512
# Identifier given to from_pretrained() for the AI-text classifier and its
# tokenizer.
MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"
# Identifier given to SentenceTransformer() for the chunk-embedding model.
SENTENCE_MODEL = "sentence-transformers/all-roberta-large-v1"

# Global State
# Updated by initialize_models(). "model_loaded" gates the /analyze endpoint
# and drives the /health status; "last_error" holds the latest load-failure
# message, or None after a successful load.
model_status = {
    "model_loaded": False,
    "last_error": None
}

# Model References
# Module-level handles assigned by initialize_models(); they stay None until
# that function has assigned them.
embedder = None
ai_tokenizer = None
ai_model = None

def initialize_models():
    """Load (or reload) the embedding model and the AI-text classifier.

    Assigns the module-level ``embedder``, ``ai_tokenizer`` and ``ai_model``
    references, runs one warm-up forward pass through the classifier, and
    records the outcome in ``model_status``.

    Returns:
        bool: True when all models loaded and the warm-up pass succeeded;
        False otherwise, with the error text stored in
        ``model_status["last_error"]``.
    """
    global embedder, ai_tokenizer, ai_model

    try:
        # Release any previously loaded models before reloading. Rebind the
        # globals to None instead of using ``del``: deleting a global removes
        # the name itself, so a reload that fails part-way would leave later
        # reads of these names raising NameError.
        if embedder is not None or ai_model is not None:
            embedder = None
            ai_tokenizer = None
            ai_model = None
            torch.cuda.empty_cache()

        # Load models
        logger.info("Loading models...")
        embedder = SentenceTransformer(SENTENCE_MODEL)
        ai_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        ai_model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            # Half precision only when a GPU is present.
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        ).eval()

        # Warmup: one forward pass so a broken model is detected here rather
        # than on the first request.
        test_text = "Model initialization text. " * 50
        inputs = ai_tokenizer(test_text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            ai_model(**inputs.to(ai_model.device))

        model_status.update({"model_loaded": True, "last_error": None})
        return True

    except Exception as e:
        error_msg = f"Model load failed: {str(e)}"
        # exc_info keeps the traceback in the log; the message alone rarely
        # identifies which load step failed.
        logger.error(error_msg, exc_info=True)
        model_status.update({"model_loaded": False, "last_error": error_msg})
        return False

@app.on_event("startup")
async def startup_event():
    """Load the models at application startup, retrying on failure.

    Makes up to three attempts through initialize_models(), pausing five
    seconds between attempts. If every attempt fails an error is logged and
    the application still starts, with model_status left marked as not
    loaded.
    """
    import asyncio  # local import: needed only by this coroutine

    max_attempts = 3
    retry_delay_seconds = 5

    for attempt in range(1, max_attempts + 1):
        if initialize_models():
            return
        # Wait only when another attempt follows; sleeping after the final
        # failure just delays startup. asyncio.sleep is used because a
        # blocking time.sleep inside a coroutine stalls the event loop.
        if attempt < max_attempts:
            await asyncio.sleep(retry_delay_seconds)

    logger.error("Failed to initialize models")

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined with single spaces.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page text. A page that yields no text contributes
        an empty string.

    Raises:
        HTTPException: 400 "Invalid PDF file" when the file cannot be read
            or parsed.
    """
    try:
        reader = PdfReader(pdf_path)
        # Defensive: treat a None result from extract_text() as empty text
        # so one text-less page cannot make join() raise TypeError and get
        # the whole document rejected as invalid.
        return " ".join((page.extract_text() or "") for page in reader.pages)
    except Exception as e:
        logger.error(f"PDF error: {str(e)}")
        # Chain the cause so the original parser error stays in tracebacks.
        raise HTTPException(400, "Invalid PDF file") from e

def chunk_text(text: str, chunk_size: int = 5) -> list:
    """Split *text* into chunks of consecutive sentences.

    Sentences are obtained by splitting on '.'; surrounding whitespace is
    stripped and empty fragments are discarded. Each chunk joins up to
    ``chunk_size`` sentences with '. ' and ends with a trailing '.'.

    Args:
        text: The text to split.
        chunk_size: Maximum number of sentences per chunk (default 5).

    Returns:
        A list of chunk strings; empty when *text* contains no sentences.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A zero step would make range() fail with an unrelated message and
        # a negative step would silently yield no chunks.
        raise ValueError("chunk_size must be a positive integer")

    # Strip each fragment once, then drop the ones that end up empty.
    stripped = (fragment.strip() for fragment in text.split('.'))
    sentences = [sentence for sentence in stripped if sentence]

    return [
        '. '.join(sentences[start:start + chunk_size]) + '.'
        for start in range(0, len(sentences), chunk_size)
    ]

def analyze_content(text: str) -> Dict[str, float]:
    """Score *text* with the AI-detection classifier.

    The text is tokenized (truncated and padded to MAX_TEXT_LENGTH tokens),
    passed through ``ai_model`` with gradient tracking disabled, and the
    softmax of the logits is reported as percentages.

    Args:
        text: The text to classify.

    Returns:
        A dict with "Human_Written" (probability at index 0) and
        "AI_Generated" (probability at index 1), each as a percentage
        rounded to two decimals.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        encoded = ai_tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=MAX_TEXT_LENGTH,
            return_tensors="pt"
        )
        # Move the encoded batch to whatever device the model lives on.
        encoded = encoded.to(ai_model.device)

        with torch.no_grad():
            logits = ai_model(**encoded).logits
            probs = torch.softmax(logits, dim=1).squeeze()

        human_pct = round(probs[0].item() * 100, 2)
        ai_pct = round(probs[1].item() * 100, 2)
        return {"Human_Written": human_pct, "AI_Generated": ai_pct}
    except Exception as e:
        logger.error(f"AI analysis failed: {str(e)}")
        raise

def calculate_plagiarism(chunks: list) -> float:
    """Return the percentage of chunk pairs that are near-duplicates.

    Every chunk is embedded with ``embedder`` and all pairwise cosine
    similarities are computed. The score is the share of unordered chunk
    pairs whose similarity exceeds PLAGIARISM_THRESHOLD.

    NOTE(review): the comparison is only between chunks of the same
    document; no external corpus is consulted here.

    Args:
        chunks: Text chunks to compare with one another.

    Returns:
        A percentage in [0, 100] rounded to two decimals; 0.0 when fewer
        than two chunks are supplied.
    """
    chunk_count = len(chunks)
    if chunk_count < 2:
        return 0.0

    embeddings = embedder.encode(chunks, batch_size=32)
    similarity_matrix = cosine_similarity(embeddings)

    # The matrix is symmetric, so each pair (i, j) also appears as (j, i).
    # Look only at the strict upper triangle (i < j): that skips the
    # diagonal self-similarities and counts every unordered pair exactly
    # once, matching the n*(n-1)/2 denominator. Counting the whole matrix
    # doubled the result and allowed scores of up to 200%.
    upper_rows, upper_cols = np.triu_indices(chunk_count, k=1)
    pair_similarities = similarity_matrix[upper_rows, upper_cols]
    similar_pairs = int(np.count_nonzero(pair_similarities > PLAGIARISM_THRESHOLD))
    total_possible = chunk_count * (chunk_count - 1) // 2  # >= 1 here

    # Built-in ints in, built-in float out.
    return round((similar_pairs / total_possible) * 100, 2)

@app.post("/analyze")
async def analyze_essay(file: UploadFile = File(...)) -> Dict[str, Any]:
    """Analyze an uploaded PDF essay for AI-generated content and self-similarity.

    The upload is written to a temporary directory, its text is extracted,
    and the result combines analyze_content() with calculate_plagiarism()
    over the chunked text.

    Args:
        file: The uploaded file; its name must end in ".pdf".

    Returns:
        {"analysis": {"Human_Written", "AI_Generated", "Plagiarism_Score"},
         "status": "success"}

    Raises:
        HTTPException: 503 when the models are not loaded; 400 for a
            non-PDF upload, an unreadable PDF or a PDF without text; 500
            for any other processing failure.
    """
    if not model_status["model_loaded"]:
        raise HTTPException(503, "Service unavailable")

    # An upload may arrive without a filename; treat that as "not a PDF"
    # instead of failing with AttributeError (an unhandled 500).
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(400, "PDF files only")

    # NOTE(review): text extraction and model inference below are blocking
    # calls made inside a coroutine — consider offloading them to a worker
    # thread if request concurrency matters.
    try:
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Save the upload under a random name; the directory and its
            # contents are removed when the with-block exits.
            file_path = os.path.join(tmp_dir, f"{uuid.uuid4()}.pdf")
            with open(file_path, "wb") as f:
                shutil.copyfileobj(file.file, f)

            # Process
            text = extract_text_from_pdf(file_path)
            if not text.strip():
                raise HTTPException(400, "Empty PDF content")

            return {
                "analysis": {
                    **analyze_content(text),
                    "Plagiarism_Score": calculate_plagiarism(chunk_text(text))
                },
                "status": "success"
            }

    except HTTPException:
        # Deliberate client/server errors raised above pass through as-is.
        raise
    except Exception as e:
        # Keep the traceback in the log; the client only sees a generic 500.
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        raise HTTPException(500, "Analysis error") from e

@app.get("/health")
async def health_check() -> Dict[str, Any]:
    """Report service health based on whether the models are loaded.

    Returns:
        {"status": "operational"} when ``model_status["model_loaded"]`` is
        truthy, otherwise {"status": "degraded"}.
    """
    if model_status["model_loaded"]:
        state = "operational"
    else:
        state = "degraded"
    return {"status": state}

@app.get("/")
async def root():
    """Return a short usage hint for the API root."""
    usage_hint = "Essay Analysis API - POST PDFs to /analyze"
    return {"message": usage_hint}


# from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
# from fastapi.responses import JSONResponse
# from sentence_transformers import SentenceTransformer
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
# from PyPDF2 import PdfReader
# from sklearn.metrics.pairwise import cosine_similarity
# import torch
# import os
# import numpy as np
# import shutil
# import uuid
# import tempfile
# import logging
# import time
# from typing import Dict, Any

# # Configure logging
# logging.basicConfig(
#     level=logging.INFO,
#     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
# )
# logger = logging.getLogger(__name__)

# app = FastAPI(
#     title="Essay Analysis API",
#     description="API for AI Content Detection and Plagiarism Checking",
#     version="1.0.0",
#     docs_url="/docs",
#     redoc_url=None
# )

# # Configuration Constants
# CACHE_DIR = "/tmp/cache"
# PLAGIARISM_THRESHOLD = 0.82  # Adjusted threshold for better differentiation
# MAX_TEXT_LENGTH = 512
# MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"
# SENTENCE_MODEL = "sentence-transformers/all-roberta-large-v1"

# # Global State Management
# model_status = {
#     "model_loaded": False,
#     "last_error": None,
#     "last_reload_attempt": None,
#     "retry_count": 0
# }

# # Model References
# embedder = None
# ai_tokenizer = None
# ai_model = None

# def initialize_models():
#     """Initialize ML models with enhanced error handling"""
#     global embedder, ai_tokenizer, ai_model
    
#     try:
#         # Clear previous models and cache
#         if embedder or ai_model:
#             del embedder, ai_tokenizer, ai_model
#             torch.cuda.empty_cache()

#         logger.info("Loading sentence transformer model...")
#         embedder = SentenceTransformer(
#             SENTENCE_MODEL,
#             cache_folder=CACHE_DIR,
#             device='cuda' if torch.cuda.is_available() else 'cpu'
#         )

#         logger.info(f"Loading AI detection model: {MODEL_NAME}")
#         ai_tokenizer = AutoTokenizer.from_pretrained(
#             MODEL_NAME,
#             cache_dir=CACHE_DIR,
#             use_fast=True,
#             padding_side='left'
#         )
        
#         ai_model = AutoModelForSequenceClassification.from_pretrained(
#             MODEL_NAME,
#             cache_dir=CACHE_DIR,
#             use_safetensors=True,
#             device_map="auto",
#             trust_remote_code=True,
#             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
#         ).eval()

#         # Warmup with varied inputs
#         warmup_texts = [
#             "The quick brown fox jumps over the lazy dog.",
#             "Artificial intelligence is transforming modern society.",
#             "Climate change remains one of humanity's greatest challenges."
#         ]
        
#         device = ai_model.device
#         for text in warmup_texts:
#             inputs = ai_tokenizer(
#                 text,
#                 truncation=True,
#                 padding='max_length',
#                 max_length=MAX_TEXT_LENGTH,
#                 return_tensors="pt"
#             ).to(device)
            
#             with torch.no_grad(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
#                 outputs = ai_model(**inputs)
#                 probs = torch.softmax(outputs.logits, dim=1)
#                 logger.debug(f"Warmup prob: {probs.cpu().numpy()}")

#         model_status.update({
#             "model_loaded": True,
#             "last_error": None
#         })
#         return True

#     except Exception as e:
#         error_msg = f"Model initialization failed: {str(e)}"
#         logger.error(error_msg, exc_info=True)
#         model_status.update({
#             "model_loaded": False,
#             "last_error": error_msg
#         })
#         return False

# @app.on_event("startup")
# async def startup_event():
#     """Enhanced startup with better resource management"""
#     os.makedirs(CACHE_DIR, exist_ok=True)
#     max_retries = 3

#     while model_status["retry_count"] < max_retries:
#         if initialize_models():
#             model_status.update({"retry_count": 0})
#             return
#         model_status["retry_count"] += 1
#         logger.warning(f"Retry attempt {model_status['retry_count']}/{max_retries}")
#         time.sleep(10)
#         torch.cuda.empty_cache()

#     logger.critical("Failed to initialize models after multiple attempts")

# def extract_text_from_pdf(pdf_path: str) -> str:
#     """Robust PDF text extraction"""
#     try:
#         reader = PdfReader(pdf_path)
#         text = []
#         for page in reader.pages:
#             page_text = page.extract_text()
#             if page_text:
#                 text.append(page_text.strip())
#         return "\n".join(text)
#     except Exception as e:
#         logger.error(f"PDF extraction failed: {str(e)}")
#         raise RuntimeError("Failed to extract text from PDF")

# def chunk_text(text: str, chunk_size: int = 5) -> list:
#     """Improved text chunking with overlap"""
#     sentences = [s.strip() for s in text.split('.') if s.strip()]
#     chunks = []
#     for i in range(0, len(sentences), chunk_size):
#         start = max(0, i - 1)  # Add overlap
#         end = i + chunk_size
#         chunk = '. '.join(sentences[start:end]) + '.'
#         chunks.append(chunk)
#     return chunks

# def analyze_ai_content(text: str) -> Dict[str, float]:
#     """Enhanced AI analysis with dynamic batching"""
#     try:
#         if len(text) < 100:
#             logger.warning("Text too short for reliable analysis")
#             return {"human_written": 50.0, "ai_generated": 50.0}

#         device = ai_model.device
#         inputs = ai_tokenizer(
#             text,
#             truncation=True,
#             padding='max_length',
#             max_length=MAX_TEXT_LENGTH,
#             return_tensors="pt"
#         ).to(device)

#         # Handle long texts with sliding window
#         if inputs.input_ids.shape[1] > MAX_TEXT_LENGTH:
#             window_size = MAX_TEXT_LENGTH - 128  # 128 token overlap
#             all_probs = []
#             for i in range(0, inputs.input_ids.shape[1], window_size):
#                 chunk = inputs.input_ids[:, i:i+MAX_TEXT_LENGTH]
#                 with torch.no_grad(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
#                     outputs = ai_model(input_ids=chunk)
#                     probs = torch.softmax(outputs.logits, dim=1)
#                     all_probs.append(probs.cpu())
#             avg_probs = torch.mean(torch.cat(all_probs), dim=0)
#         else:
#             with torch.no_grad(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
#                 outputs = ai_model(**inputs)
#                 avg_probs = torch.softmax(outputs.logits, dim=1).squeeze()

#         human = avg_probs[0].item() * 100
#         ai = avg_probs[1].item() * 100

#         logger.info(f"AI detection results - Human: {human:.2f}%, AI: {ai:.2f}%")
#         return {
#             "human_written": round(human, 2),
#             "ai_generated": round(ai, 2)
#         }

#     except Exception as e:
#         logger.error(f"AI analysis error: {str(e)}", exc_info=True)
#         raise RuntimeError("Failed to analyze text content")

# def calculate_plagiarism_score(chunks: list) -> float:
#     """Improved plagiarism detection with adaptive thresholds"""
#     if len(chunks) < 2:
#         return 0.0

#     try:
#         # Dynamic batch sizing
#         batch_size = 32 if torch.cuda.is_available() else 8
#         embeddings = []
        
#         for i in range(0, len(chunks), batch_size):
#             batch = chunks[i:i+batch_size]
#             batch_embeddings = embedder.encode(
#                 batch,
#                 convert_to_tensor=True,
#                 show_progress_bar=False,
#                 normalize_embeddings=True
#             )
#             embeddings.append(batch_embeddings.cpu().numpy())

#         embeddings = np.concatenate(embeddings)
#         similarity_matrix = cosine_similarity(embeddings)
#         np.fill_diagonal(similarity_matrix, -1)  # Ignore self-similarity

#         # Adaptive threshold calculation
#         avg_sim = np.mean(similarity_matrix)
#         std_dev = np.std(similarity_matrix)
#         dynamic_threshold = min(
#             PLAGIARISM_THRESHOLD,
#             avg_sim + std_dev * 0.5
#         )
#         logger.info(f"Using dynamic plagiarism threshold: {dynamic_threshold:.3f}")

#         similar_pairs = np.sum(similarity_matrix > dynamic_threshold)
#         total_possible = len(chunks) * (len(chunks) - 1) // 2
        
#         score = round((similar_pairs / total_possible) * 100, 2) if total_possible else 0.0
#         logger.info(f"Plagiarism score: {score}%")
#         return score

#     except Exception as e:
#         logger.error(f"Plagiarism calculation error: {str(e)}", exc_info=True)
#         raise RuntimeError("Failed to calculate plagiarism score")

# @app.post("/analyze")
# async def analyze_document(file: UploadFile = File(...)) -> Dict[str, Any]:
#     """Enhanced analysis endpoint with detailed processing"""
#     start_time = time.time()
    
#     if not model_status["model_loaded"]:
#         raise HTTPException(503, "Service unavailable - models not loaded")

#     if not file.filename.lower().endswith(".pdf"):
#         raise HTTPException(400, "Only PDF files are supported")

#     try:
#         with tempfile.TemporaryDirectory() as tmp_dir:
#             # File handling
#             file_path = os.path.join(tmp_dir, f"{uuid.uuid4()}.pdf")
#             with open(file_path, "wb") as buffer:
#                 shutil.copyfileobj(file.file, buffer)

#             logger.info(f"Processing file: {file.filename}")
#             text = extract_text_from_pdf(file_path)
            
#             if not text.strip():
#                 raise HTTPException(400, "No text found in document")
                
#             text_length = len(text)
#             logger.info(f"Extracted {text_length} characters")

#             if text_length < 200:
#                 raise HTTPException(400, "Insufficient text for analysis")

#             # Core analysis
#             ai_result = analyze_ai_content(text)
#             chunks = chunk_text(text)
#             logger.info(f"Analyzing {len(chunks)} text chunks")
#             plagiarism_score = calculate_plagiarism_score(chunks)

#             # Result compilation
#             processing_time = time.time() - start_time
#             logger.info(f"Analysis completed in {processing_time:.2f}s")
            
#             return {
#                 "analysis": {
#                     "ai_detection": ai_result,
#                     "plagiarism_score": plagiarism_score,
#                     "text_metrics": {
#                         "characters": text_length,
#                         "chunks_analyzed": len(chunks)
#                     },
#                     "processing_time": round(processing_time, 2)
#                 },
#                 "status": "success"
#             }

#     except HTTPException:
#         raise
#     except Exception as e:
#         logger.error(f"Analysis pipeline failed: {str(e)}", exc_info=True)
#         raise HTTPException(500, f"Analysis failed: {str(e)}")

# @app.get("/health")
# async def health_check() -> Dict[str, Any]:
#     """Enhanced health check with resource info"""
#     return {
#         "status": "operational" if model_status["model_loaded"] else "degraded",
#         "model_loaded": model_status["model_loaded"],
#         "last_error": model_status["last_error"],
#         "system": {
#             "device": str(ai_model.device) if ai_model else "unknown",
#             "torch_version": torch.__version__,
#             "cuda_available": torch.cuda.is_available()
#         }
#     }

# @app.post("/reload-models")
# async def reload_models(background_tasks: BackgroundTasks):
#     """Model reload endpoint with resource cleanup"""
#     background_tasks.add_task(initialize_models)
#     return {"status": "reload-initiated", "message": "Model reload in progress"}

# @app.get("/")
# async def root():
#     """Root endpoint with documentation"""
#     return {
#         "service": "Essay Analysis API",
#         "version": "1.0.0",
#         "endpoints": {
#             "/analyze": "POST - Analyze PDF document",
#             "/health": "GET - System health check",
#             "/reload-models": "POST - Reload AI models"
#         }
#     }