Essay-Grader committed
Commit 296bce3 · 1 Parent(s): 618a405

Added new file
Files changed (4):
  1. Dockerfile +14 -17
  2. app.py +143 -110
  3. requirements.txt +4 -2
  4. verify_model.py +15 -0
Dockerfile CHANGED
@@ -2,34 +2,31 @@ FROM python:3.9-slim

 WORKDIR /code

-# Set cache directories to writable location
-ENV TRANSFORMERS_CACHE=/tmp/cache
-ENV HF_HOME=/tmp/cache
-ENV SENTENCE_TRANSFORMERS_HOME=/tmp/cache
-ENV XDG_CACHE_HOME=/tmp/cache
+# Hugging Face Space requirements
+ENV HF_HOME=/tmp/cache \
+    TRANSFORMERS_CACHE=/tmp/cache \
+    SENTENCE_TRANSFORMERS_HOME=/tmp/cache \
+    PATH="/home/appuser/.local/bin:${PATH}"

-# Install system dependencies
+# System dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     git \
     && rm -rf /var/lib/apt/lists/*

-# Create writable cache directory with proper permissions
-RUN mkdir -p /tmp/cache && chmod 775 /tmp/cache
-
-# Create non-root user and switch to it
-RUN useradd -m appuser && chown -R appuser /code /tmp/cache
+# Create cache directory and non-root user
+RUN mkdir -p ${HF_HOME} && chmod 777 ${HF_HOME} && \
+    useradd -m appuser && chown -R appuser /code ${HF_HOME}

 USER appuser

-# Copy requirements first for better caching
+# Install Python dependencies
 COPY --chown=appuser:appuser requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt

 # Copy application code
 COPY --chown=appuser:appuser app.py .

-# Pre-download models with correct cache location
-RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-roberta-large-v1')" && \
-    python -c "from transformers import AutoModel; AutoModel.from_pretrained('Essay-Grader/roberta-ai-detector-20250401_232702', use_safetensors=True)"
-
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+# Hugging Face Space-specific CMD
+CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
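Note: the pre-download RUN step from the previous revision is gone, so both models are now fetched on first container startup. A minimal local smoke test (not part of this commit; it assumes the image was built and started with the port published, e.g. docker run -p 7860:7860 <image>) is to poll the /health endpoint until the models report loaded:

import time
import requests

# Poll /health until startup model loading finishes (can take several
# minutes on first run, since the weights are downloaded rather than
# baked into the image).
for _ in range(30):
    try:
        health = requests.get("http://localhost:7860/health", timeout=5).json()
        if health.get("model_loaded"):
            print("ready:", health)
            break
        print("still loading, status:", health.get("status"))
    except requests.ConnectionError:
        print("server not accepting connections yet")
    time.sleep(10)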
app.py CHANGED
@@ -1,9 +1,8 @@
-# app.py: Updated API for AI detection and plagiarism checking using FastAPI
-
+# app.py: AI Detection and Plagiarism Check API
 from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
 from fastapi.responses import JSONResponse
 from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import RobertaForSequenceClassification, AutoTokenizer
 from PyPDF2 import PdfReader
 from sklearn.metrics.pairwise import cosine_similarity
 import torch
@@ -16,163 +15,191 @@ import logging
 import time
 from typing import Dict, Any

-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
 logger = logging.getLogger(__name__)

 app = FastAPI(
-    title="Essay Grader API",
-    description="API for AI content detection and plagiarism detection",
-    version="1.1.0"
+    title="Essay Analysis API",
+    description="API for AI Content Detection and Plagiarism Checking",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url=None
 )

-# Configuration
-CACHE_DIR = "/tmp/cache"  # Writable directory for model caching
-os.makedirs(CACHE_DIR, exist_ok=True)  # Ensure cache directory exists
-PLAGIARISM_THRESHOLD = 0.85  # Similarity threshold for plagiarism detection
-MAX_TEXT_LENGTH = 512  # Maximum text length for AI detection
+# Configuration Constants
+CACHE_DIR = "/tmp/cache"
+PLAGIARISM_THRESHOLD = 0.85
+MAX_TEXT_LENGTH = 512
+MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"
+SENTENCE_MODEL = "sentence-transformers/all-roberta-large-v1"

-# Global variables to track model loading status
+# Global State Management
 model_status = {
     "model_loaded": False,
     "last_error": None,
-    "last_reload_attempt": None
+    "last_reload_attempt": None,
+    "retry_count": 0
 }

-# Global variables for models
+# Model References
 embedder = None
 ai_tokenizer = None
 ai_model = None

-def load_models_impl():
-    """Implementation of model loading logic with proper error handling"""
-    global embedder, ai_tokenizer, ai_model, model_status
-
-    model_status["last_reload_attempt"] = time.time()
-    model_status["last_error"] = None
-
+def initialize_models():
+    """Initialize ML models with error handling and retry logic"""
+    global embedder, ai_tokenizer, ai_model
+
     try:
-        # Load SentenceTransformer model
-        logger.info("Loading SentenceTransformer model...")
+        # Initialize Sentence Transformer
+        logger.info("Loading sentence transformer model...")
         embedder = SentenceTransformer(
-            'sentence-transformers/all-roberta-large-v1',
+            SENTENCE_MODEL,
             cache_folder=CACHE_DIR
         )

-        # Load AI detection model
-        ai_model_name = "Essay-Grader/roberta-ai-detector-20250401_232702"
-        logger.info(f"Loading AI detection model: {ai_model_name}")
-
-        # Load tokenizer and model
+        # Initialize AI Detection Model
+        logger.info(f"Loading AI detection model: {MODEL_NAME}")
         ai_tokenizer = AutoTokenizer.from_pretrained(
-            ai_model_name,
+            MODEL_NAME,
             cache_dir=CACHE_DIR,
             use_fast=True
         )

-        ai_model = AutoModelForSequenceClassification.from_pretrained(
-            ai_model_name,
+        # Modified to fix safetensors loading issue
+        ai_model = RobertaForSequenceClassification.from_pretrained(
+            MODEL_NAME,
             cache_dir=CACHE_DIR,
-            use_safetensors=True,
-            device_map="auto"
+            device_map="auto" if torch.cuda.is_available() else None,
+            trust_remote_code=True
         )

-        # Verify model loading
-        test_text = "Model verification text " * 50
-        inputs = ai_tokenizer(
-            test_text,
+        # Model warmup
+        test_input = ai_tokenizer(
+            "Model initialization text " * 20,
             return_tensors="pt",
             max_length=MAX_TEXT_LENGTH,
             truncation=True,
             padding=True
         )
         with torch.no_grad():
-            ai_model(**inputs)
-
-        model_status["model_loaded"] = True
-        logger.info("Models loaded successfully!")
+            # Move input tensors to model device
+            if hasattr(ai_model, "device"):
+                test_input = {k: v.to(ai_model.device) for k, v in test_input.items()}
+            ai_model(**test_input)
+
+        logger.info("All models loaded successfully")
+        model_status.update({
+            "model_loaded": True,
+            "last_error": None
+        })
         return True

     except Exception as e:
-        error_msg = f"Error loading models: {str(e)}"
+        error_msg = f"Model initialization failed: {str(e)}"
         logger.error(error_msg)
-        model_status["model_loaded"] = False
-        model_status["last_error"] = error_msg
+        model_status.update({
+            "last_error": error_msg,
+            "model_loaded": False
+        })
         return False

 @app.on_event("startup")
-async def initialize_app():
-    """Initialize application with retry logic"""
-    retries = 0
-    while retries < 3 and not model_status["model_loaded"]:
-        if load_models_impl():
-            break
-        retries += 1
-        logger.info(f"Retry {retries}/3 for model loading")
+async def startup_event():
+    """Application startup with retry logic"""
+    os.makedirs(CACHE_DIR, exist_ok=True)
+    max_retries = 3
+
+    while model_status["retry_count"] < max_retries:
+        if initialize_models():
+            model_status.update({
+                "model_loaded": True,
+                "retry_count": 0
+            })
+            return
+        model_status["retry_count"] += 1
+        logger.warning(f"Retry attempt {model_status['retry_count']}/{max_retries}")
         time.sleep(5)
-
-    if not model_status["model_loaded"]:
-        logger.error("Failed to load models after 3 attempts")
+
+    logger.critical("Failed to initialize models after multiple attempts")

 def extract_text_from_pdf(pdf_path: str) -> str:
-    """Extract text from PDF file"""
+    """Extract and concatenate text from PDF"""
     try:
         reader = PdfReader(pdf_path)
         return " ".join(page.extract_text() or "" for page in reader.pages)
     except Exception as e:
-        logger.error(f"PDF extraction failed: {e}")
+        logger.error(f"PDF extraction error: {str(e)}")
         raise RuntimeError("Failed to extract text from PDF")

 def chunk_text(text: str, chunk_size: int = 5) -> list:
-    """Split text into chunks of sentences"""
+    """Split text into coherent chunks"""
     sentences = [s.strip() for s in text.split('.') if s.strip()]
-    return ['. '.join(sentences[i:i+chunk_size]) + '.' for i in range(0, len(sentences), chunk_size)]
-
-def detect_ai_generated(text: str) -> Dict[str, float]:
-    """Detect AI-generated content and return both probabilities"""
-    inputs = ai_tokenizer(
-        text,
-        truncation=True,
-        padding=True,
-        return_tensors="pt",
-        max_length=MAX_TEXT_LENGTH
-    )
-
-    with torch.no_grad():
-        outputs = ai_model(**inputs)
-        probs = torch.softmax(outputs.logits, dim=1).squeeze()
-
-    return {
-        "human_written": round(probs[0].item() * 100, 2),
-        "ai_generated": round(probs[1].item() * 100, 2)
-    }
+    chunks = []
+    for i in range(0, len(sentences), chunk_size):
+        chunk = '. '.join(sentences[i:i+chunk_size]) + '.'
+        chunks.append(chunk)
+    return chunks
+
+def analyze_ai_content(text: str) -> Dict[str, float]:
+    """Analyze text for AI-generated content"""
+    try:
+        inputs = ai_tokenizer(
+            text,
+            truncation=True,
+            padding=True,
+            return_tensors="pt",
+            max_length=MAX_TEXT_LENGTH
+        )
+
+        # Move tensors to the same device as the model
+        device = next(ai_model.parameters()).device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            outputs = ai_model(**inputs)
+            probs = torch.softmax(outputs.logits, dim=1).squeeze()
+
+        return {
+            "human_written": round(probs[0].item() * 100, 2),
+            "ai_generated": round(probs[1].item() * 100, 2)
+        }
+    except Exception as e:
+        logger.error(f"AI analysis failed: {str(e)}")
+        raise RuntimeError("Failed to analyze text content")

-def calculate_plagiarism_percent(chunks: list) -> float:
-    """Calculate plagiarism percentage based on text similarity"""
+def calculate_plagiarism_score(chunks: list) -> float:
+    """Calculate plagiarism percentage using similarity analysis"""
     if len(chunks) < 2:
         return 0.0

     embeddings = embedder.encode(chunks)
     similarity_matrix = cosine_similarity(embeddings)
-    np.fill_diagonal(similarity_matrix, 0)  # Ignore self-comparisons
+    np.fill_diagonal(similarity_matrix, 0)

-    # Count similar pairs
     similar_pairs = np.sum(similarity_matrix > PLAGIARISM_THRESHOLD)
-    total_possible = len(chunks) * (len(chunks) - 1) / 2
+    total_possible = len(chunks) * (len(chunks) - 1) // 2

-    return round((similar_pairs / total_possible) * 100, 2) if total_possible > 0 else 0.0
+    return round((similar_pairs / total_possible) * 100, 2) if total_possible else 0.0

 @app.post("/analyze")
-async def analyze_essay(
+async def analyze_document(
     file: UploadFile = File(...),
     background_tasks: BackgroundTasks = None
 ) -> Dict[str, Any]:
-    """Analyze PDF document for AI content and plagiarism"""
+    """Main analysis endpoint"""
     if not model_status["model_loaded"]:
-        raise HTTPException(status_code=503, detail="Models not loaded - try /reload-models")
+        raise HTTPException(
+            status_code=503,
+            detail="Service unavailable - models not loaded"
+        )

     if not file.filename.lower().endswith(".pdf"):
-        raise HTTPException(status_code=400, detail="Only PDF files supported")
+        raise HTTPException(400, "Only PDF files are supported")

     try:
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -180,46 +207,52 @@ async def analyze_essay(
             file_path = os.path.join(tmp_dir, f"{uuid.uuid4()}.pdf")
             with open(file_path, "wb") as buffer:
                 shutil.copyfileobj(file.file, buffer)
-
-            # Process PDF
+
+            # Process document
             text = extract_text_from_pdf(file_path)
             if not text.strip():
-                raise HTTPException(status_code=400, detail="No text found in PDF")
+                raise HTTPException(400, "No text found in document")

-            # Run analyses
-            ai_result = detect_ai_generated(text)
+            # Perform analysis
+            ai_result = analyze_ai_content(text)
             chunks = chunk_text(text)
-            plagiarism_percent = calculate_plagiarism_percent(chunks)
+            plagiarism_score = calculate_plagiarism_score(chunks)

             return {
                 "analysis": {
                     "ai_detection": ai_result,
-                    "plagiarism_check": plagiarism_percent
+                    "plagiarism_score": plagiarism_score
                 },
-                "status": "completed"
+                "status": "success"
             }

+    except HTTPException:
+        raise
     except Exception as e:
-        logger.error(f"Analysis failed: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+        logger.error(f"Analysis pipeline failed: {str(e)}")
+        raise HTTPException(500, f"Analysis failed: {str(e)}")

 @app.post("/reload-models")
 async def reload_models(background_tasks: BackgroundTasks):
-    """Trigger model reload"""
-    background_tasks.add_task(load_models_impl)
-    return {"status": "reload-initiated", "message": "Model reload started"}
+    """Model reload endpoint"""
+    background_tasks.add_task(initialize_models)
+    return {"status": "reload-initiated", "message": "Model reload in progress"}

 @app.get("/health")
 async def health_check() -> Dict[str, Any]:
-    """System health check"""
+    """System health endpoint"""
     return {
+        "status": "operational" if model_status["model_loaded"] else "degraded",
         "model_loaded": model_status["model_loaded"],
         "last_error": model_status["last_error"],
-        "cache_dir": CACHE_DIR,
-        "status": "operational" if model_status["model_loaded"] else "degraded"
+        "retry_count": model_status["retry_count"]
     }

 @app.get("/")
 async def root():
-    """Root endpoint with documentation redirect"""
-    return {"message": "Essay Analysis API - Visit /docs for API documentation"}
+    """Root endpoint"""
+    return {
+        "service": "Essay Analysis API",
+        "version": "1.0.0",
+        "endpoints": ["/analyze", "/health", "/reload-models"]
+    }
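For reference, a minimal client sketch for the renamed /analyze endpoint (not part of this commit; the host/port and the essay.pdf path are placeholders). The endpoint's UploadFile parameter is named file, so the multipart field must match:

import requests

# Upload a PDF to /analyze and print the JSON result.
with open("essay.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/analyze",
        files={"file": ("essay.pdf", f, "application/pdf")},
        timeout=120,  # first request may be slow while models warm up
    )
resp.raise_for_status()
# Expected shape: {"analysis": {"ai_detection": {...}, "plagiarism_score": ...}, "status": "success"}
print(resp.json())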
requirements.txt CHANGED
@@ -7,7 +7,9 @@ torch==2.3.0
 scikit-learn==1.4.0
 PyPDF2==3.0.1
 numpy==1.26.4
-pandas==2.2.1
 requests==2.31.0
+safetensors==0.4.3
+huggingface_hub>=0.23.0,<1.0
 python-multipart==0.0.9
-safetensors==0.4.3
+click==8.1.7
+accelerate>=0.23.0
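A quick post-install sanity check (a sketch, not part of this commit) can confirm the updated dependencies import and report the versions they resolved to inside the image; the module names below correspond to the distributions listed in requirements.txt:

# Import each key runtime dependency and print its reported version.
import numpy
import PyPDF2
import safetensors
import sklearn
import torch
import huggingface_hub

for mod in (torch, sklearn, PyPDF2, numpy, safetensors, huggingface_hub):
    print(mod.__name__, getattr(mod, "__version__", "unknown"))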
verify_model.py ADDED
@@ -0,0 +1,15 @@
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+model = AutoModelForSequenceClassification.from_pretrained(
+    "Essay-Grader/roberta-ai-detector-20250401_232702",
+    trust_remote_code=True,
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(
+    "Essay-Grader/roberta-ai-detector-20250401_232702"
+)
+
+text = "Sample essay text for verification"
+inputs = tokenizer(text, return_tensors="pt")
+outputs = model(**inputs)
+print("Model output:", outputs.logits)
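If the script runs cleanly, a natural follow-up (a sketch mirroring app.py's post-processing; the label order, index 0 = human and index 1 = AI, is assumed from app.py rather than verified here) is to turn the raw logits into percentages:

import torch

# Continues from verify_model.py's `outputs`: softmax over the two logits.
probs = torch.softmax(outputs.logits, dim=1).squeeze()
print(f"human_written: {probs[0].item() * 100:.2f}%")
print(f"ai_generated: {probs[1].item() * 100:.2f}%")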