Spaces:

Essay-Grader
/

Detection_and_Plagiarism_Check

Running

App Files Files Community

Essay-Grader committed 8 days ago

Commit

8a681a9

0 Parent(s):

Initial commit with Docker deployment

Browse files

Files changed (5) hide show

Dockerfile +26 -0
README.md +85 -0
app/main.py +291 -0
app_loader.py +3 -0
requirements.txt +228 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+# Several pins in requirements.txt (e.g. ipython==9.0.2, scipy==1.15.2) do not
+# support Python 3.9, so the image is based on 3.11.
+FROM python:3.11-slim
+WORKDIR /code
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first for better caching
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir -r /code/requirements.txt
+# Create model cache directory
+RUN mkdir -p ./model_cache
+# Copy application code
+COPY . /code/
+# Pre-download models (this will take some time)
+RUN python -c "from sentence_transformers import SentenceTransformer; model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')"
+RUN python -c "from transformers import AutoTokenizer, AutoModelForSequenceClassification; tokenizer = AutoTokenizer.from_pretrained('ChrispamWrites/roberta-ai-detector-20250401_232702', cache_dir='./model_cache'); model = AutoModelForSequenceClassification.from_pretrained('ChrispamWrites/roberta-ai-detector-20250401_232702', cache_dir='./model_cache')"
+# Run the application on port 7860 (Hugging Face Space default port).
+# The FastAPI instance is defined in app/main.py, so the import string is app.main:app.
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,85 @@

+# Essay Grader API
+This API uses advanced AI models to evaluate essays for:
+- AI-generated content detection (identifies if content was written by AI)
+- Internal plagiarism detection (identifies repetitive patterns within the text)
+## Endpoints
+### `GET /health`
+Checks the API health status and model loading state.
+**Response:**
+```json
+{
+  "model_loaded": true,
+  "hub_accessible": true,
+  "pdf_processing": true
+}
+```
+### `POST /analyze`
+Upload a PDF essay for comprehensive analysis.
+**Request:**
+- Content-Type: multipart/form-data
+- Body: file (PDF document)
+**Response:**
+```json
+{
+  "ai_content_detection": {
+    "label": "Human-written",
+    "confidence": 92.5
+  },
+  "internal_plagiarism_score": 18.3,
+  "max_similarity_between_chunks": 45.2,
+  "chunks_analyzed": 12
+}
+```
+#### Narrowed Response
+**Response:**
+```json
+{
+  "ai_content_detection": {
+    "confidence": 92.5
+  },
+  "internal_plagiarism_score": 18.3
+}
+```
+## Usage Examples
+### Using cURL:
+```bash
+curl -X 'POST' \
+  'https://yourusername-essay-grader-api.hf.space/analyze' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: multipart/form-data' \
+  -F 'file=@your_essay.pdf'
+```
+### Using Python Requests:
+```python
+import requests
+url = "https://yourusername-essay-grader-api.hf.space/analyze"
+files = {"file": open("your_essay.pdf", "rb")}
+response = requests.post(url, files=files)
+result = response.json()
+print(result)
+```
+## Technical Details
+This API uses:
+- RoBERTa-based models for AI content detection
+- Sentence transformers for semantic analysis
+- PyPDF2 for PDF text extraction
+The application is built with FastAPI and deployed on Hugging Face Spaces.
+```Created by: Christian Mpambira(BED-COM-22-20)```

app/main.py ADDED Viewed

	@@ -0,0 +1,291 @@

+# app/main.py: API for AI detection and plagiarism checking using FastAPI
+from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
+from fastapi.responses import JSONResponse
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
+from PyPDF2 import PdfReader
+from sklearn.metrics.pairwise import cosine_similarity
+import torch
+import os
+import shutil
+import uuid
+import tempfile
+import logging
+import requests
+import time
+from typing import Dict, Any, List
+# Set up logging: INFO level with timestamp, logger name and level on every record
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# FastAPI application object; the endpoints in this module are registered on it
+app = FastAPI(
+    title="Essay Grader API",
+    description="API for AI content detection and internal plagiarism detection",
+    version="1.0.0"
+)
+# Global variables to track model loading status.
+# The dict is mutated by load_models_impl() and spread into the /health response.
+model_status = {
+    "model_loaded": False,  # set True once both models pass the post-load test
+    "hub_accessible": False,  # set True when the Hugging Face Hub check gets HTTP 200
+    "pdf_processing": True,  # not modified anywhere in the visible code
+    "last_error": None,  # message of the most recent model-loading failure, if any
+    "last_reload_attempt": None  # time.time() stamp of the most recent load attempt
+}
+# Global variables for models (None until load_models_impl() assigns them)
+embedder = None  # SentenceTransformer used to embed text chunks
+ai_tokenizer = None  # tokenizer paired with the AI-detection model
+ai_model = None  # sequence-classification model used for AI detection
+# Maximum number of retries for model loading
+MAX_RETRIES = 3
+# Time between reload attempts (in seconds)
+RELOAD_INTERVAL = 300  # 5 minutes
+def _check_hub_access():
+    """Return True when a HEAD request to the Hugging Face Hub answers with HTTP 200."""
+    try:
+        response = requests.head("https://huggingface.co", timeout=5)
+    except requests.RequestException as e:
+        logger.error(f"Error checking Hugging Face Hub connectivity: {e}")
+        return False
+    if response.status_code == 200:
+        logger.info("Successfully connected to Hugging Face Hub")
+        return True
+    logger.error(f"Failed to connect to Hugging Face Hub: {response.status_code}")
+    return False
+def load_models_impl():
+    """Load the embedding and AI-detection models and record the outcome.
+
+    Updates ``model_status`` ("last_reload_attempt", "last_error",
+    "hub_accessible", "model_loaded") and, on success, publishes the loaded
+    objects to the module globals ``embedder``, ``ai_tokenizer`` and
+    ``ai_model``.
+
+    Returns:
+        True if both models loaded and passed a one-sentence test run,
+        False otherwise (the error message is stored in
+        ``model_status["last_error"]``).
+    """
+    global embedder, ai_tokenizer, ai_model
+    # Track attempt time
+    model_status["last_reload_attempt"] = time.time()
+    model_status["last_error"] = None
+    # Re-evaluate connectivity on every attempt so a stale True from an
+    # earlier attempt cannot leak into this one.
+    model_status["hub_accessible"] = _check_hub_access()
+    # Use local cache only when the Hub is unreachable, otherwise allow downloads
+    hf_kwargs = {
+        "local_files_only": not model_status["hub_accessible"],
+        "cache_dir": "./model_cache",
+    }
+    try:
+        # Load SentenceTransformer model for embeddings
+        logger.info("Loading SentenceTransformer model...")
+        new_embedder = SentenceTransformer('sentence-transformers/all-roberta-large-v1')
+        # Load AI detection model
+        ai_model_name = "ChrispamWrites/roberta-ai-detector-20250401_232702"
+        logger.info(f"Loading AI detection model: {ai_model_name}")
+        new_tokenizer = AutoTokenizer.from_pretrained(ai_model_name, **hf_kwargs)
+        # Load the config first and modify it to match the checkpoint's expected dimensions
+        ai_config = AutoConfig.from_pretrained(ai_model_name, **hf_kwargs)
+        ai_config.max_position_embeddings = 514
+        ai_config.type_vocab_size = 1
+        # from_pretrained raises on failure rather than returning None, so the
+        # former "if ai_model is None" fallback could never run and was removed.
+        new_model = AutoModelForSequenceClassification.from_pretrained(
+            ai_model_name,
+            config=ai_config,
+            **hf_kwargs
+        )
+        # Verify models are loaded by testing them
+        test_sentence = "This is a test sentence to verify model loading."
+        _ = new_embedder.encode(test_sentence)
+        inputs = new_tokenizer(test_sentence, return_tensors="pt", max_length=512, truncation=True)
+        with torch.no_grad():
+            _ = new_model(**inputs)
+    except Exception as e:
+        error_msg = f"Error loading models: {str(e)}"
+        # logger.exception keeps the traceback next to the message
+        logger.exception(error_msg)
+        model_status["model_loaded"] = False
+        model_status["last_error"] = error_msg
+        return False
+    # Publish only after everything succeeded so the globals never hold a
+    # partially loaded set of models.
+    embedder, ai_tokenizer, ai_model = new_embedder, new_tokenizer, new_model
+    model_status["model_loaded"] = True
+    logger.info("Models loaded successfully!")
+    return True
+# Load models with proper error handling
+@app.on_event("startup")
+async def load_models():
+    """Load the models at startup, making up to MAX_RETRIES attempts.
+
+    Waits 5 seconds between attempts (not after the last one). If every
+    attempt fails the API still starts and a warning is logged.
+    """
+    import asyncio  # local import keeps this block self-contained
+    for attempt in range(1, MAX_RETRIES + 1):
+        if load_models_impl():
+            return
+        if attempt < MAX_RETRIES:
+            logger.info(f"Retrying model loading ({attempt}/{MAX_RETRIES})...")
+            await asyncio.sleep(5)  # Wait 5 seconds before retrying
+    logger.warning(f"Failed to load models after {MAX_RETRIES} attempts. API will start, but analyze endpoint won't work.")
+async def background_model_reload(background_tasks: BackgroundTasks):
+    """Background task to reload models.
+
+    Args:
+        background_tasks: accepted for compatibility with existing callers;
+            not used by this function.
+    """
+    import asyncio  # local import keeps this block self-contained
+    # load_models_impl() is blocking; running it in a worker thread keeps the
+    # event loop free to serve other requests while the models load.
+    reloaded = await asyncio.to_thread(load_models_impl)
+    if reloaded:
+        logger.info("Successfully reloaded models in background task")
+    else:
+        logger.error("Failed to reload models in background task")
+def extract_text_from_pdf(pdf_path):
+    """Return the concatenated text of every page of the PDF at ``pdf_path``.
+
+    Pages for which ``extract_text()`` returns nothing contribute an empty
+    string.
+
+    Raises:
+        RuntimeError: if the PDF cannot be opened or text extraction fails;
+            the original exception is attached as ``__cause__``.
+    """
+    try:
+        reader = PdfReader(pdf_path)
+        return "".join(page.extract_text() or "" for page in reader.pages)
+    except Exception as e:
+        logger.error(f"Failed to extract text from PDF: {e}")
+        raise RuntimeError(f"Failed to extract text: {e}") from e
+def chunk_text(text, chunk_size=5):
+    """Split ``text`` on "." and regroup the pieces into chunks.
+
+    Each chunk joins up to ``chunk_size`` consecutive pieces with "." and is
+    stripped of surrounding whitespace; chunks that end up empty are dropped.
+    """
+    pieces = text.split(".")
+    grouped = []
+    for start in range(0, len(pieces), chunk_size):
+        candidate = ".".join(pieces[start:start + chunk_size]).strip()
+        if candidate:
+            grouped.append(candidate)
+    return grouped
+def detect_ai_generated(text):
+    """Classify ``text`` with the AI-detection model.
+
+    The input is tokenized with truncation at 512 tokens. Class index 1 is
+    reported as "AI-generated", any other index as "Human-written".
+
+    Returns:
+        dict with "label" and "confidence" (softmax probability of the
+        predicted class as a percentage rounded to two decimals).
+    """
+    encoded = ai_tokenizer(text, truncation=True, padding=True, return_tensors="pt", max_length=512)
+    with torch.no_grad():
+        class_probs = torch.softmax(ai_model(**encoded).logits, dim=1).squeeze()
+    winner = torch.argmax(class_probs).item()
+    label = "Human-written"
+    if winner == 1:
+        label = "AI-generated"
+    return {
+        "label": label,
+        "confidence": round(class_probs[winner].item() * 100, 2)
+    }
+@app.get("/health")
+async def health_check() -> Dict[str, Any]:
+    """Report model status plus whether a reload is due.
+
+    Returns every key of ``model_status`` together with "reload_needed" and
+    "last_reload_attempt_time" (local time string, or None if no attempt
+    has been recorded).
+    """
+    last_attempt = model_status["last_reload_attempt"]
+    # A reload is due only when models are not loaded and either no attempt
+    # was made yet or the last one is older than RELOAD_INTERVAL.
+    if model_status["model_loaded"]:
+        reload_needed = False
+    elif last_attempt is None:
+        reload_needed = True
+    else:
+        reload_needed = time.time() - last_attempt > RELOAD_INTERVAL
+    formatted_attempt = None
+    if last_attempt:
+        formatted_attempt = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(last_attempt))
+    payload = dict(model_status)
+    payload["reload_needed"] = reload_needed
+    payload["last_reload_attempt_time"] = formatted_attempt
+    return payload
+@app.post("/reload-models")
+async def reload_models(background_tasks: BackgroundTasks):
+    """Queue a background model reload, at most once per minute.
+
+    Responds with HTTP 429 and the remaining wait time when the previous
+    load attempt started less than 60 seconds ago.
+    """
+    min_gap_seconds = 60  # Prevent reloading more than once per minute
+    previous_attempt = model_status["last_reload_attempt"]
+    if previous_attempt is not None:
+        elapsed = time.time() - previous_attempt
+        if elapsed < min_gap_seconds:
+            throttled = {
+                "message": "Too many reload attempts. Please wait before trying again.",
+                "seconds_until_next_attempt": min_gap_seconds - int(elapsed)
+            }
+            return JSONResponse(content=throttled, status_code=429)
+    background_tasks.add_task(background_model_reload, background_tasks)
+    return {"message": "Model reload initiated in background"}
+@app.post("/analyze")
+async def analyze_essay(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
+    """Analyze an uploaded PDF essay for AI-generated content and internal similarity.
+
+    Args:
+        file: the uploaded document; its filename must end with ".pdf"
+            (case-insensitive).
+        background_tasks: task queue used to schedule a model reload when
+            the models are not loaded.
+
+    Returns:
+        JSON with "ai_content_confidence", "internal_plagiarism_score"
+        (mean pairwise cosine similarity between text chunks, as a
+        percentage) and "debug_note".
+
+    Raises:
+        HTTPException: 503 if the models are unavailable, 400 for a non-PDF
+            upload, a PDF without extractable text or fewer than two text
+            chunks, 500 if extraction or analysis fails.
+    """
+    # Check if models are loaded
+    if not model_status["model_loaded"]:
+        # Check if we should attempt to reload models
+        last_attempt = model_status["last_reload_attempt"]
+        reload_needed = last_attempt is None or time.time() - last_attempt > RELOAD_INTERVAL
+        if reload_needed and background_tasks:
+            # Start a background reload
+            background_tasks.add_task(background_model_reload, background_tasks)
+            message = "Models are being reloaded in the background. Please try again in a few minutes."
+            # Return the 503 as a response object instead of raising: a raised
+            # HTTPException is rendered by the exception handler, which does
+            # not carry the task queued above, so the reload would not run.
+            return JSONResponse(status_code=503, content={"detail": message})
+        message = "Model not loaded. Check /health endpoint for details or try /reload-models endpoint."
+        raise HTTPException(status_code=503, detail=message)
+    # Check if models are actually initialized
+    if embedder is None or ai_tokenizer is None or ai_model is None:
+        logger.error("Models appear loaded but variables are None")
+        raise HTTPException(status_code=503, detail="Model initialization incomplete. Please try again later.")
+    # filename can be missing on a multipart upload; treat that as "not a PDF"
+    if not (file.filename or "").lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files are supported")
+    with tempfile.TemporaryDirectory() as tmpdir:
+        file_path = os.path.join(tmpdir, f"{uuid.uuid4()}.pdf")
+        with open(file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+        try:
+            essay_text = extract_text_from_pdf(file_path)
+        except RuntimeError as e:
+            raise HTTPException(status_code=500, detail=str(e))
+    if not essay_text.strip():
+        raise HTTPException(status_code=400, detail="The PDF seems to contain no extractable text.")
+    # Validate outside the try block below so this 400 is not caught by the
+    # broad handler and turned into a 500.
+    chunks = chunk_text(essay_text)
+    if len(chunks) < 2:
+        raise HTTPException(status_code=400, detail="Not enough text chunks to assess internal plagiarism.")
+    try:
+        # Run AI content detection
+        ai_result = detect_ai_generated(essay_text)
+        # Run internal plagiarism detection: one similarity matrix for all
+        # chunks, then the strict upper triangle gives each pair once.
+        embeddings = embedder.encode(chunks)
+        similarity_matrix = cosine_similarity(embeddings)
+        chunk_count = len(chunks)
+        # float() converts NumPy scalars so the score is always JSON serializable
+        pair_scores = [
+            float(similarity_matrix[i][j])
+            for i in range(chunk_count)
+            for j in range(i + 1, chunk_count)
+        ]
+        avg_similarity = sum(pair_scores) / len(pair_scores)
+        internal_score = round(avg_similarity * 100, 2)
+    except Exception as e:
+        logger.error(f"Error during analysis: {e}")
+        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") from e
+    return JSONResponse(content={
+        "ai_content_confidence": ai_result["confidence"],
+        "internal_plagiarism_score": internal_score,
+        "debug_note": "Processed with fixed model configuration"
+    })

app_loader.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ from app.main import app
2	+
3	+ # This file is to ensure the app (defined in app/main.py) is imported correctly by the Hugging Face Spaces environment

requirements.txt ADDED Viewed

	@@ -0,0 +1,228 @@

+absl-py==2.2.1
+accelerate==1.5.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.14
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
+argcomplete==1.10.0
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==3.0.0
+astunparse==1.6.3
+async-lru==2.0.5
+attrs==25.3.0
+babel==2.17.0
+beautifulsoup4==4.8.0
+bleach==6.2.0
+CacheControl==0.14.2
+cachetools==5.5.2
+certifi==2025.1.31
+cffi==1.17.1
+chardet==3.0.4
+charset-normalizer==3.4.1
+click==8.1.8
+colorama==0.4.6
+comm==0.2.2
+contourpy==1.3.1
+cryptography==44.0.2
+cycler==0.12.1
+datasets==3.5.0
+debugpy==1.8.13
+decorator==5.2.1
+defusedxml==0.7.1
+dill==0.3.8
+docx2txt==0.8
+EbookLib==0.17.1
+evaluate==0.4.3
+executing==2.2.0
+extract-msg==0.23.1
+fastapi==0.115.12
+fastjsonschema==2.21.1
+filelock==3.18.0
+firebase-admin==6.7.0
+flatbuffers==25.2.10
+fonttools==4.56.0
+fqdn==1.5.1
+frozenlist==1.5.0
+fsspec==2024.12.0
+gast==0.6.0
+google-api-core==2.24.2
+google-api-python-client==2.166.0
+google-auth==2.38.0
+google-auth-httplib2==0.2.0
+google-auth-oauthlib==1.2.1
+google-cloud-core==2.4.3
+google-cloud-firestore==2.20.1
+google-cloud-storage==3.1.0
+google-crc32c==1.7.1
+google-pasta==0.2.0
+google-resumable-media==2.7.2
+googleapis-common-protos==1.69.2
+grpcio==1.71.0
+grpcio-status==1.71.0
+gunicorn==23.0.0
+h11==0.14.0
+h5py==3.13.0
+httpcore==1.0.7
+httplib2==0.22.0
+httpx==0.28.1
+huggingface-hub==0.29.3
+idna==3.10
+IMAPClient==2.1.0
+ipykernel==6.29.5
+ipython==9.0.2
+ipython_pygments_lexers==1.1.1
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.2
+Jinja2==3.1.6
+joblib==1.4.2
+json5==0.10.0
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter==1.1.1
+jupyter-console==6.6.3
+jupyter-events==0.12.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.15.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.3.6
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+keras==3.9.1
+kiwisolver==1.4.8
+libclang==18.1.1
+lxml==5.3.2
+Markdown==3.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib==3.10.1
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mistune==3.1.3
+ml_dtypes==0.5.1
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.2.0
+multiprocess==0.70.16
+namex==0.0.8
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.2
+nltk==3.9.1
+notebook==7.3.3
+notebook_shim==0.2.4
+numpy==1.26.4
+oauthlib==3.2.2
+olefile==0.46
+opt_einsum==3.4.0
+optree==0.14.1
+overrides==7.7.0
+packaging==24.2
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+pdfminer.six==20181108
+pillow==11.1.0
+platformdirs==4.3.7
+prometheus_client==0.21.1
+prompt_toolkit==3.0.50
+propcache==0.3.1
+proto-plus==1.26.1
+protobuf==5.29.4
+psutil==7.0.0
+pure_eval==0.2.3
+pyarrow==19.0.1
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pycryptodome==3.22.0
+pydantic==2.11.1
+pydantic_core==2.33.0
+Pygments==2.19.1
+PyJWT==2.10.1
+pyparsing==3.2.3
+PyPDF2==3.0.1
+python-dateutil==2.9.0.post0
+python-docx==1.1.2
+python-dotenv==1.1.0
+python-json-logger==3.3.0
+python-multipart==0.0.20
+python-pptx==0.6.18
+pytz==2025.2
+# Windows-only packages: the markers make pip skip them on other platforms (e.g. the Linux Docker image)
+pywin32==310; sys_platform == "win32"
+pywinpty==2.0.15; sys_platform == "win32"
+PyYAML==6.0.2
+pyzmq==26.3.0
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+requests-oauthlib==2.0.0
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.4
+rpds-py==0.24.0
+rsa==4.9
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.2
+seaborn==0.13.2
+Send2Trash==1.8.3
+sentence-transformers==4.1.0
+six==1.12.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+SpeechRecognition==3.8.1
+stack-data==0.6.3
+starlette==0.46.1
+sympy==1.13.1
+tensorboard==2.19.0
+tensorboard-data-server==0.7.2
+tensorflow==2.19.0
+tensorflow-estimator==2.15.0
+tensorflow-intel==2.15.1; sys_platform == "win32"
+tensorflow-io-gcs-filesystem==0.31.0
+termcolor==2.5.0
+terminado==0.18.1
+textblob==0.19.0
+textract==1.6.3
+tf_keras==2.19.0
+threadpoolctl==3.6.0
+tinycss2==1.4.0
+tokenizers==0.21.1
+torch==2.6.0
+torchaudio==2.6.0
+torchvision==0.21.0
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.50.2
+types-python-dateutil==2.9.0.20241206
+typing-inspection==0.4.0
+typing_extensions==4.13.0
+tzdata==2025.2
+tzlocal==1.5.1
+uri-template==1.3.0
+uritemplate==4.1.1
+urllib3==2.3.0
+uvicorn==0.34.0
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+Werkzeug==3.1.3
+widgetsnbextension==4.0.13
+wrapt==1.14.1
+xlrd==1.2.0
+XlsxWriter==3.2.2
+xxhash==3.5.0
+yarl==1.18.3