Essay-Grader committed
Commit 296bce3 · 1 Parent(s): 618a405

Added new file
Files changed (4):
  1. Dockerfile +14 -17
  2. app.py +143 -110
  3. requirements.txt +4 -2
  4. verify_model.py +15 -0
Dockerfile CHANGED
@@ -2,34 +2,31 @@ FROM python:3.9-slim

 WORKDIR /code

-# Set cache directories to writable location
-ENV TRANSFORMERS_CACHE=/tmp/cache
-ENV HF_HOME=/tmp/cache
-ENV SENTENCE_TRANSFORMERS_HOME=/tmp/cache
-ENV XDG_CACHE_HOME=/tmp/cache
+# Hugging Face Space requirements
+ENV HF_HOME=/tmp/cache \
+    TRANSFORMERS_CACHE=/tmp/cache \
+    SENTENCE_TRANSFORMERS_HOME=/tmp/cache \
+    PATH="/home/appuser/.local/bin:${PATH}"

-# Install system dependencies
+# System dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     git \
     && rm -rf /var/lib/apt/lists/*

-# Create writable cache directory with proper permissions
-RUN mkdir -p /tmp/cache && chmod 775 /tmp/cache
-
-# Create non-root user and switch to it
-RUN useradd -m appuser && chown -R appuser /code /tmp/cache
+# Create cache directory and non-root user
+RUN mkdir -p ${HF_HOME} && chmod 777 ${HF_HOME} && \
+    useradd -m appuser && chown -R appuser /code ${HF_HOME}

 USER appuser

-# Copy requirements first for better caching
+# Install Python dependencies
 COPY --chown=appuser:appuser requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt

 # Copy application code
 COPY --chown=appuser:appuser app.py .

-# Pre-download models with correct cache location
-RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-roberta-large-v1')" && \
-    python -c "from transformers import AutoModel; AutoModel.from_pretrained('Essay-Grader/roberta-ai-detector-20250401_232702', use_safetensors=True)"
-
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+# Hugging Face Space-specific CMD
+CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
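Note: the pre-download RUN step from the previous revision is gone, so both models are now fetched on first container startup. A minimal local smoke test (not part of this commit; it assumes the image was built and started with the port published, e.g. docker run -p 7860:7860 <image>) is to poll the /health endpoint until the models report loaded:

import time
import requests

# Poll /health until startup model loading finishes (can take several
# minutes on first run, since the weights are downloaded rather than
# baked into the image).
for _ in range(30):
    try:
        health = requests.get("http://localhost:7860/health", timeout=5).json()
        if health.get("model_loaded"):
            print("ready:", health)
            break
        print("still loading, status:", health.get("status"))
    except requests.ConnectionError:
        print("server not accepting connections yet")
    time.sleep(10)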
app.py CHANGED
@@ -1,9 +1,8 @@
-# app.py: Updated API for AI detection and plagiarism checking using FastAPI
-
+# app.py: AI Detection and Plagiarism Check API
 from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
 from fastapi.responses import JSONResponse
 from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import RobertaForSequenceClassification, AutoTokenizer
 from PyPDF2 import PdfReader
 from sklearn.metrics.pairwise import cosine_similarity
 import torch
@@ -16,163 +15,191 @@ import logging
 import time
 from typing import Dict, Any

-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
 logger = logging.getLogger(__name__)

 app = FastAPI(
-    title="Essay Grader API",
-    description="API for AI content detection and plagiarism detection",
-    version="1.1.0"
+    title="Essay Analysis API",
+    description="API for AI Content Detection and Plagiarism Checking",
+    version="1.0.0",
+    docs_url="/docs",
+    redoc_url=None
 )

-# Configuration
-CACHE_DIR = "/tmp/cache"  # Writable directory for model caching
-os.makedirs(CACHE_DIR, exist_ok=True)  # Ensure cache directory exists
-PLAGIARISM_THRESHOLD = 0.85  # Similarity threshold for plagiarism detection
-MAX_TEXT_LENGTH = 512  # Maximum text length for AI detection
+# Configuration Constants
+CACHE_DIR = "/tmp/cache"
+PLAGIARISM_THRESHOLD = 0.85
+MAX_TEXT_LENGTH = 512
+MODEL_NAME = "Essay-Grader/roberta-ai-detector-20250401_232702"
+SENTENCE_MODEL = "sentence-transformers/all-roberta-large-v1"

-# Global variables to track model loading status
+# Global State Management
 model_status = {
     "model_loaded": False,
     "last_error": None,
-    "last_reload_attempt": None
+    "last_reload_attempt": None,
+    "retry_count": 0
 }

-# Global variables for models
+# Model References
 embedder = None
 ai_tokenizer = None
 ai_model = None

-def load_models_impl():
-    """Implementation of model loading logic with proper error handling"""
-    global embedder, ai_tokenizer, ai_model, model_status
-
-    model_status["last_reload_attempt"] = time.time()
-    model_status["last_error"] = None
-
+def initialize_models():
+    """Initialize ML models with error handling and retry logic"""
+    global embedder, ai_tokenizer, ai_model
+
     try:
-        # Load SentenceTransformer model
-        logger.info("Loading SentenceTransformer model...")
+        # Initialize Sentence Transformer
+        logger.info("Loading sentence transformer model...")
         embedder = SentenceTransformer(
-            'sentence-transformers/all-roberta-large-v1',
+            SENTENCE_MODEL,
             cache_folder=CACHE_DIR
         )

-        # Load AI detection model
-        ai_model_name = "Essay-Grader/roberta-ai-detector-20250401_232702"
-        logger.info(f"Loading AI detection model: {ai_model_name}")
-
-        # Load tokenizer and model
+        # Initialize AI Detection Model
+        logger.info(f"Loading AI detection model: {MODEL_NAME}")
         ai_tokenizer = AutoTokenizer.from_pretrained(
-            ai_model_name,
+            MODEL_NAME,
             cache_dir=CACHE_DIR,
             use_fast=True
         )

-        ai_model = AutoModelForSequenceClassification.from_pretrained(
-            ai_model_name,
+        # Modified to fix safetensors loading issue
+        ai_model = RobertaForSequenceClassification.from_pretrained(
+            MODEL_NAME,
             cache_dir=CACHE_DIR,
-            use_safetensors=True,
-            device_map="auto"
+            device_map="auto" if torch.cuda.is_available() else None,
+            trust_remote_code=True
         )

-        # Verify model loading
-        test_text = "Model verification text " * 50
-        inputs = ai_tokenizer(
-            test_text,
+        # Model warmup
+        test_input = ai_tokenizer(
+            "Model initialization text " * 20,
             return_tensors="pt",
             max_length=MAX_TEXT_LENGTH,
             truncation=True,
             padding=True
         )
         with torch.no_grad():
-            ai_model(**inputs)
-
-        model_status["model_loaded"] = True
-        logger.info("Models loaded successfully!")
+            # Move input tensors to model device
+            if hasattr(ai_model, "device"):
+                test_input = {k: v.to(ai_model.device) for k, v in test_input.items()}
+            ai_model(**test_input)
+
+        logger.info("All models loaded successfully")
+        model_status.update({
+            "model_loaded": True,
+            "last_error": None
+        })
         return True

     except Exception as e:
-        error_msg = f"Error loading models: {str(e)}"
+        error_msg = f"Model initialization failed: {str(e)}"
         logger.error(error_msg)
-        model_status["model_loaded"] = False
-        model_status["last_error"] = error_msg
+        model_status.update({
+            "last_error": error_msg,
+            "model_loaded": False
+        })
         return False

 @app.on_event("startup")
-async def initialize_app():
-    """Initialize application with retry logic"""
-    retries = 0
-    while retries < 3 and not model_status["model_loaded"]:
-        if load_models_impl():
-            break
-        retries += 1
-        logger.info(f"Retry {retries}/3 for model loading")
+async def startup_event():
+    """Application startup with retry logic"""
+    os.makedirs(CACHE_DIR, exist_ok=True)
+    max_retries = 3
+
+    while model_status["retry_count"] < max_retries:
+        if initialize_models():
+            model_status.update({
+                "model_loaded": True,
+                "retry_count": 0
+            })
+            return
+        model_status["retry_count"] += 1
+        logger.warning(f"Retry attempt {model_status['retry_count']}/{max_retries}")
         time.sleep(5)
-
-    if not model_status["model_loaded"]:
-        logger.error("Failed to load models after 3 attempts")
+
+    logger.critical("Failed to initialize models after multiple attempts")

 def extract_text_from_pdf(pdf_path: str) -> str:
-    """Extract text from PDF file"""
+    """Extract and concatenate text from PDF"""
     try:
         reader = PdfReader(pdf_path)
         return " ".join(page.extract_text() or "" for page in reader.pages)
     except Exception as e:
-        logger.error(f"PDF extraction failed: {e}")
+        logger.error(f"PDF extraction error: {str(e)}")
         raise RuntimeError("Failed to extract text from PDF")

 def chunk_text(text: str, chunk_size: int = 5) -> list:
-    """Split text into chunks of sentences"""
+    """Split text into coherent chunks"""
     sentences = [s.strip() for s in text.split('.') if s.strip()]
-    return ['. '.join(sentences[i:i+chunk_size]) + '.' for i in range(0, len(sentences), chunk_size)]
-
-def detect_ai_generated(text: str) -> Dict[str, float]:
-    """Detect AI-generated content and return both probabilities"""
-    inputs = ai_tokenizer(
-        text,
-        truncation=True,
-        padding=True,
-        return_tensors="pt",
-        max_length=MAX_TEXT_LENGTH
-    )
-
-    with torch.no_grad():
-        outputs = ai_model(**inputs)
-        probs = torch.softmax(outputs.logits, dim=1).squeeze()
-
-    return {
-        "human_written": round(probs[0].item() * 100, 2),
-        "ai_generated": round(probs[1].item() * 100, 2)
-    }
+    chunks = []
+    for i in range(0, len(sentences), chunk_size):
+        chunk = '. '.join(sentences[i:i+chunk_size]) + '.'
+        chunks.append(chunk)
+    return chunks
+
+def analyze_ai_content(text: str) -> Dict[str, float]:
+    """Analyze text for AI-generated content"""
+    try:
+        inputs = ai_tokenizer(
+            text,
+            truncation=True,
+            padding=True,
+            return_tensors="pt",
+            max_length=MAX_TEXT_LENGTH
+        )
+
+        # Move tensors to the same device as the model
+        device = next(ai_model.parameters()).device
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            outputs = ai_model(**inputs)
+            probs = torch.softmax(outputs.logits, dim=1).squeeze()
+
+        return {
+            "human_written": round(probs[0].item() * 100, 2),
+            "ai_generated": round(probs[1].item() * 100, 2)
+        }
+    except Exception as e:
+        logger.error(f"AI analysis failed: {str(e)}")
+        raise RuntimeError("Failed to analyze text content")

-def calculate_plagiarism_percent(chunks: list) -> float:
-    """Calculate plagiarism percentage based on text similarity"""
+def calculate_plagiarism_score(chunks: list) -> float:
+    """Calculate plagiarism percentage using similarity analysis"""
     if len(chunks) < 2:
         return 0.0

     embeddings = embedder.encode(chunks)
     similarity_matrix = cosine_similarity(embeddings)
-    np.fill_diagonal(similarity_matrix, 0)  # Ignore self-comparisons
+    np.fill_diagonal(similarity_matrix, 0)

-    # Count similar pairs
     similar_pairs = np.sum(similarity_matrix > PLAGIARISM_THRESHOLD)
-    total_possible = len(chunks) * (len(chunks) - 1) / 2
+    total_possible = len(chunks) * (len(chunks) - 1) // 2

-    return round((similar_pairs / total_possible) * 100, 2) if total_possible > 0 else 0.0
+    return round((similar_pairs / total_possible) * 100, 2) if total_possible else 0.0

 @app.post("/analyze")
-async def analyze_essay(
+async def analyze_document(
     file: UploadFile = File(...),
     background_tasks: BackgroundTasks = None
 ) -> Dict[str, Any]:
-    """Analyze PDF document for AI content and plagiarism"""
+    """Main analysis endpoint"""
     if not model_status["model_loaded"]:
-        raise HTTPException(status_code=503, detail="Models not loaded - try /reload-models")
+        raise HTTPException(
+            status_code=503,
+            detail="Service unavailable - models not loaded"
+        )

     if not file.filename.lower().endswith(".pdf"):
-        raise HTTPException(status_code=400, detail="Only PDF files supported")
+        raise HTTPException(400, "Only PDF files are supported")

     try:
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -180,46 +207,52 @@ async def analyze_essay(
             file_path = os.path.join(tmp_dir, f"{uuid.uuid4()}.pdf")
             with open(file_path, "wb") as buffer:
                 shutil.copyfileobj(file.file, buffer)
-
-            # Process PDF
+
+            # Process document
             text = extract_text_from_pdf(file_path)
             if not text.strip():
-                raise HTTPException(status_code=400, detail="No text found in PDF")
+                raise HTTPException(400, "No text found in document")

-            # Run analyses
-            ai_result = detect_ai_generated(text)
+            # Perform analysis
+            ai_result = analyze_ai_content(text)
             chunks = chunk_text(text)
-            plagiarism_percent = calculate_plagiarism_percent(chunks)
+            plagiarism_score = calculate_plagiarism_score(chunks)

             return {
                 "analysis": {
                     "ai_detection": ai_result,
-                    "plagiarism_check": plagiarism_percent
+                    "plagiarism_score": plagiarism_score
                 },
-                "status": "completed"
+                "status": "success"
             }

+    except HTTPException:
+        raise
     except Exception as e:
-        logger.error(f"Analysis failed: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+        logger.error(f"Analysis pipeline failed: {str(e)}")
+        raise HTTPException(500, f"Analysis failed: {str(e)}")

 @app.post("/reload-models")
 async def reload_models(background_tasks: BackgroundTasks):
-    """Trigger model reload"""
-    background_tasks.add_task(load_models_impl)
-    return {"status": "reload-initiated", "message": "Model reload started"}
+    """Model reload endpoint"""
+    background_tasks.add_task(initialize_models)
+    return {"status": "reload-initiated", "message": "Model reload in progress"}

 @app.get("/health")
 async def health_check() -> Dict[str, Any]:
-    """System health check"""
+    """System health endpoint"""
     return {
+        "status": "operational" if model_status["model_loaded"] else "degraded",
         "model_loaded": model_status["model_loaded"],
         "last_error": model_status["last_error"],
-        "cache_dir": CACHE_DIR,
-        "status": "operational" if model_status["model_loaded"] else "degraded"
+        "retry_count": model_status["retry_count"]
     }

 @app.get("/")
 async def root():
-    """Root endpoint with documentation redirect"""
-    return {"message": "Essay Analysis API - Visit /docs for API documentation"}
+    """Root endpoint"""
+    return {
+        "service": "Essay Analysis API",
+        "version": "1.0.0",
+        "endpoints": ["/analyze", "/health", "/reload-models"]
+    }
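For reference, a minimal client sketch for the renamed /analyze endpoint (not part of this commit; the host/port and the essay.pdf path are placeholders). The endpoint's UploadFile parameter is named file, so the multipart field must match:

import requests

# Upload a PDF to /analyze and print the JSON result.
with open("essay.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/analyze",
        files={"file": ("essay.pdf", f, "application/pdf")},
        timeout=120,  # first request may be slow while models warm up
    )
resp.raise_for_status()
# Expected shape: {"analysis": {"ai_detection": {...}, "plagiarism_score": ...}, "status": "success"}
print(resp.json())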
requirements.txt CHANGED
@@ -7,7 +7,9 @@ torch==2.3.0
 scikit-learn==1.4.0
 PyPDF2==3.0.1
 numpy==1.26.4
-pandas==2.2.1
 requests==2.31.0
+safetensors==0.4.3
+huggingface_hub>=0.23.0,<1.0
 python-multipart==0.0.9
-safetensors==0.4.3
+click==8.1.7
+accelerate>=0.23.0
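A quick post-install sanity check (a sketch, not part of this commit) can confirm the updated dependencies import and report the versions they resolved to inside the image; the module names below correspond to the distributions listed in requirements.txt:

# Import each key runtime dependency and print its reported version.
import numpy
import PyPDF2
import safetensors
import sklearn
import torch
import huggingface_hub

for mod in (torch, sklearn, PyPDF2, numpy, safetensors, huggingface_hub):
    print(mod.__name__, getattr(mod, "__version__", "unknown"))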
verify_model.py ADDED
@@ -0,0 +1,15 @@
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+model = AutoModelForSequenceClassification.from_pretrained(
+    "Essay-Grader/roberta-ai-detector-20250401_232702",
+    trust_remote_code=True,
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained(
+    "Essay-Grader/roberta-ai-detector-20250401_232702"
+)
+
+text = "Sample essay text for verification"
+inputs = tokenizer(text, return_tensors="pt")
+outputs = model(**inputs)
+print("Model output:", outputs.logits)
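If the script runs cleanly, a natural follow-up (a sketch mirroring app.py's post-processing; the label order, index 0 = human and index 1 = AI, is assumed from app.py rather than verified here) is to turn the raw logits into percentages:

import torch

# Continues from verify_model.py's `outputs`: softmax over the two logits.
probs = torch.softmax(outputs.logits, dim=1).squeeze()
print(f"human_written: {probs[0].item() * 100:.2f}%")
print(f"ai_generated: {probs[1].item() * 100:.2f}%")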