Dannyar608 committed on
Commit
f17f847
·
verified ·
1 Parent(s): c4f9a1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -2
app.py CHANGED
@@ -235,12 +235,170 @@ def remove_sensitive_info(text: str) -> str:
235
  return text
236
 
237
  # ========== TRANSCRIPT PARSING ==========
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
239
  """Use AI model to parse transcript text with progress feedback"""
240
  model, tokenizer = model_loader.load_model(model_loader.current_model or DEFAULT_MODEL, progress)
241
  if model is None or tokenizer is None:
242
  raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  # Pre-process the text
245
  text = remove_sensitive_info(text[:15000]) # Limit input size
246
 
@@ -263,7 +421,7 @@ def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
263
  """
264
 
265
  try:
266
- progress(0.1, desc="Processing transcript...")
267
 
268
  # Tokenize and generate response
269
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
@@ -271,7 +429,7 @@ def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
271
 
272
  outputs = model.generate(
273
  **inputs,
274
- max_new_tokens=1500, # Reduced from original
275
  temperature=0.1,
276
  do_sample=True
277
  )
 
235
  return text
236
 
237
  # ========== TRANSCRIPT PARSING ==========
238
class TranscriptParser:
    """Structured, regex-based parser for academic transcript text.

    Populates four result attributes (student info, graduation
    requirements, full course history, in-progress courses) and computes
    an overall credit-completion summary.  NOTE(review): the regexes
    assume a specific pipe-delimited transcript layout — verify against a
    real transcript export.
    """

    def __init__(self):
        # Result containers, filled in by parse_transcript().
        self.student_data = {}
        self.requirements = {}
        self.current_courses = []
        self.course_history = []

    def parse_transcript(self, text: str) -> Dict:
        """Main method to parse transcript text.

        Runs every extraction pass over *text* and returns a dict with
        keys: student_info, requirements, current_courses,
        course_history, completion_status.
        """
        self._extract_student_info(text)
        self._extract_requirements(text)
        self._extract_course_history(text)
        self._extract_current_courses(text)

        return {
            "student_info": self.student_data,
            "requirements": self.requirements,
            "current_courses": self.current_courses,
            "course_history": self.course_history,
            "completion_status": self._calculate_completion()
        }

    def _extract_student_info(self, text: str):
        """Extract student personal information from the header lines."""
        # Header: "<7-digit id> - <name> | Cohort X | Un-weighted GPA n.n | Comm Serv Hours n"
        header_match = re.search(
            r"(\d{7}) - ([\w\s,]+)\s*\|\s*Cohort \w+\s*\|\s*Un-weighted GPA ([\d.]+)\s*\|\s*Comm Serv Hours (\d+)",
            text
        )
        if header_match:
            self.student_data = {
                "id": header_match.group(1),
                "name": header_match.group(2).strip(),
                "unweighted_gpa": float(header_match.group(3)),
                "community_service_hours": int(header_match.group(4))
            }

        # Second header line: grade level, graduation year, weighted GPA, credits.
        grade_match = re.search(
            r"Current Grade: (\d+)\s*\|\s*YOG (\d{4})\s*\|\s*Weighted GPA ([\d.]+)\s*\|\s*Total Credits Earned ([\d.]+)",
            text
        )
        if grade_match:
            self.student_data.update({
                "current_grade": grade_match.group(1),
                "graduation_year": grade_match.group(2),
                "weighted_gpa": float(grade_match.group(3)),
                "total_credits": float(grade_match.group(4))
            })

    def _extract_requirements(self, text: str):
        """Parse the graduation-requirements table (pipe-delimited rows)."""
        req_table = re.findall(
            r"\|([A-Z]-[\w\s]+)\s*\|([^\|]+)\|([\d.]+)\s*\|([\d.]+)\s*\|([\d.]+)\s*\|([^\|]+)\|",
            text
        )

        for row in req_table:
            req_name = row[0].strip()
            # row[2] = credits required, row[4] = credits completed,
            # row[5] = completion percentage (row[1]/row[3] unused here).
            self.requirements[req_name] = {
                "required": float(row[2]),
                "completed": float(row[4]),
                "status": f"{row[5].strip()}%"
            }

    def _extract_course_history(self, text: str):
        """Parse the detailed course-history table rows."""
        course_lines = re.findall(
            r"\|([A-Z]-[\w\s&\(\)]+)\s*\|(\d{4}-\d{4})\s*\|(\d{2})\s*\|([A-Z0-9]+)\s*\|([^\|]+)\|([^\|]+)\|([^\|]+)\|([A-Z])\s*\|([YRXW]?)\s*\|([^\|]+)\|",
            text
        )

        for course in course_lines:
            self.course_history.append({
                "requirement_category": course[0].strip(),
                "school_year": course[1],
                "grade_level": course[2],
                "course_code": course[3],
                "description": course[4].strip(),
                "term": course[5].strip(),
                "district_number": course[6].strip(),
                "grade": course[7],
                "inclusion_status": course[8],
                "credits": course[9].strip()
            })

    def _extract_current_courses(self, text: str):
        """Identify courses currently in progress.

        A course counts as in progress when its credits column contains
        the literal marker "inProgress".
        """
        in_progress = [c for c in self.course_history if "inProgress" in c["credits"]]
        self.current_courses = [
            {
                "course": c["description"],
                "category": c["requirement_category"],
                "term": c["term"],
                "credits": c["credits"]
            }
            for c in in_progress
        ]

    def _calculate_completion(self) -> Dict:
        """Calculate overall completion status across all requirements."""
        total_required = sum(req["required"] for req in self.requirements.values())
        total_completed = sum(req["completed"] for req in self.requirements.values())

        # Fix: guard against ZeroDivisionError when no requirement rows
        # were recognized (e.g. empty or unmatched transcript text).
        if total_required:
            percent_complete = round((total_completed / total_required) * 100, 1)
        else:
            percent_complete = 0.0

        return {
            "total_required": total_required,
            "total_completed": total_completed,
            "percent_complete": percent_complete,
            "remaining_credits": total_required - total_completed
        }

    def to_json(self) -> str:
        """Export all parsed data as a pretty-printed JSON string."""
        return json.dumps({
            "student_info": self.student_data,
            "requirements": self.requirements,
            "current_courses": self.current_courses,
            "course_history": self.course_history,
            "completion_status": self._calculate_completion()
        }, indent=2)
357
+
358
def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
    """Use AI model to parse transcript text with progress feedback.

    Tries the deterministic TranscriptParser first; if that raises for
    any reason, falls back to the AI-based parser.  The model is loaded
    up front so a load failure is reported immediately.
    """
    model, tokenizer = model_loader.load_model(model_loader.current_model or DEFAULT_MODEL, progress)
    if model is None or tokenizer is None:
        raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")

    # Attempt the structured (regex-based) parse before touching the model.
    try:
        progress(0.1, desc="Parsing transcript structure...")
        parsed = TranscriptParser().parse_transcript(text)
        progress(0.9, desc="Formatting results...")

        student = parsed["student_info"]

        # Reshape the structured output into the format downstream code expects.
        formatted = {
            "grade_level": student.get("current_grade", "Unknown"),
            "gpa": {
                "weighted": student.get("weighted_gpa", "N/A"),
                "unweighted": student.get("unweighted_gpa", "N/A")
            },
            "courses": [
                {
                    "code": entry["course_code"],
                    "name": entry["description"],
                    "grade": entry["grade"],
                    "credits": entry["credits"],
                    "year": entry["school_year"],
                    "grade_level": entry["grade_level"]
                }
                for entry in parsed["course_history"]
            ]
        }

        progress(1.0)
        return validate_parsed_data(formatted)

    except Exception as e:
        # Deliberate broad catch: any structured-parse failure triggers the AI path.
        print(f"Structured parsing failed, falling back to AI: {str(e)}")
        return parse_transcript_with_ai_fallback(text, progress)
399
+
400
+ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict:
401
+ """Fallback AI parsing method"""
402
  # Pre-process the text
403
  text = remove_sensitive_info(text[:15000]) # Limit input size
404
 
 
421
  """
422
 
423
  try:
424
+ progress(0.1, desc="Processing transcript with AI...")
425
 
426
  # Tokenize and generate response
427
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
429
 
430
  outputs = model.generate(
431
  **inputs,
432
+ max_new_tokens=1500,
433
  temperature=0.1,
434
  do_sample=True
435
  )