Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -235,12 +235,170 @@ def remove_sensitive_info(text: str) -> str:
|
|
235 |
return text
|
236 |
|
237 |
# ========== TRANSCRIPT PARSING ==========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
|
239 |
"""Use AI model to parse transcript text with progress feedback"""
|
240 |
model, tokenizer = model_loader.load_model(model_loader.current_model or DEFAULT_MODEL, progress)
|
241 |
if model is None or tokenizer is None:
|
242 |
raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")
|
243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
# Pre-process the text
|
245 |
text = remove_sensitive_info(text[:15000]) # Limit input size
|
246 |
|
@@ -263,7 +421,7 @@ def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
|
|
263 |
"""
|
264 |
|
265 |
try:
|
266 |
-
progress(0.1, desc="Processing transcript...")
|
267 |
|
268 |
# Tokenize and generate response
|
269 |
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
@@ -271,7 +429,7 @@ def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
|
|
271 |
|
272 |
outputs = model.generate(
|
273 |
**inputs,
|
274 |
-
max_new_tokens=1500,
|
275 |
temperature=0.1,
|
276 |
do_sample=True
|
277 |
)
|
|
|
235 |
return text
|
236 |
|
237 |
# ========== TRANSCRIPT PARSING ==========
|
238 |
+
class TranscriptParser:
    """Rule-based parser for academic transcript text.

    Uses regular expressions to pull the student header, the graduation
    requirements table, and the course-history table out of a plain-text
    transcript, then derives the list of in-progress courses and an
    overall credit-completion summary.
    """

    def __init__(self):
        # Results of the most recent parse_transcript() call.
        self.student_data = {}
        self.requirements = {}
        self.current_courses = []
        self.course_history = []

    def parse_transcript(self, text: str) -> Dict:
        """Main method to parse transcript text.

        Returns a dict with keys: student_info, requirements,
        current_courses, course_history, completion_status.
        Safe to call repeatedly on one instance.
        """
        # Reset state up front: the extractors append into these
        # containers, so without a reset a second call on the same
        # instance would duplicate every course entry.
        self.student_data = {}
        self.requirements = {}
        self.current_courses = []
        self.course_history = []

        self._extract_student_info(text)
        self._extract_requirements(text)
        self._extract_course_history(text)
        self._extract_current_courses(text)

        return {
            "student_info": self.student_data,
            "requirements": self.requirements,
            "current_courses": self.current_courses,
            "course_history": self.course_history,
            "completion_status": self._calculate_completion()
        }

    def _extract_student_info(self, text: str):
        """Extract student personal information from the header lines."""
        # First header line: "<7-digit id> - <name> | Cohort .. | Un-weighted GPA .. | Comm Serv Hours .."
        header_match = re.search(
            r"(\d{7}) - ([\w\s,]+)\s*\|\s*Cohort \w+\s*\|\s*Un-weighted GPA ([\d.]+)\s*\|\s*Comm Serv Hours (\d+)",
            text
        )
        if header_match:
            self.student_data = {
                "id": header_match.group(1),
                "name": header_match.group(2).strip(),
                "unweighted_gpa": float(header_match.group(3)),
                "community_service_hours": int(header_match.group(4))
            }

        # Second header line: grade level / year of graduation / weighted GPA / total credits.
        grade_match = re.search(
            r"Current Grade: (\d+)\s*\|\s*YOG (\d{4})\s*\|\s*Weighted GPA ([\d.]+)\s*\|\s*Total Credits Earned ([\d.]+)",
            text
        )
        if grade_match:
            self.student_data.update({
                "current_grade": grade_match.group(1),
                "graduation_year": grade_match.group(2),
                "weighted_gpa": float(grade_match.group(3)),
                "total_credits": float(grade_match.group(4))
            })

    def _extract_requirements(self, text: str):
        """Parse the pipe-delimited graduation requirements table."""
        req_table = re.findall(
            r"\|([A-Z]-[\w\s]+)\s*\|([^\|]+)\|([\d.]+)\s*\|([\d.]+)\s*\|([\d.]+)\s*\|([^\|]+)\|",
            text
        )

        # Columns: 0=category, 2=credits required, 4=credits completed,
        # 5=percent status. Column 1 and 3 are unused here
        # (NOTE(review): presumably description and waived credits — confirm
        # against the transcript layout).
        for row in req_table:
            req_name = row[0].strip()
            self.requirements[req_name] = {
                "required": float(row[2]),
                "completed": float(row[4]),
                "status": f"{row[5].strip()}%"
            }

    def _extract_course_history(self, text: str):
        """Parse the detailed pipe-delimited course history table."""
        course_lines = re.findall(
            r"\|([A-Z]-[\w\s&\(\)]+)\s*\|(\d{4}-\d{4})\s*\|(\d{2})\s*\|([A-Z0-9]+)\s*\|([^\|]+)\|([^\|]+)\|([^\|]+)\|([A-Z])\s*\|([YRXW]?)\s*\|([^\|]+)\|",
            text
        )

        for course in course_lines:
            self.course_history.append({
                "requirement_category": course[0].strip(),
                "school_year": course[1],
                "grade_level": course[2],
                "course_code": course[3],
                "description": course[4].strip(),
                "term": course[5].strip(),
                "district_number": course[6].strip(),
                "grade": course[7],
                "inclusion_status": course[8],
                "credits": course[9].strip()
            })

    def _extract_current_courses(self, text: str):
        """Identify courses currently in progress.

        A course is considered in progress when its credits column
        contains the literal marker "inProgress".
        """
        in_progress = [c for c in self.course_history if "inProgress" in c["credits"]]
        self.current_courses = [
            {
                "course": c["description"],
                "category": c["requirement_category"],
                "term": c["term"],
                "credits": c["credits"]
            }
            for c in in_progress
        ]

    def _calculate_completion(self) -> Dict:
        """Calculate overall credit-completion status across all requirements."""
        total_required = sum(req["required"] for req in self.requirements.values())
        total_completed = sum(req["completed"] for req in self.requirements.values())

        # Guard: when no requirements were parsed (empty or unmatched
        # transcript) the original raised ZeroDivisionError here.
        if total_required:
            percent_complete = round((total_completed / total_required) * 100, 1)
        else:
            percent_complete = 0.0

        return {
            "total_required": total_required,
            "total_completed": total_completed,
            "percent_complete": percent_complete,
            "remaining_credits": total_required - total_completed
        }

    def to_json(self) -> str:
        """Export parsed data as JSON"""
        return json.dumps({
            "student_info": self.student_data,
            "requirements": self.requirements,
            "current_courses": self.current_courses,
            "course_history": self.course_history,
            "completion_status": self._calculate_completion()
        }, indent=2)
|
357 |
+
|
358 |
def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
    """Use AI model to parse transcript text with progress feedback"""
    model, tokenizer = model_loader.load_model(model_loader.current_model or DEFAULT_MODEL, progress)
    if model is None or tokenizer is None:
        raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")

    # Prefer the deterministic structured parser; the AI path is only a fallback.
    try:
        progress(0.1, desc="Parsing transcript structure...")
        parsed_data = TranscriptParser().parse_transcript(text)
        progress(0.9, desc="Formatting results...")

        student = parsed_data["student_info"]

        # Reshape the parser output into the schema the rest of the app expects.
        formatted_data = {
            "grade_level": student.get("current_grade", "Unknown"),
            "gpa": {
                "weighted": student.get("weighted_gpa", "N/A"),
                "unweighted": student.get("unweighted_gpa", "N/A"),
            },
            "courses": [
                {
                    "code": entry["course_code"],
                    "name": entry["description"],
                    "grade": entry["grade"],
                    "credits": entry["credits"],
                    "year": entry["school_year"],
                    "grade_level": entry["grade_level"],
                }
                for entry in parsed_data["course_history"]
            ],
        }

        progress(1.0)
        return validate_parsed_data(formatted_data)

    except Exception as e:
        print(f"Structured parsing failed, falling back to AI: {str(e)}")
        # Fall back to AI parsing if structured parsing fails
        return parse_transcript_with_ai_fallback(text, progress)
|
399 |
+
|
400 |
+
def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict:
|
401 |
+
"""Fallback AI parsing method"""
|
402 |
# Pre-process the text
|
403 |
text = remove_sensitive_info(text[:15000]) # Limit input size
|
404 |
|
|
|
421 |
"""
|
422 |
|
423 |
try:
|
424 |
+
progress(0.1, desc="Processing transcript with AI...")
|
425 |
|
426 |
# Tokenize and generate response
|
427 |
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
|
|
429 |
|
430 |
outputs = model.generate(
|
431 |
**inputs,
|
432 |
+
max_new_tokens=1500,
|
433 |
temperature=0.1,
|
434 |
do_sample=True
|
435 |
)
|