Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -235,12 +235,170 @@ def remove_sensitive_info(text: str) -> str:
|
|
235 |
return text
|
236 |
|
237 |
# ========== TRANSCRIPT PARSING ==========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
|
239 |
"""Use AI model to parse transcript text with progress feedback"""
|
240 |
model, tokenizer = model_loader.load_model(model_loader.current_model or DEFAULT_MODEL, progress)
|
241 |
if model is None or tokenizer is None:
|
242 |
raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")
|
243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
# Pre-process the text
|
245 |
text = remove_sensitive_info(text[:15000]) # Limit input size
|
246 |
|
@@ -263,7 +421,7 @@ def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
|
|
263 |
"""
|
264 |
|
265 |
try:
|
266 |
-
progress(0.1, desc="Processing transcript...")
|
267 |
|
268 |
# Tokenize and generate response
|
269 |
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
@@ -271,7 +429,7 @@ def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
|
|
271 |
|
272 |
outputs = model.generate(
|
273 |
**inputs,
|
274 |
-
max_new_tokens=1500,
|
275 |
temperature=0.1,
|
276 |
do_sample=True
|
277 |
)
|
|
|
235 |
return text
|
236 |
|
237 |
# ========== TRANSCRIPT PARSING ==========
|
238 |
+
class TranscriptParser:
    """Rule-based parser for academic transcript text.

    Uses regular expressions to pull the student header, the graduation
    requirements table, and the course-history table out of a plain-text
    transcript, then derives the list of in-progress courses and an
    overall credit-completion summary.
    """

    def __init__(self):
        # Results of the most recent parse_transcript() call.
        self.student_data = {}
        self.requirements = {}
        self.current_courses = []
        self.course_history = []

    def parse_transcript(self, text: str) -> Dict:
        """Main method to parse transcript text.

        Returns a dict with keys: student_info, requirements,
        current_courses, course_history, completion_status.
        Safe to call repeatedly on one instance.
        """
        # Reset state up front: the extractors append into these
        # containers, so without a reset a second call on the same
        # instance would duplicate every course entry.
        self.student_data = {}
        self.requirements = {}
        self.current_courses = []
        self.course_history = []

        self._extract_student_info(text)
        self._extract_requirements(text)
        self._extract_course_history(text)
        self._extract_current_courses(text)

        return {
            "student_info": self.student_data,
            "requirements": self.requirements,
            "current_courses": self.current_courses,
            "course_history": self.course_history,
            "completion_status": self._calculate_completion()
        }

    def _extract_student_info(self, text: str):
        """Extract student personal information from the header lines."""
        # First header line: "<7-digit id> - <name> | Cohort .. | Un-weighted GPA .. | Comm Serv Hours .."
        header_match = re.search(
            r"(\d{7}) - ([\w\s,]+)\s*\|\s*Cohort \w+\s*\|\s*Un-weighted GPA ([\d.]+)\s*\|\s*Comm Serv Hours (\d+)",
            text
        )
        if header_match:
            self.student_data = {
                "id": header_match.group(1),
                "name": header_match.group(2).strip(),
                "unweighted_gpa": float(header_match.group(3)),
                "community_service_hours": int(header_match.group(4))
            }

        # Second header line: grade level / year of graduation / weighted GPA / total credits.
        grade_match = re.search(
            r"Current Grade: (\d+)\s*\|\s*YOG (\d{4})\s*\|\s*Weighted GPA ([\d.]+)\s*\|\s*Total Credits Earned ([\d.]+)",
            text
        )
        if grade_match:
            self.student_data.update({
                "current_grade": grade_match.group(1),
                "graduation_year": grade_match.group(2),
                "weighted_gpa": float(grade_match.group(3)),
                "total_credits": float(grade_match.group(4))
            })

    def _extract_requirements(self, text: str):
        """Parse the pipe-delimited graduation requirements table."""
        req_table = re.findall(
            r"\|([A-Z]-[\w\s]+)\s*\|([^\|]+)\|([\d.]+)\s*\|([\d.]+)\s*\|([\d.]+)\s*\|([^\|]+)\|",
            text
        )

        # Columns: 0=category, 2=credits required, 4=credits completed,
        # 5=percent status. Column 1 and 3 are unused here
        # (NOTE(review): presumably description and waived credits — confirm
        # against the transcript layout).
        for row in req_table:
            req_name = row[0].strip()
            self.requirements[req_name] = {
                "required": float(row[2]),
                "completed": float(row[4]),
                "status": f"{row[5].strip()}%"
            }

    def _extract_course_history(self, text: str):
        """Parse the detailed pipe-delimited course history table."""
        course_lines = re.findall(
            r"\|([A-Z]-[\w\s&\(\)]+)\s*\|(\d{4}-\d{4})\s*\|(\d{2})\s*\|([A-Z0-9]+)\s*\|([^\|]+)\|([^\|]+)\|([^\|]+)\|([A-Z])\s*\|([YRXW]?)\s*\|([^\|]+)\|",
            text
        )

        for course in course_lines:
            self.course_history.append({
                "requirement_category": course[0].strip(),
                "school_year": course[1],
                "grade_level": course[2],
                "course_code": course[3],
                "description": course[4].strip(),
                "term": course[5].strip(),
                "district_number": course[6].strip(),
                "grade": course[7],
                "inclusion_status": course[8],
                "credits": course[9].strip()
            })

    def _extract_current_courses(self, text: str):
        """Identify courses currently in progress.

        A course is considered in progress when its credits column
        contains the literal marker "inProgress".
        """
        in_progress = [c for c in self.course_history if "inProgress" in c["credits"]]
        self.current_courses = [
            {
                "course": c["description"],
                "category": c["requirement_category"],
                "term": c["term"],
                "credits": c["credits"]
            }
            for c in in_progress
        ]

    def _calculate_completion(self) -> Dict:
        """Calculate overall credit-completion status across all requirements."""
        total_required = sum(req["required"] for req in self.requirements.values())
        total_completed = sum(req["completed"] for req in self.requirements.values())

        # Guard: when no requirements were parsed (empty or unmatched
        # transcript) the original raised ZeroDivisionError here.
        if total_required:
            percent_complete = round((total_completed / total_required) * 100, 1)
        else:
            percent_complete = 0.0

        return {
            "total_required": total_required,
            "total_completed": total_completed,
            "percent_complete": percent_complete,
            "remaining_credits": total_required - total_completed
        }

    def to_json(self) -> str:
        """Export parsed data as JSON"""
        return json.dumps({
            "student_info": self.student_data,
            "requirements": self.requirements,
            "current_courses": self.current_courses,
            "course_history": self.course_history,
            "completion_status": self._calculate_completion()
        }, indent=2)
|
357 |
+
|
358 |
def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
    """Use AI model to parse transcript text with progress feedback"""
    model, tokenizer = model_loader.load_model(model_loader.current_model or DEFAULT_MODEL, progress)
    if model is None or tokenizer is None:
        raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")

    # Prefer the deterministic structured parser; the AI path is only a fallback.
    try:
        progress(0.1, desc="Parsing transcript structure...")
        parsed_data = TranscriptParser().parse_transcript(text)
        progress(0.9, desc="Formatting results...")

        student = parsed_data["student_info"]

        # Reshape the parser output into the schema the rest of the app expects.
        formatted_data = {
            "grade_level": student.get("current_grade", "Unknown"),
            "gpa": {
                "weighted": student.get("weighted_gpa", "N/A"),
                "unweighted": student.get("unweighted_gpa", "N/A"),
            },
            "courses": [
                {
                    "code": entry["course_code"],
                    "name": entry["description"],
                    "grade": entry["grade"],
                    "credits": entry["credits"],
                    "year": entry["school_year"],
                    "grade_level": entry["grade_level"],
                }
                for entry in parsed_data["course_history"]
            ],
        }

        progress(1.0)
        return validate_parsed_data(formatted_data)

    except Exception as e:
        print(f"Structured parsing failed, falling back to AI: {str(e)}")
        # Fall back to AI parsing if structured parsing fails
        return parse_transcript_with_ai_fallback(text, progress)
|
399 |
+
|
400 |
+
def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict:
|
401 |
+
"""Fallback AI parsing method"""
|
402 |
# Pre-process the text
|
403 |
text = remove_sensitive_info(text[:15000]) # Limit input size
|
404 |
|
|
|
421 |
"""
|
422 |
|
423 |
try:
|
424 |
+
progress(0.1, desc="Processing transcript with AI...")
|
425 |
|
426 |
# Tokenize and generate response
|
427 |
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
|
|
429 |
|
430 |
outputs = model.generate(
|
431 |
**inputs,
|
432 |
+
max_new_tokens=1500,
|
433 |
temperature=0.1,
|
434 |
do_sample=True
|
435 |
)
|