Spaces:

Dannyar608
/

Final_project

Runtime error

App Files Files Community

Dannyar608 committed on Apr 30

Commit

85bd875

verified ·

1 Parent(s): afd797f

Update app.py

Browse files

Files changed (1) hide show

app.py +269 -72

app.py CHANGED Viewed

@@ -6,6 +6,8 @@ import re
 from PyPDF2 import PdfReader
 from collections import defaultdict
 from transformers import pipeline
 # Initialize NER model (will load only if transformers is available)
 try:
@@ -15,55 +17,201 @@ except Exception as e:
     ner_pipeline = None
 # ========== IMPROVED TRANSCRIPT PARSING ==========
def extract_gpa(text, gpa_type):
    """Return the GPA value that follows a label in transcript text.

    Args:
        text: Raw transcript text to search.
        gpa_type: Label preceding the number (e.g. ``"Weighted GPA"``). It is
            interpolated into the regular expression unescaped, so regex
            metacharacters in the label are interpreted as such.

    Returns:
        The matched number as a string (``"3.85"``), or ``"N/A"`` when the
        label is not followed by a number.
    """
    # Require a well-formed decimal: the previous ``[\d\.]+`` class also
    # accepted a lone "." and swallowed a trailing sentence period ("3.85.").
    match = re.search(rf'{gpa_type}\s*(\d+(?:\.\d+)?)', text)
    if match is None:
        return "N/A"
    return match.group(1)
def extract_courses_from_table(text):
    """Collect course rows from pipe-delimited transcript text.

    Args:
        text: Transcript text containing rows of the form
            ``year | grade level | code | name | term | dst | grade | incl | credits``.

    Returns:
        A ``defaultdict(list)`` keyed by the grade-level string exactly as it
        appears in the row. Each entry is a dict with ``name`` (course code
        plus cleaned course title), ``year``, ``credits`` (string, possibly
        ``"inProgress"``) and, when a letter grade is present, ``grade``.
    """
    # This pattern matches the course table rows in the transcript
    course_pattern = re.compile(
        r'(\d{4}-\d{4})\s*'  # School year
        r'\|?\s*(\d+)\s*'     # Grade level
        r'\|?\s*([A-Z0-9]+)\s*'  # Course code
        r'\|?\s*([^\|]+?)\s*'  # Course name (captures until next pipe)
        r'(?:\|\s*[^\|]*){2}'  # Skip Term and DstNumber
        r'\|\s*([A-FW]?)\s*'   # Grade (FG column)
        r'(?:\|\s*[^\|]*)'     # Skip Incl column
        r'\|\s*([\d\.]+|inProgress)'  # Credits
    )

    courses_by_grade = defaultdict(list)
    for match in course_pattern.finditer(text):
        year_range, grade_level, course_code, course_name, grade, credit_value = match.groups()

        # Clean up course name
        course_name = course_name.strip().replace('DE:', 'Dual Enrollment:')
        # Separate a word-initial "AP" that is glued to the next character.
        # A blanket replace('AP', 'AP ') doubled the space in "AP Biology" and
        # split words that merely contain the letters ("GEOGRAPHY").
        # NOTE(review): words that *start* with "AP" (e.g. "APPLIED") are still
        # split; telling those apart needs a course-title vocabulary.
        course_name = re.sub(r'\bAP(?=\S)', 'AP ', course_name)

        course_info = {
            'name': f"{course_code} {course_name}",
            'year': year_range,
            'credits': credit_value,
        }
        grade = grade.strip()
        if grade:
            course_info['grade'] = grade
        courses_by_grade[grade_level].append(course_info)

    return courses_by_grade
-def parse_transcript(file):
-    if file.name.endswith('.pdf'):
-        text = ''
-        reader = PdfReader(file)
-        for page in reader.pages:
-            text += page.extract_text() + '\n'
         # Extract GPA information
         gpa_data = {
@@ -75,32 +223,66 @@ def parse_transcript(file):
         grade_match = re.search(r'Current Grade:\s*(\d+)', text)
         grade_level = grade_match.group(1) if grade_match else "Unknown"
-        # Extract all courses with grades and year taken
-        courses_by_grade = extract_courses_from_table(text)
         # Prepare detailed output
         output_text = f"Student Transcript Summary\n{'='*40}\n"
-        output_text += f"Current Grade Level: {grade_level}\n"
-        output_text += f"Weighted GPA: {gpa_data['weighted']}\n"
-        output_text += f"Unweighted GPA: {gpa_data['unweighted']}\n\n"
         output_text += "Course History:\n{'='*40}\n"
-        # Sort grades numerically (09, 10, 11, 12)
-        for grade in sorted(courses_by_grade.keys(), key=int):
-            output_text += f"\nGrade {grade}:\n{'-'*30}\n"
-            for course in courses_by_grade[grade]:
-                output_text += f"- {course['name']}"
-                if 'grade' in course and course['grade']:
-                    output_text += f" (Grade: {course['grade']})"
-                if 'credits' in course:
-                    output_text += f" | Credits: {course['credits']}"
-                output_text += f" | Year: {course['year']}\n"
-        return output_text, {
-            "gpa": gpa_data,
-            "grade_level": grade_level,
-            "courses": dict(courses_by_grade)
-        }
     else:
         return "Unsupported file format (PDF only for transcript parsing)", None
@@ -279,15 +461,22 @@ def transcript_display(transcript_dict):
     if isinstance(courses_by_grade, dict):
         # Sort grades numerically
-        for grade in sorted(courses_by_grade.keys(), key=int):
             display += f"\n**Grade {grade}**\n"
             for course in courses_by_grade[grade]:
-                display += f"- {course['name']}"
                 if 'grade' in course and course['grade']:
                     display += f" (Grade: {course['grade']})"
                 if 'credits' in course:
                     display += f" | Credits: {course['credits']}"
-                display += f" | Year: {course['year']}\n"
     if 'gpa' in transcript_dict:
         gpa = transcript_dict['gpa']
@@ -375,13 +564,21 @@ def generate_response(message, history):
     elif any(word in message.lower() for word in course_help):
         response = "Here's a summary of your courses:\n"
-        for grade in sorted(courses.keys(), key=int):
-            response += f"\nGrade {grade}:\n"
-            for course in courses[grade]:
-                response += f"- {course['name']}"
-                if 'grade' in course:
-                    response += f" (Grade: {course['grade']})"
-                response += "\n"
         return response
     elif "help" in message.lower():
@@ -464,4 +661,4 @@ with gr.Blocks() as app:
 # Call app.launch() only when this file is executed as a script, not on import.
 if __name__ == "__main__":
     app.launch()

 from PyPDF2 import PdfReader
 from collections import defaultdict
 from transformers import pipeline
+from typing import List, Dict, Union
+import pdfplumber
 # Initialize NER model (will load only if transformers is available)
 try:
     ner_pipeline = None
 # ========== IMPROVED TRANSCRIPT PARSING ==========
+class UniversalTranscriptParser:
+    def __init__(self):
+        # Patterns for different transcript types
+        self.patterns = {
+            'miami_dade': self._compile_miami_dade_patterns(),
+            'homeschool': self._compile_homeschool_patterns(),
+            'doral_academy': self._compile_doral_academy_patterns()
+        }
+        # Grade level mappings
+        self.grade_level_map = {
+            '09': '9th Grade', '10': '10th Grade', '11': '11th Grade', '12': '12th Grade',
+            '07': '7th Grade', '08': '8th Grade', 'MA': 'Middle School'
+        }
+    def parse_transcript(self, text: str) -> Dict[str, Union[Dict, List[Dict]]:
+        """Determine transcript type and parse accordingly"""
+        transcript_type = self._identify_transcript_type(text)
+        if transcript_type == 'homeschool':
+            return self._parse_homeschool(text)
+        elif transcript_type == 'doral_academy':
+            return self._parse_doral_academy(text)
+        else:  # Default to Miami-Dade pattern
+            return self._parse_miami_dade(text)
+    def _identify_transcript_type(self, text: str) -> str:
+        """Identify which type of transcript we're processing"""
+        if re.search(r'Sample OFFICIAL HIGH SCHOOL TRANSCRIPT', text):
+            return 'homeschool'
+        elif re.search(r'DORAL ACADEMY HIGH SCHOOL', text):
+            return 'doral_academy'
+        return 'miami_dade'
+    def _parse_homeschool(self, text: str) -> Dict[str, Union[Dict, List[Dict]]:
+        """Parse homeschool transcript format"""
+        courses = []
+        current_grade = None
+        current_year = None
+        # Extract student info
+        student_info = {}
+        name_match = re.search(r'Student Name:\s*(.+)\s*SSN:', text)
+        if name_match:
+            student_info['name'] = name_match.group(1).strip()
+        # Process each line
+        for line in text.split('\n'):
+            # Check for grade level header
+            grade_match = re.match(r'^\|?\s*(\d+th Grade)\s*\|.*(\d{4}-\d{4})', line)
+            if grade_match:
+                current_grade = grade_match.group(1)
+                current_year = grade_match.group(2)
+                continue
+            # Course line pattern
+            course_match = re.match(
+                r'^\|?\s*([^\|]+?)\s*\|\s*([A-Z][+*]?)\s*\|\s*([^\|]+)\s*\|\s*(\d+\.?\d*)\s*\|\s*(\d+)',
+                line
+            )
+            if course_match and current_grade:
+                course_name = course_match.group(1).strip()
+                # Clean course names that start with | or have extra spaces
+                course_name = re.sub(r'^\|?\s*', '', course_name)
+                courses.append({
+                    'name': course_name,
+                    'grade_level': current_grade,
+                    'school_year': current_year,
+                    'grade': course_match.group(2),
+                    'credit_type': course_match.group(3).strip(),
+                    'credits': float(course_match.group(4)),
+                    'quality_points': int(course_match.group(5)),
+                    'transcript_type': 'homeschool'
+                })
+        # Extract GPA information from homeschool transcript
+        gpa_data = {}
+        gpa_match = re.search(r'Cum\. GPA\s*\|\s*([\d\.]+)', text)
+        if gpa_match:
+            gpa_data['unweighted'] = gpa_match.group(1)
+            gpa_data['weighted'] = gpa_match.group(1)  # Homeschool often has same weighted/unweighted
+        return {
+            'student_info': student_info,
+            'courses': {'All': courses},  # Homeschool doesn't separate by grade in same way
+            'gpa': gpa_data,
+            'grade_level': current_grade.replace('th Grade', '') if current_grade else "Unknown"
         }
+    def _parse_doral_academy(self, text: str) -> Dict[str, Union[Dict, List[Dict]]]:
+        """Parse Doral Academy specific format"""
+        courses = []
+        # Extract student info
+        student_info = {}
+        name_match = re.search(r'LEGAL NAME:\s*([^\n]+)', text)
+        if name_match:
+            student_info['name'] = name_match.group(1).strip()
+        # Extract school year information
+        year_pattern = re.compile(r'YEAR:\s*(\d{4}-\d{4})\s*GRADE LEVEL:\s*(\d{2})', re.MULTILINE)
+        year_matches = year_pattern.finditer(text)
+        # Create mapping of grade levels to years
+        grade_year_map = {}
+        for match in year_matches:
+            grade_year_map[match.group(2)] = match.group(1)
+        # Course pattern for Doral Academy
+        course_pattern = re.compile(
+            r'(\d)\s+(\d{7})\s+([^\n]+?)\s+([A-Z]{2})\s+([A-Z])\s+([A-Z])\s+([A-Z])\s+(\d\.\d{2})\s+(\d\.\d{2})',
+            re.MULTILINE
+        )
+        courses_by_grade = defaultdict(list)
+        for match in course_pattern.finditer(text):
+            grade_level_num = match.group(1)
+            grade_level = self.grade_level_map.get(grade_level_num, f"Grade {grade_level_num}")
+            school_year = grade_year_map.get(grade_level_num, "Unknown")
+            course_info = {
+                'course_code': match.group(2),
+                'name': match.group(3).strip(),
+                'subject_area': match.group(4),
+                'grade': match.group(5),
+                'inclusion_status': match.group(6),
+                'credit_status': match.group(7),
+                'credits_attempted': float(match.group(8)),
+                'credits': float(match.group(9)),
+                'grade_level': grade_level,
+                'school_year': school_year,
+                'transcript_type': 'doral_academy'
+            }
+            courses_by_grade[grade_level_num].append(course_info)
+        # Extract GPA information from Doral Academy transcript
+        gpa_data = {}
+        unweighted_match = re.search(r'Un-weighted GPA\s*([\d\.]+)', text)
+        weighted_match = re.search(r'Weighted GPA\s*([\d\.]+)', text)
+        if unweighted_match:
+            gpa_data['unweighted'] = unweighted_match.group(1)
+        if weighted_match:
+            gpa_data['weighted'] = weighted_match.group(1)
+        # Extract current grade level
+        grade_match = re.search(r'GRADE LEVEL:\s*12', text)  # Adjust as needed
+        grade_level = "12" if grade_match else "Unknown"
+        return {
+            'student_info': student_info,
+            'courses': dict(courses_by_grade),
+            'gpa': gpa_data,
+            'grade_level': grade_level
+        }
+    def _parse_miami_dade(self, text: str) -> Dict[str, Union[Dict, List[Dict]]:
+        """Parse standard Miami-Dade format"""
+        courses = []
+        courses_by_grade = defaultdict(list)
+        # Extract student info
+        student_info = {}
+        name_match = re.search(r'0783977 - ([^,]+),\s*([^\n]+)', text)
+        if name_match:
+            student_info['name'] = f"{name_match.group(2)} {name_match.group(1)}"
+        # Course pattern for Miami-Dade
+        course_pattern = re.compile(
+            r'([A-Z]-[A-Za-z\s&]+)\s*\|\s*(\d{4}-\d{4})\s*\|\s*(\d{2})\s*\|\s*([A-Z0-9]+)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|\s*([A-Z]?)\s*\|\s*([A-Z]?)\s*\|\s*([^\|]+)',
+            re.MULTILINE
+        )
+        for match in course_pattern.finditer(text):
+            grade_level = self.grade_level_map.get(match.group(3), match.group(3)
+            credits = match.group(10).strip()
+            course_info = {
+                'requirement_category': match.group(1).strip(),
+                'school_year': match.group(2),
+                'grade_level': grade_level if isinstance(grade_level, str) else f"Grade {match.group(3)}",
+                'course_code': match.group(4).strip(),
+                'name': match.group(5).strip(),
+                'term': match.group(6).strip(),
+                'district_number': match.group(7).strip(),
+                'grade': match.group(8),
+                'inclusion_status': match.group(9),
+                'credits': 0.0 if 'inProgress' in credits else float(credits.replace(' ', '')),
+                'transcript_type': 'miami_dade'
+            }
+            courses_by_grade[match.group(3)].append(course_info)
         # Extract GPA information
         gpa_data = {
         grade_match = re.search(r'Current Grade:\s*(\d+)', text)
         grade_level = grade_match.group(1) if grade_match else "Unknown"
+        return {
+            'student_info': student_info,
+            'courses': dict(courses_by_grade),
+            'gpa': gpa_data,
+            'grade_level': grade_level
+        }
def extract_gpa(text, gpa_type):
    """Return the GPA value that follows a label in transcript text.

    Args:
        text: Raw transcript text to search.
        gpa_type: Label preceding the number (e.g. ``"Weighted GPA"``). It is
            interpolated into the regular expression unescaped, so regex
            metacharacters in the label are interpreted as such.

    Returns:
        The matched number as a string (``"3.85"``), or ``"N/A"`` when the
        label is not followed by a number.
    """
    # Require a well-formed decimal: the previous ``[\d\.]+`` class also
    # accepted a lone "." and swallowed a trailing sentence period ("3.85.").
    match = re.search(rf'{gpa_type}\s*(\d+(?:\.\d+)?)', text)
    if match is None:
        return "N/A"
    return match.group(1)
def parse_transcript(file):
    """Read a PDF transcript and build a plain-text summary of it.

    Args:
        file: Uploaded file object. Only ``file.name`` is used, as the path
            handed to ``pdfplumber.open``.

    Returns:
        ``(summary_text, parsed_data)`` for a ``.pdf`` file, where
        ``parsed_data`` is the dict returned by
        ``UniversalTranscriptParser.parse_transcript``. For any other
        extension, an error message and ``None``.
    """
    if not file.name.endswith('.pdf'):
        return "Unsupported file format (PDF only for transcript parsing)", None

    with pdfplumber.open(file.name) as pdf:
        # extract_text() may yield None for a page without a text layer;
        # treat that as an empty page instead of raising TypeError.
        text = ''.join((page.extract_text() or '') + '\n' for page in pdf.pages)

    parsed_data = UniversalTranscriptParser().parse_transcript(text)

    # Prepare detailed output (collected in a list and joined once)
    parts = [f"Student Transcript Summary\n{'='*40}\n"]

    if 'student_info' in parsed_data and 'name' in parsed_data['student_info']:
        parts.append(f"Student: {parsed_data['student_info']['name']}\n")

    parts.append(f"Current Grade Level: {parsed_data.get('grade_level', 'Unknown')}\n")

    if 'gpa' in parsed_data:
        gpa = parsed_data['gpa']
        parts.append(f"Weighted GPA: {gpa.get('weighted', 'N/A')}\n")
        parts.append(f"Unweighted GPA: {gpa.get('unweighted', 'N/A')}\n\n")

    # This line was a plain string, so the literal text "{'='*40}" was
    # printed instead of a 40-character rule.
    parts.append(f"Course History:\n{'='*40}\n")

    if 'courses' in parsed_data:
        courses_by_grade = parsed_data['courses']

        # Sort grades numerically (09, 10, 11, 12); keys such as 'All' are
        # not numeric, so fall back to plain string ordering for those.
        try:
            grades_sorted = sorted(courses_by_grade.keys(), key=int)
        except (TypeError, ValueError):
            grades_sorted = sorted(courses_by_grade.keys())

        for grade in grades_sorted:
            parts.append(f"\nGrade {grade}:\n{'-'*30}\n")
            for course in courses_by_grade[grade]:
                line = f"- {course.get('name', 'Unnamed Course')}"
                if 'grade' in course and course['grade']:
                    line += f" (Grade: {course['grade']})"
                if 'credits' in course:
                    line += f" | Credits: {course['credits']}"
                if 'school_year' in course:
                    line += f" | Year: {course['school_year']}"
                parts.append(line + "\n")

    return ''.join(parts), parsed_data
     if isinstance(courses_by_grade, dict):
         # Sort grades numerically
+        try:
+            grades_sorted = sorted(courses_by_grade.keys(), key=int)
+        except:
+            grades_sorted = sorted(courses_by_grade.keys())
+        for grade in grades_sorted:
             display += f"\n**Grade {grade}**\n"
             for course in courses_by_grade[grade]:
+                display += f"- {course.get('name', 'Unnamed Course')}"
                 if 'grade' in course and course['grade']:
                     display += f" (Grade: {course['grade']})"
                 if 'credits' in course:
                     display += f" | Credits: {course['credits']}"
+                if 'school_year' in course:
+                    display += f" | Year: {course['school_year']}"
+                display += "\n"
     if 'gpa' in transcript_dict:
         gpa = transcript_dict['gpa']
     elif any(word in message.lower() for word in course_help):
         response = "Here's a summary of your courses:\n"
+        if isinstance(courses, dict):
+            try:
+                grades_sorted = sorted(courses.keys(), key=int)
+            except:
+                grades_sorted = sorted(courses.keys())
+            for grade in grades_sorted:
+                response += f"\nGrade {grade}:\n"
+                for course in courses[grade]:
+                    response += f"- {course.get('name', 'Unnamed Course')}"
+                    if 'grade' in course:
+                        response += f" (Grade: {course['grade']})"
+                    response += "\n"
+        else:
+            response += "No detailed course information available."
         return response
     elif "help" in message.lower():
 # Call app.launch() only when this file is executed as a script, not on import.
 if __name__ == "__main__":
     app.launch()