Dannyar608 committed on
Commit
85bd875
·
verified ·
1 Parent(s): afd797f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +269 -72
app.py CHANGED
@@ -6,6 +6,8 @@ import re
6
  from PyPDF2 import PdfReader
7
  from collections import defaultdict
8
  from transformers import pipeline
 
 
9
 
10
  # Initialize NER model (will load only if transformers is available)
11
  try:
@@ -15,55 +17,201 @@ except Exception as e:
15
  ner_pipeline = None
16
 
17
  # ========== IMPROVED TRANSCRIPT PARSING ==========
18
- def extract_gpa(text, gpa_type):
19
- pattern = rf'{gpa_type}\s*([\d\.]+)'
20
- match = re.search(pattern, text)
21
- return match.group(1) if match else "N/A"
22
-
23
- def extract_courses_from_table(text):
24
- # This pattern matches the course table rows in the transcript
25
- course_pattern = re.compile(
26
- r'(\d{4}-\d{4})\s*' # School year
27
- r'\|?\s*(\d+)\s*' # Grade level
28
- r'\|?\s*([A-Z0-9]+)\s*' # Course code
29
- r'\|?\s*([^\|]+?)\s*' # Course name (captures until next pipe)
30
- r'(?:\|\s*[^\|]*){2}' # Skip Term and DstNumber
31
- r'\|\s*([A-FW]?)\s*' # Grade (FG column)
32
- r'(?:\|\s*[^\|]*)' # Skip Incl column
33
- r'\|\s*([\d\.]+|inProgress)' # Credits
34
- )
 
 
 
 
 
 
 
 
35
 
36
- courses_by_grade = defaultdict(list)
 
 
 
 
 
 
37
 
38
- for match in re.finditer(course_pattern, text):
39
- year_range, grade_level, course_code, course_name, grade, credits = match.groups()
 
 
 
40
 
41
- # Clean up course name
42
- course_name = course_name.strip()
43
- if 'DE:' in course_name:
44
- course_name = course_name.replace('DE:', 'Dual Enrollment:')
45
- if 'AP' in course_name:
46
- course_name = course_name.replace('AP', 'AP ')
47
 
48
- course_info = {
49
- 'name': f"{course_code} {course_name}",
50
- 'year': year_range,
51
- 'credits': credits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  }
 
 
 
 
53
 
54
- if grade and grade.strip():
55
- course_info['grade'] = grade.strip()
 
 
 
56
 
57
- courses_by_grade[grade_level].append(course_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- return courses_by_grade
60
-
61
- def parse_transcript(file):
62
- if file.name.endswith('.pdf'):
63
- text = ''
64
- reader = PdfReader(file)
65
- for page in reader.pages:
66
- text += page.extract_text() + '\n'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  # Extract GPA information
69
  gpa_data = {
@@ -75,32 +223,66 @@ def parse_transcript(file):
75
  grade_match = re.search(r'Current Grade:\s*(\d+)', text)
76
  grade_level = grade_match.group(1) if grade_match else "Unknown"
77
 
78
- # Extract all courses with grades and year taken
79
- courses_by_grade = extract_courses_from_table(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  # Prepare detailed output
82
  output_text = f"Student Transcript Summary\n{'='*40}\n"
83
- output_text += f"Current Grade Level: {grade_level}\n"
84
- output_text += f"Weighted GPA: {gpa_data['weighted']}\n"
85
- output_text += f"Unweighted GPA: {gpa_data['unweighted']}\n\n"
 
 
 
 
 
 
 
 
86
  output_text += "Course History:\n{'='*40}\n"
87
 
88
- # Sort grades numerically (09, 10, 11, 12)
89
- for grade in sorted(courses_by_grade.keys(), key=int):
90
- output_text += f"\nGrade {grade}:\n{'-'*30}\n"
91
- for course in courses_by_grade[grade]:
92
- output_text += f"- {course['name']}"
93
- if 'grade' in course and course['grade']:
94
- output_text += f" (Grade: {course['grade']})"
95
- if 'credits' in course:
96
- output_text += f" | Credits: {course['credits']}"
97
- output_text += f" | Year: {course['year']}\n"
 
 
 
 
 
 
 
 
 
 
98
 
99
- return output_text, {
100
- "gpa": gpa_data,
101
- "grade_level": grade_level,
102
- "courses": dict(courses_by_grade)
103
- }
104
  else:
105
  return "Unsupported file format (PDF only for transcript parsing)", None
106
 
@@ -279,15 +461,22 @@ def transcript_display(transcript_dict):
279
 
280
  if isinstance(courses_by_grade, dict):
281
  # Sort grades numerically
282
- for grade in sorted(courses_by_grade.keys(), key=int):
 
 
 
 
 
283
  display += f"\n**Grade {grade}**\n"
284
  for course in courses_by_grade[grade]:
285
- display += f"- {course['name']}"
286
  if 'grade' in course and course['grade']:
287
  display += f" (Grade: {course['grade']})"
288
  if 'credits' in course:
289
  display += f" | Credits: {course['credits']}"
290
- display += f" | Year: {course['year']}\n"
 
 
291
 
292
  if 'gpa' in transcript_dict:
293
  gpa = transcript_dict['gpa']
@@ -375,13 +564,21 @@ def generate_response(message, history):
375
 
376
  elif any(word in message.lower() for word in course_help):
377
  response = "Here's a summary of your courses:\n"
378
- for grade in sorted(courses.keys(), key=int):
379
- response += f"\nGrade {grade}:\n"
380
- for course in courses[grade]:
381
- response += f"- {course['name']}"
382
- if 'grade' in course:
383
- response += f" (Grade: {course['grade']})"
384
- response += "\n"
 
 
 
 
 
 
 
 
385
  return response
386
 
387
  elif "help" in message.lower():
@@ -464,4 +661,4 @@ with gr.Blocks() as app:
464
 
465
  if __name__ == "__main__":
466
  app.launch()
467
-
 
6
  from PyPDF2 import PdfReader
7
  from collections import defaultdict
8
  from transformers import pipeline
9
+ from typing import List, Dict, Union
10
+ import pdfplumber
11
 
12
  # Initialize NER model (will load only if transformers is available)
13
  try:
 
17
  ner_pipeline = None
18
 
19
  # ========== IMPROVED TRANSCRIPT PARSING ==========
20
+ class UniversalTranscriptParser:
21
+ def __init__(self):
22
+ # Patterns for different transcript types
23
+ self.patterns = {
24
+ 'miami_dade': self._compile_miami_dade_patterns(),
25
+ 'homeschool': self._compile_homeschool_patterns(),
26
+ 'doral_academy': self._compile_doral_academy_patterns()
27
+ }
28
+
29
+ # Grade level mappings
30
+ self.grade_level_map = {
31
+ '09': '9th Grade', '10': '10th Grade', '11': '11th Grade', '12': '12th Grade',
32
+ '07': '7th Grade', '08': '8th Grade', 'MA': 'Middle School'
33
+ }
34
+
35
+ def parse_transcript(self, text: str) -> Dict[str, Union[Dict, List[Dict]]:
36
+ """Determine transcript type and parse accordingly"""
37
+ transcript_type = self._identify_transcript_type(text)
38
+
39
+ if transcript_type == 'homeschool':
40
+ return self._parse_homeschool(text)
41
+ elif transcript_type == 'doral_academy':
42
+ return self._parse_doral_academy(text)
43
+ else: # Default to Miami-Dade pattern
44
+ return self._parse_miami_dade(text)
45
 
46
+ def _identify_transcript_type(self, text: str) -> str:
47
+ """Identify which type of transcript we're processing"""
48
+ if re.search(r'Sample OFFICIAL HIGH SCHOOL TRANSCRIPT', text):
49
+ return 'homeschool'
50
+ elif re.search(r'DORAL ACADEMY HIGH SCHOOL', text):
51
+ return 'doral_academy'
52
+ return 'miami_dade'
53
 
54
+ def _parse_homeschool(self, text: str) -> Dict[str, Union[Dict, List[Dict]]:
55
+ """Parse homeschool transcript format"""
56
+ courses = []
57
+ current_grade = None
58
+ current_year = None
59
 
60
+ # Extract student info
61
+ student_info = {}
62
+ name_match = re.search(r'Student Name:\s*(.+)\s*SSN:', text)
63
+ if name_match:
64
+ student_info['name'] = name_match.group(1).strip()
 
65
 
66
+ # Process each line
67
+ for line in text.split('\n'):
68
+ # Check for grade level header
69
+ grade_match = re.match(r'^\|?\s*(\d+th Grade)\s*\|.*(\d{4}-\d{4})', line)
70
+ if grade_match:
71
+ current_grade = grade_match.group(1)
72
+ current_year = grade_match.group(2)
73
+ continue
74
+
75
+ # Course line pattern
76
+ course_match = re.match(
77
+ r'^\|?\s*([^\|]+?)\s*\|\s*([A-Z][+*]?)\s*\|\s*([^\|]+)\s*\|\s*(\d+\.?\d*)\s*\|\s*(\d+)',
78
+ line
79
+ )
80
+
81
+ if course_match and current_grade:
82
+ course_name = course_match.group(1).strip()
83
+ # Clean course names that start with | or have extra spaces
84
+ course_name = re.sub(r'^\|?\s*', '', course_name)
85
+
86
+ courses.append({
87
+ 'name': course_name,
88
+ 'grade_level': current_grade,
89
+ 'school_year': current_year,
90
+ 'grade': course_match.group(2),
91
+ 'credit_type': course_match.group(3).strip(),
92
+ 'credits': float(course_match.group(4)),
93
+ 'quality_points': int(course_match.group(5)),
94
+ 'transcript_type': 'homeschool'
95
+ })
96
+
97
+ # Extract GPA information from homeschool transcript
98
+ gpa_data = {}
99
+ gpa_match = re.search(r'Cum\. GPA\s*\|\s*([\d\.]+)', text)
100
+ if gpa_match:
101
+ gpa_data['unweighted'] = gpa_match.group(1)
102
+ gpa_data['weighted'] = gpa_match.group(1) # Homeschool often has same weighted/unweighted
103
+
104
+ return {
105
+ 'student_info': student_info,
106
+ 'courses': {'All': courses}, # Homeschool doesn't separate by grade in same way
107
+ 'gpa': gpa_data,
108
+ 'grade_level': current_grade.replace('th Grade', '') if current_grade else "Unknown"
109
  }
110
+
111
+ def _parse_doral_academy(self, text: str) -> Dict[str, Union[Dict, List[Dict]]]:
112
+ """Parse Doral Academy specific format"""
113
+ courses = []
114
 
115
+ # Extract student info
116
+ student_info = {}
117
+ name_match = re.search(r'LEGAL NAME:\s*([^\n]+)', text)
118
+ if name_match:
119
+ student_info['name'] = name_match.group(1).strip()
120
 
121
+ # Extract school year information
122
+ year_pattern = re.compile(r'YEAR:\s*(\d{4}-\d{4})\s*GRADE LEVEL:\s*(\d{2})', re.MULTILINE)
123
+ year_matches = year_pattern.finditer(text)
124
+
125
+ # Create mapping of grade levels to years
126
+ grade_year_map = {}
127
+ for match in year_matches:
128
+ grade_year_map[match.group(2)] = match.group(1)
129
+
130
+ # Course pattern for Doral Academy
131
+ course_pattern = re.compile(
132
+ r'(\d)\s+(\d{7})\s+([^\n]+?)\s+([A-Z]{2})\s+([A-Z])\s+([A-Z])\s+([A-Z])\s+(\d\.\d{2})\s+(\d\.\d{2})',
133
+ re.MULTILINE
134
+ )
135
+
136
+ courses_by_grade = defaultdict(list)
137
+ for match in course_pattern.finditer(text):
138
+ grade_level_num = match.group(1)
139
+ grade_level = self.grade_level_map.get(grade_level_num, f"Grade {grade_level_num}")
140
+ school_year = grade_year_map.get(grade_level_num, "Unknown")
141
+
142
+ course_info = {
143
+ 'course_code': match.group(2),
144
+ 'name': match.group(3).strip(),
145
+ 'subject_area': match.group(4),
146
+ 'grade': match.group(5),
147
+ 'inclusion_status': match.group(6),
148
+ 'credit_status': match.group(7),
149
+ 'credits_attempted': float(match.group(8)),
150
+ 'credits': float(match.group(9)),
151
+ 'grade_level': grade_level,
152
+ 'school_year': school_year,
153
+ 'transcript_type': 'doral_academy'
154
+ }
155
+
156
+ courses_by_grade[grade_level_num].append(course_info)
157
+
158
+ # Extract GPA information from Doral Academy transcript
159
+ gpa_data = {}
160
+ unweighted_match = re.search(r'Un-weighted GPA\s*([\d\.]+)', text)
161
+ weighted_match = re.search(r'Weighted GPA\s*([\d\.]+)', text)
162
+
163
+ if unweighted_match:
164
+ gpa_data['unweighted'] = unweighted_match.group(1)
165
+ if weighted_match:
166
+ gpa_data['weighted'] = weighted_match.group(1)
167
+
168
+ # Extract current grade level
169
+ grade_match = re.search(r'GRADE LEVEL:\s*12', text) # Adjust as needed
170
+ grade_level = "12" if grade_match else "Unknown"
171
+
172
+ return {
173
+ 'student_info': student_info,
174
+ 'courses': dict(courses_by_grade),
175
+ 'gpa': gpa_data,
176
+ 'grade_level': grade_level
177
+ }
178
 
179
+ def _parse_miami_dade(self, text: str) -> Dict[str, Union[Dict, List[Dict]]:
180
+ """Parse standard Miami-Dade format"""
181
+ courses = []
182
+ courses_by_grade = defaultdict(list)
183
+
184
+ # Extract student info
185
+ student_info = {}
186
+ name_match = re.search(r'0783977 - ([^,]+),\s*([^\n]+)', text)
187
+ if name_match:
188
+ student_info['name'] = f"{name_match.group(2)} {name_match.group(1)}"
189
+
190
+ # Course pattern for Miami-Dade
191
+ course_pattern = re.compile(
192
+ r'([A-Z]-[A-Za-z\s&]+)\s*\|\s*(\d{4}-\d{4})\s*\|\s*(\d{2})\s*\|\s*([A-Z0-9]+)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|\s*([A-Z]?)\s*\|\s*([A-Z]?)\s*\|\s*([^\|]+)',
193
+ re.MULTILINE
194
+ )
195
+
196
+ for match in course_pattern.finditer(text):
197
+ grade_level = self.grade_level_map.get(match.group(3), match.group(3)
198
+ credits = match.group(10).strip()
199
+
200
+ course_info = {
201
+ 'requirement_category': match.group(1).strip(),
202
+ 'school_year': match.group(2),
203
+ 'grade_level': grade_level if isinstance(grade_level, str) else f"Grade {match.group(3)}",
204
+ 'course_code': match.group(4).strip(),
205
+ 'name': match.group(5).strip(),
206
+ 'term': match.group(6).strip(),
207
+ 'district_number': match.group(7).strip(),
208
+ 'grade': match.group(8),
209
+ 'inclusion_status': match.group(9),
210
+ 'credits': 0.0 if 'inProgress' in credits else float(credits.replace(' ', '')),
211
+ 'transcript_type': 'miami_dade'
212
+ }
213
+
214
+ courses_by_grade[match.group(3)].append(course_info)
215
 
216
  # Extract GPA information
217
  gpa_data = {
 
223
  grade_match = re.search(r'Current Grade:\s*(\d+)', text)
224
  grade_level = grade_match.group(1) if grade_match else "Unknown"
225
 
226
+ return {
227
+ 'student_info': student_info,
228
+ 'courses': dict(courses_by_grade),
229
+ 'gpa': gpa_data,
230
+ 'grade_level': grade_level
231
+ }
232
+
233
def extract_gpa(text, gpa_type):
    """Return the number that follows *gpa_type* in *text*, or ``"N/A"``.

    Args:
        text: Transcript text to search.
        gpa_type: Label preceding the value, e.g. ``'Weighted GPA'``.  It is
            interpolated into the regular expression as-is, so any regex
            metacharacters in it are interpreted as such.

    Returns:
        The matched value as a string (digits with at most one decimal
        point), or ``"N/A"`` when the label is not followed by a number.
    """
    # Match a real number rather than any run of digits and dots, so that
    # "3.75." yields "3.75" and a bare "..." is not reported as a GPA.
    pattern = rf'{gpa_type}\s*(\d*\.?\d+)'
    match = re.search(pattern, text)
    return match.group(1) if match else "N/A"
237
+
238
def parse_transcript(file):
    """Parse an uploaded PDF transcript into a text summary and structured data.

    Args:
        file: Object whose ``name`` attribute is the path of the uploaded
            file.

    Returns:
        ``(summary_text, parsed_data)`` for PDF files, where ``parsed_data``
        is the dict returned by ``UniversalTranscriptParser.parse_transcript``.
        For any other extension, the tuple
        ``("Unsupported file format (PDF only for transcript parsing)", None)``.
    """
    # Check the extension first (case-insensitively) so that unsupported files
    # are rejected without building a parser or touching the file.
    if not file.name.lower().endswith('.pdf'):
        return "Unsupported file format (PDF only for transcript parsing)", None

    with pdfplumber.open(file.name) as pdf:
        # Guard against extract_text() yielding None for a page without text,
        # which would otherwise break the concatenation.
        text = ''.join((page.extract_text() or '') + '\n' for page in pdf.pages)

    parsed_data = UniversalTranscriptParser().parse_transcript(text)

    # Prepare detailed output
    parts = [f"Student Transcript Summary\n{'='*40}\n"]

    if 'student_info' in parsed_data and 'name' in parsed_data['student_info']:
        parts.append(f"Student: {parsed_data['student_info']['name']}\n")

    parts.append(f"Current Grade Level: {parsed_data.get('grade_level', 'Unknown')}\n")

    if 'gpa' in parsed_data:
        gpa = parsed_data['gpa']
        parts.append(f"Weighted GPA: {gpa.get('weighted', 'N/A')}\n")
        parts.append(f"Unweighted GPA: {gpa.get('unweighted', 'N/A')}\n\n")

    # f-string so the separator is rendered instead of printed literally.
    parts.append(f"Course History:\n{'='*40}\n")

    if 'courses' in parsed_data:
        courses_by_grade = parsed_data['courses']

        # Sort grades numerically (09, 10, 11, 12); fall back to plain string
        # order when a key such as 'All' is not a number.
        try:
            grades_sorted = sorted(courses_by_grade.keys(), key=int)
        except (TypeError, ValueError):
            grades_sorted = sorted(courses_by_grade.keys())

        for grade in grades_sorted:
            parts.append(f"\nGrade {grade}:\n{'-'*30}\n")
            for course in courses_by_grade[grade]:
                parts.append(f"- {course.get('name', 'Unnamed Course')}")
                if 'grade' in course and course['grade']:
                    parts.append(f" (Grade: {course['grade']})")
                if 'credits' in course:
                    parts.append(f" | Credits: {course['credits']}")
                if 'school_year' in course:
                    parts.append(f" | Year: {course['school_year']}")
                parts.append("\n")

    return ''.join(parts), parsed_data
288
 
 
461
 
462
  if isinstance(courses_by_grade, dict):
463
  # Sort grades numerically
464
+ try:
465
+ grades_sorted = sorted(courses_by_grade.keys(), key=int)
466
+ except:
467
+ grades_sorted = sorted(courses_by_grade.keys())
468
+
469
+ for grade in grades_sorted:
470
  display += f"\n**Grade {grade}**\n"
471
  for course in courses_by_grade[grade]:
472
+ display += f"- {course.get('name', 'Unnamed Course')}"
473
  if 'grade' in course and course['grade']:
474
  display += f" (Grade: {course['grade']})"
475
  if 'credits' in course:
476
  display += f" | Credits: {course['credits']}"
477
+ if 'school_year' in course:
478
+ display += f" | Year: {course['school_year']}"
479
+ display += "\n"
480
 
481
  if 'gpa' in transcript_dict:
482
  gpa = transcript_dict['gpa']
 
564
 
565
  elif any(word in message.lower() for word in course_help):
566
  response = "Here's a summary of your courses:\n"
567
+ if isinstance(courses, dict):
568
+ try:
569
+ grades_sorted = sorted(courses.keys(), key=int)
570
+ except:
571
+ grades_sorted = sorted(courses.keys())
572
+
573
+ for grade in grades_sorted:
574
+ response += f"\nGrade {grade}:\n"
575
+ for course in courses[grade]:
576
+ response += f"- {course.get('name', 'Unnamed Course')}"
577
+ if 'grade' in course:
578
+ response += f" (Grade: {course['grade']})"
579
+ response += "\n"
580
+ else:
581
+ response += "No detailed course information available."
582
  return response
583
 
584
  elif "help" in message.lower():
 
661
 
662
  if __name__ == "__main__":
663
  app.launch()
664
+