Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -6,6 +6,8 @@ import re
|
|
6 |
from PyPDF2 import PdfReader
|
7 |
from collections import defaultdict
|
8 |
from transformers import pipeline
|
|
|
|
|
9 |
|
10 |
# Initialize NER model (will load only if transformers is available)
|
11 |
try:
|
@@ -15,55 +17,201 @@ except Exception as e:
|
|
15 |
ner_pipeline = None
|
16 |
|
17 |
# ========== IMPROVED TRANSCRIPT PARSING ==========
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
40 |
|
41 |
-
#
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
course_name = course_name.replace('AP', 'AP ')
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
}
|
|
|
|
|
|
|
|
|
53 |
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
56 |
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
# Extract GPA information
|
69 |
gpa_data = {
|
@@ -75,32 +223,66 @@ def parse_transcript(file):
|
|
75 |
grade_match = re.search(r'Current Grade:\s*(\d+)', text)
|
76 |
grade_level = grade_match.group(1) if grade_match else "Unknown"
|
77 |
|
78 |
-
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
# Prepare detailed output
|
82 |
output_text = f"Student Transcript Summary\n{'='*40}\n"
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
output_text += "Course History:\n{'='*40}\n"
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
|
99 |
-
return output_text,
|
100 |
-
"gpa": gpa_data,
|
101 |
-
"grade_level": grade_level,
|
102 |
-
"courses": dict(courses_by_grade)
|
103 |
-
}
|
104 |
else:
|
105 |
return "Unsupported file format (PDF only for transcript parsing)", None
|
106 |
|
@@ -279,15 +461,22 @@ def transcript_display(transcript_dict):
|
|
279 |
|
280 |
if isinstance(courses_by_grade, dict):
|
281 |
# Sort grades numerically
|
282 |
-
|
|
|
|
|
|
|
|
|
|
|
283 |
display += f"\n**Grade {grade}**\n"
|
284 |
for course in courses_by_grade[grade]:
|
285 |
-
display += f"- {course
|
286 |
if 'grade' in course and course['grade']:
|
287 |
display += f" (Grade: {course['grade']})"
|
288 |
if 'credits' in course:
|
289 |
display += f" | Credits: {course['credits']}"
|
290 |
-
|
|
|
|
|
291 |
|
292 |
if 'gpa' in transcript_dict:
|
293 |
gpa = transcript_dict['gpa']
|
@@ -375,13 +564,21 @@ def generate_response(message, history):
|
|
375 |
|
376 |
elif any(word in message.lower() for word in course_help):
|
377 |
response = "Here's a summary of your courses:\n"
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
385 |
return response
|
386 |
|
387 |
elif "help" in message.lower():
|
@@ -464,4 +661,4 @@ with gr.Blocks() as app:
|
|
464 |
|
465 |
if __name__ == "__main__":
|
466 |
app.launch()
|
467 |
-
|
|
|
6 |
from PyPDF2 import PdfReader
|
7 |
from collections import defaultdict
|
8 |
from transformers import pipeline
|
9 |
+
from typing import List, Dict, Union
|
10 |
+
import pdfplumber
|
11 |
|
12 |
# Initialize NER model (will load only if transformers is available)
|
13 |
try:
|
|
|
17 |
ner_pipeline = None
|
18 |
|
19 |
# ========== IMPROVED TRANSCRIPT PARSING ==========
|
20 |
+
class UniversalTranscriptParser:
|
21 |
+
def __init__(self):
    """Set up the per-format pattern table and the grade-code lookup.

    ``self.patterns`` maps each supported transcript type to the value
    returned by the matching ``_compile_<type>_patterns`` method.
    ``self.grade_level_map`` maps the grade codes found in transcripts
    to display labels.
    """
    # Patterns for different transcript types. The compile helpers are
    # looked up defensively: a helper that is not defined on the instance
    # leaves its slot as None instead of making construction raise
    # AttributeError.
    self.patterns = {}
    for transcript_type in ('miami_dade', 'homeschool', 'doral_academy'):
        compiler = getattr(self, f'_compile_{transcript_type}_patterns', None)
        self.patterns[transcript_type] = compiler() if callable(compiler) else None

    # Grade level mappings
    self.grade_level_map = {
        '09': '9th Grade', '10': '10th Grade', '11': '11th Grade', '12': '12th Grade',
        '07': '7th Grade', '08': '8th Grade', 'MA': 'Middle School',
    }
|
34 |
+
|
35 |
+
def parse_transcript(self, text: str) -> Dict[str, Union[Dict, List[Dict]]]:
    """Determine transcript type and parse accordingly.

    Args:
        text: Full text extracted from the transcript.

    Returns:
        The dict produced by the format-specific parser selected by
        ``_identify_transcript_type``.
    """
    transcript_type = self._identify_transcript_type(text)

    if transcript_type == 'homeschool':
        return self._parse_homeschool(text)
    if transcript_type == 'doral_academy':
        return self._parse_doral_academy(text)
    # Default to Miami-Dade pattern
    return self._parse_miami_dade(text)
|
45 |
|
46 |
+
def _identify_transcript_type(self, text: str) -> str:
    """Return the transcript type whose marker text appears in *text*.

    Markers are tried in order; when none is found the type falls back
    to ``'miami_dade'``.
    """
    markers = (
        (r'Sample OFFICIAL HIGH SCHOOL TRANSCRIPT', 'homeschool'),
        (r'DORAL ACADEMY HIGH SCHOOL', 'doral_academy'),
    )
    for marker, transcript_type in markers:
        if re.search(marker, text):
            return transcript_type
    return 'miami_dade'
|
53 |
|
54 |
+
def _parse_homeschool(self, text: str) -> Dict[str, Union[Dict, List[Dict]]]:
    """Parse homeschool transcript format.

    The text is scanned line by line. A ``| <N>th Grade | ... YYYY-YYYY``
    line sets the grade level and school year applied to the course rows
    that follow it; course rows are pipe-separated
    ``name | grade | credit type | credits | quality points``.

    Args:
        text: Full text extracted from the transcript.

    Returns:
        Dict with ``student_info``, ``courses`` (every course under the
        single key ``'All'``), ``gpa`` and ``grade_level`` (the number from
        the last grade header seen, or ``"Unknown"``).
    """
    courses = []
    current_grade = None
    current_year = None

    # Extract student info
    student_info = {}
    name_match = re.search(r'Student Name:\s*(.+)\s*SSN:', text)
    if name_match:
        student_info['name'] = name_match.group(1).strip()

    # Compiled once instead of on every line of the transcript.
    grade_header = re.compile(r'^\|?\s*(\d+th Grade)\s*\|.*(\d{4}-\d{4})')
    # The letter grade accepts a trailing '-' as well as '+'/'*', so rows
    # graded e.g. "A-" are not silently dropped.
    course_row = re.compile(
        r'^\|?\s*([^\|]+?)\s*\|\s*([A-Z][+\-*]?)\s*\|\s*([^\|]+)\s*\|\s*(\d+\.?\d*)\s*\|\s*(\d+)'
    )

    # Process each line
    for line in text.split('\n'):
        # Check for grade level header
        header = grade_header.match(line)
        if header:
            current_grade, current_year = header.groups()
            continue

        # Course rows only count once a grade header has been seen.
        if not current_grade:
            continue
        row = course_row.match(line)
        if not row:
            continue

        courses.append({
            'name': row.group(1).strip(),
            'grade_level': current_grade,
            'school_year': current_year,
            'grade': row.group(2),
            'credit_type': row.group(3).strip(),
            'credits': float(row.group(4)),
            'quality_points': int(row.group(5)),
            'transcript_type': 'homeschool',
        })

    # Extract GPA information from homeschool transcript
    gpa_data = {}
    gpa_match = re.search(r'Cum\. GPA\s*\|\s*([\d\.]+)', text)
    if gpa_match:
        gpa_data['unweighted'] = gpa_match.group(1)
        gpa_data['weighted'] = gpa_match.group(1)  # Homeschool often has same weighted/unweighted

    return {
        'student_info': student_info,
        'courses': {'All': courses},  # Homeschool doesn't separate by grade in same way
        'gpa': gpa_data,
        'grade_level': current_grade.replace('th Grade', '') if current_grade else "Unknown",
    }
|
110 |
+
|
111 |
+
def _parse_doral_academy(self, text: str) -> Dict[str, Union[Dict, List[Dict]]]:
    """Parse Doral Academy specific format.

    Args:
        text: Full text extracted from the transcript.

    Returns:
        Dict with ``student_info``, ``courses`` (lists of course dicts keyed
        by the un-padded grade number as a string), ``gpa`` and
        ``grade_level`` (the highest ``GRADE LEVEL:`` value found in the
        text, or ``"Unknown"`` when there is none).
    """
    # Extract student info
    student_info = {}
    name_match = re.search(r'LEGAL NAME:\s*([^\n]+)', text)
    if name_match:
        student_info['name'] = name_match.group(1).strip()

    # Map each two-digit grade level to its school year; a later header for
    # the same grade overrides an earlier one.
    year_pattern = re.compile(r'YEAR:\s*(\d{4}-\d{4})\s*GRADE LEVEL:\s*(\d{2})', re.MULTILINE)
    grade_year_map = {m.group(2): m.group(1) for m in year_pattern.finditer(text)}

    # Course pattern for Doral Academy. The leading grade is one or two
    # digits and must not start in the middle of a longer number, so "10"
    # is captured whole rather than as its last digit.
    course_pattern = re.compile(
        r'(?<!\d)(\d{1,2})\s+(\d{7})\s+([^\n]+?)\s+([A-Z]{2})\s+([A-Z])\s+([A-Z])\s+([A-Z])\s+(\d\.\d{2})\s+(\d\.\d{2})',
        re.MULTILINE
    )

    courses_by_grade = defaultdict(list)
    for match in course_pattern.finditer(text):
        grade_key = str(int(match.group(1)))  # un-padded: '09' and '9' -> '9'
        # Both lookup tables are keyed by zero-padded two-digit codes.
        grade_code = grade_key.zfill(2)

        courses_by_grade[grade_key].append({
            'course_code': match.group(2),
            'name': match.group(3).strip(),
            'subject_area': match.group(4),
            'grade': match.group(5),
            'inclusion_status': match.group(6),
            'credit_status': match.group(7),
            'credits_attempted': float(match.group(8)),
            'credits': float(match.group(9)),
            'grade_level': self.grade_level_map.get(grade_code, f"Grade {grade_key}"),
            'school_year': grade_year_map.get(grade_code, "Unknown"),
            'transcript_type': 'doral_academy',
        })

    # Extract GPA information from Doral Academy transcript
    gpa_data = {}
    unweighted_match = re.search(r'Un-weighted GPA\s*([\d\.]+)', text)
    weighted_match = re.search(r'Weighted GPA\s*([\d\.]+)', text)

    if unweighted_match:
        gpa_data['unweighted'] = unweighted_match.group(1)
    if weighted_match:
        gpa_data['weighted'] = weighted_match.group(1)

    # Current grade level: the highest GRADE LEVEL listed, rather than
    # recognising only grade 12.
    listed_levels = re.findall(r'GRADE LEVEL:\s*(\d{1,2})', text)
    grade_level = str(max(int(level) for level in listed_levels)) if listed_levels else "Unknown"

    return {
        'student_info': student_info,
        'courses': dict(courses_by_grade),
        'gpa': gpa_data,
        'grade_level': grade_level,
    }
|
178 |
|
179 |
+
def _parse_miami_dade(self, text: str) -> Dict[str, Union[Dict, List[Dict]]:
|
180 |
+
"""Parse standard Miami-Dade format"""
|
181 |
+
courses = []
|
182 |
+
courses_by_grade = defaultdict(list)
|
183 |
+
|
184 |
+
# Extract student info
|
185 |
+
student_info = {}
|
186 |
+
name_match = re.search(r'0783977 - ([^,]+),\s*([^\n]+)', text)
|
187 |
+
if name_match:
|
188 |
+
student_info['name'] = f"{name_match.group(2)} {name_match.group(1)}"
|
189 |
+
|
190 |
+
# Course pattern for Miami-Dade
|
191 |
+
course_pattern = re.compile(
|
192 |
+
r'([A-Z]-[A-Za-z\s&]+)\s*\|\s*(\d{4}-\d{4})\s*\|\s*(\d{2})\s*\|\s*([A-Z0-9]+)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|\s*([A-Z]?)\s*\|\s*([A-Z]?)\s*\|\s*([^\|]+)',
|
193 |
+
re.MULTILINE
|
194 |
+
)
|
195 |
+
|
196 |
+
for match in course_pattern.finditer(text):
|
197 |
+
grade_level = self.grade_level_map.get(match.group(3), match.group(3)
|
198 |
+
credits = match.group(10).strip()
|
199 |
+
|
200 |
+
course_info = {
|
201 |
+
'requirement_category': match.group(1).strip(),
|
202 |
+
'school_year': match.group(2),
|
203 |
+
'grade_level': grade_level if isinstance(grade_level, str) else f"Grade {match.group(3)}",
|
204 |
+
'course_code': match.group(4).strip(),
|
205 |
+
'name': match.group(5).strip(),
|
206 |
+
'term': match.group(6).strip(),
|
207 |
+
'district_number': match.group(7).strip(),
|
208 |
+
'grade': match.group(8),
|
209 |
+
'inclusion_status': match.group(9),
|
210 |
+
'credits': 0.0 if 'inProgress' in credits else float(credits.replace(' ', '')),
|
211 |
+
'transcript_type': 'miami_dade'
|
212 |
+
}
|
213 |
+
|
214 |
+
courses_by_grade[match.group(3)].append(course_info)
|
215 |
|
216 |
# Extract GPA information
|
217 |
gpa_data = {
|
|
|
223 |
grade_match = re.search(r'Current Grade:\s*(\d+)', text)
|
224 |
grade_level = grade_match.group(1) if grade_match else "Unknown"
|
225 |
|
226 |
+
return {
|
227 |
+
'student_info': student_info,
|
228 |
+
'courses': dict(courses_by_grade),
|
229 |
+
'gpa': gpa_data,
|
230 |
+
'grade_level': grade_level
|
231 |
+
}
|
232 |
+
|
233 |
+
def extract_gpa(text, gpa_type):
    """Return the number that follows *gpa_type* in *text*, or ``"N/A"``.

    Args:
        text: Text to search.
        gpa_type: Label preceding the value. It is interpolated into the
            regular expression unescaped, so regex metacharacters in it
            are interpreted as such.

    Returns:
        The matched number as a string, or ``"N/A"`` when the label is not
        followed by a number.
    """
    # Match a well-formed decimal only, so a sentence-ending period or a
    # run of dots after the label is not returned as part of the GPA.
    match = re.search(rf'{gpa_type}\s*(\d+(?:\.\d+)?|\.\d+)', text)
    return match.group(1) if match else "N/A"
|
237 |
+
|
238 |
+
def parse_transcript(file):
    """Parse an uploaded PDF transcript into a text summary and structured data.

    Args:
        file: Upload object exposing the file's path as ``file.name``.

    Returns:
        ``(summary_text, parsed_data)`` for a PDF, where ``parsed_data`` is
        the dict returned by ``UniversalTranscriptParser.parse_transcript``;
        ``("Unsupported file format (PDF only for transcript parsing)", None)``
        when the name does not end in ``.pdf`` (checked case-insensitively).
    """
    # Reject unsupported files before building a parser or touching the disk.
    if not file.name.lower().endswith('.pdf'):
        return "Unsupported file format (PDF only for transcript parsing)", None

    with pdfplumber.open(file.name) as pdf:
        # extract_text() may yield None for a page without extractable
        # text; treat such a page as empty instead of failing.
        text = ''.join((page.extract_text() or '') + '\n' for page in pdf.pages)

    parsed_data = UniversalTranscriptParser().parse_transcript(text)
    return _format_transcript_summary(parsed_data), parsed_data


def _format_transcript_summary(parsed_data):
    """Render a parsed transcript dict as the plain-text summary."""
    parts = [f"Student Transcript Summary\n{'='*40}\n"]

    if 'student_info' in parsed_data and 'name' in parsed_data['student_info']:
        parts.append(f"Student: {parsed_data['student_info']['name']}\n")

    parts.append(f"Current Grade Level: {parsed_data.get('grade_level', 'Unknown')}\n")

    if 'gpa' in parsed_data:
        gpa = parsed_data['gpa']
        parts.append(f"Weighted GPA: {gpa.get('weighted', 'N/A')}\n")
        parts.append(f"Unweighted GPA: {gpa.get('unweighted', 'N/A')}\n\n")

    # Needs the f-prefix so the rule is rendered rather than printed literally.
    parts.append(f"Course History:\n{'='*40}\n")

    if 'courses' in parsed_data:
        courses_by_grade = parsed_data['courses']

        # Sort grades numerically (09, 10, 11, 12); fall back to plain
        # ordering when a key such as 'All' is not a number.
        try:
            grades_sorted = sorted(courses_by_grade, key=int)
        except (TypeError, ValueError):
            grades_sorted = sorted(courses_by_grade)

        for grade in grades_sorted:
            parts.append(f"\nGrade {grade}:\n{'-'*30}\n")
            for course in courses_by_grade[grade]:
                entry = f"- {course.get('name', 'Unnamed Course')}"
                if course.get('grade'):
                    entry += f" (Grade: {course['grade']})"
                if 'credits' in course:
                    entry += f" | Credits: {course['credits']}"
                if 'school_year' in course:
                    entry += f" | Year: {course['school_year']}"
                parts.append(entry + "\n")

    return ''.join(parts)
|
288 |
|
|
|
461 |
|
462 |
if isinstance(courses_by_grade, dict):
|
463 |
# Sort grades numerically
|
464 |
+
try:
|
465 |
+
grades_sorted = sorted(courses_by_grade.keys(), key=int)
|
466 |
+
except:
|
467 |
+
grades_sorted = sorted(courses_by_grade.keys())
|
468 |
+
|
469 |
+
for grade in grades_sorted:
|
470 |
display += f"\n**Grade {grade}**\n"
|
471 |
for course in courses_by_grade[grade]:
|
472 |
+
display += f"- {course.get('name', 'Unnamed Course')}"
|
473 |
if 'grade' in course and course['grade']:
|
474 |
display += f" (Grade: {course['grade']})"
|
475 |
if 'credits' in course:
|
476 |
display += f" | Credits: {course['credits']}"
|
477 |
+
if 'school_year' in course:
|
478 |
+
display += f" | Year: {course['school_year']}"
|
479 |
+
display += "\n"
|
480 |
|
481 |
if 'gpa' in transcript_dict:
|
482 |
gpa = transcript_dict['gpa']
|
|
|
564 |
|
565 |
elif any(word in message.lower() for word in course_help):
|
566 |
response = "Here's a summary of your courses:\n"
|
567 |
+
if isinstance(courses, dict):
|
568 |
+
try:
|
569 |
+
grades_sorted = sorted(courses.keys(), key=int)
|
570 |
+
except:
|
571 |
+
grades_sorted = sorted(courses.keys())
|
572 |
+
|
573 |
+
for grade in grades_sorted:
|
574 |
+
response += f"\nGrade {grade}:\n"
|
575 |
+
for course in courses[grade]:
|
576 |
+
response += f"- {course.get('name', 'Unnamed Course')}"
|
577 |
+
if 'grade' in course:
|
578 |
+
response += f" (Grade: {course['grade']})"
|
579 |
+
response += "\n"
|
580 |
+
else:
|
581 |
+
response += "No detailed course information available."
|
582 |
return response
|
583 |
|
584 |
elif "help" in message.lower():
|
|
|
661 |
|
662 |
if __name__ == "__main__":
|
663 |
app.launch()
|
664 |
+
|