Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -77,35 +77,152 @@ def validate_file(file_obj) -> None:
|
|
77 |
if file_size > MAX_FILE_SIZE_MB:
|
78 |
raise gr.Error(f"File too large. Max size: {MAX_FILE_SIZE_MB}MB")
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
def extract_text_with_ocr(file_path: str) -> str:
|
81 |
-
"""Extract text from image files using OCR."""
|
82 |
try:
|
83 |
image = Image.open(file_path)
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
return text
|
86 |
except Exception as e:
|
87 |
-
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
-
# ========== ENHANCED TRANSCRIPT PARSING WITH DEEPSEEK ==========
|
90 |
def parse_transcript_with_deepseek(text: str) -> Dict:
|
91 |
-
"""
|
92 |
if not DEEPSEEK_API_KEY:
|
93 |
raise gr.Error("DeepSeek API key not configured")
|
94 |
|
|
|
|
|
|
|
|
|
95 |
prompt = f"""
|
96 |
-
Analyze this academic transcript and extract
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
* Course name
|
103 |
-
* Grade received
|
104 |
-
* Credits earned
|
105 |
-
* Year/semester taken
|
106 |
-
* Grade level when taken
|
107 |
-
|
108 |
-
Return the data in this exact JSON structure:
|
109 |
{{
|
110 |
"grade_level": "11",
|
111 |
"gpa": {{
|
@@ -120,13 +237,12 @@ def parse_transcript_with_deepseek(text: str) -> Dict:
|
|
120 |
"credits": "1.0",
|
121 |
"year": "2023-2024",
|
122 |
"grade_level": "11"
|
123 |
-
}}
|
124 |
-
// more courses...
|
125 |
]
|
126 |
}}
|
127 |
-
|
128 |
-
|
129 |
-
{text}
|
130 |
"""
|
131 |
|
132 |
headers = {
|
@@ -142,94 +258,92 @@ def parse_transcript_with_deepseek(text: str) -> Dict:
|
|
142 |
}
|
143 |
|
144 |
try:
|
145 |
-
response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload)
|
146 |
response.raise_for_status()
|
147 |
result = response.json()
|
148 |
|
149 |
-
# Extract the JSON content from the response
|
150 |
content = result['choices'][0]['message']['content']
|
151 |
|
152 |
-
#
|
153 |
-
|
154 |
-
content = content.split('```json')[1].split('```')[0].strip()
|
155 |
-
elif '```' in content:
|
156 |
-
content = content.split('```')[1].split('```')[0].strip()
|
157 |
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
except Exception as e:
|
160 |
-
raise gr.Error(f"DeepSeek
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
|
162 |
def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
|
163 |
-
"""
|
164 |
try:
|
165 |
if not file_obj:
|
166 |
-
raise
|
167 |
|
168 |
validate_file(file_obj)
|
169 |
-
|
170 |
-
text = ''
|
171 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
172 |
|
173 |
-
|
174 |
-
|
175 |
-
# Try PyMuPDF first for better text extraction
|
176 |
-
try:
|
177 |
-
doc = fitz.open(file_obj.name)
|
178 |
-
for page in doc:
|
179 |
-
text += page.get_text() + '\n'
|
180 |
-
except:
|
181 |
-
# Fallback to PyPDF2
|
182 |
-
reader = PdfReader(file_obj.name)
|
183 |
-
for page in reader.pages:
|
184 |
-
page_text = page.extract_text()
|
185 |
-
if page_text:
|
186 |
-
text += page_text + '\n'
|
187 |
-
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
188 |
-
text = extract_text_with_ocr(file_obj.name)
|
189 |
-
except Exception as e:
|
190 |
-
raise gr.Error(f"Error processing file: {str(e)}")
|
191 |
|
192 |
-
|
193 |
-
raise gr.Error("No text could be extracted from the file")
|
194 |
-
|
195 |
-
# Use DeepSeek for enhanced parsing
|
196 |
parsed_data = parse_transcript_with_deepseek(text)
|
197 |
|
198 |
-
# Format output
|
199 |
-
output_text =
|
200 |
-
output_text += f"Current Grade Level: {parsed_data.get('grade_level', 'Unknown')}\n"
|
201 |
-
|
202 |
-
if 'gpa' in parsed_data:
|
203 |
-
output_text += f"Weighted GPA: {parsed_data['gpa'].get('weighted', 'N/A')}\n"
|
204 |
-
output_text += f"Unweighted GPA: {parsed_data['gpa'].get('unweighted', 'N/A')}\n\n"
|
205 |
-
|
206 |
-
output_text += "Course History:\n{'='*40}\n"
|
207 |
-
|
208 |
-
# Organize courses by grade level
|
209 |
-
courses_by_grade = defaultdict(list)
|
210 |
-
for course in parsed_data.get('courses', []):
|
211 |
-
grade_level = course.get('grade_level', 'Unknown')
|
212 |
-
courses_by_grade[grade_level].append(course)
|
213 |
-
|
214 |
-
for grade in sorted(courses_by_grade.keys(), key=lambda x: int(x) if x.isdigit() else x):
|
215 |
-
output_text += f"\nGrade {grade}:\n{'-'*30}\n"
|
216 |
-
for course in courses_by_grade[grade]:
|
217 |
-
output_text += f"- {course.get('code', '')} {course.get('name', 'Unnamed course')}"
|
218 |
-
if 'grade' in course and course['grade']:
|
219 |
-
output_text += f" (Grade: {course['grade']})"
|
220 |
-
if 'credits' in course:
|
221 |
-
output_text += f" | Credits: {course['credits']}"
|
222 |
-
if 'year' in course:
|
223 |
-
output_text += f" | Year: {course['year']}"
|
224 |
-
output_text += "\n"
|
225 |
|
226 |
# Prepare the data structure for saving
|
227 |
transcript_data = {
|
228 |
"grade_level": parsed_data.get('grade_level', 'Unknown'),
|
229 |
"gpa": parsed_data.get('gpa', {}),
|
230 |
-
"courses":
|
231 |
}
|
232 |
|
|
|
|
|
|
|
|
|
|
|
233 |
return output_text, transcript_data
|
234 |
|
235 |
except Exception as e:
|
|
|
77 |
if file_size > MAX_FILE_SIZE_MB:
|
78 |
raise gr.Error(f"File too large. Max size: {MAX_FILE_SIZE_MB}MB")
|
79 |
|
80 |
+
# ========== ENHANCED TRANSCRIPT PARSING ==========
|
81 |
+
def extract_text_from_file(file_path: str, file_ext: str) -> str:
    """Extract and clean the text content of an uploaded transcript file.

    PDFs are read with PyMuPDF first; if that raises or yields only
    whitespace, the PDF is OCR'd page by page instead. PNG/JPEG images go
    straight to OCR. Any other extension produces no text and is reported
    as an extraction failure.

    Args:
        file_path: Path of the file on disk.
        file_ext: File extension including the leading dot (e.g. ``.pdf``).
            Compared case-insensitively.

    Returns:
        The extracted text after ``clean_extracted_text`` normalisation.

    Raises:
        gr.Error: If extraction fails or no text could be extracted. The
            underlying exception is attached as ``__cause__``.
    """
    text = ""

    try:
        # Normalise here so callers that forget to lower-case still work.
        file_ext = file_ext.lower()

        if file_ext == '.pdf':
            # First try PyMuPDF for better text extraction
            try:
                # The context manager closes the document even when a page
                # fails to render, so the file handle is not leaked.
                with fitz.open(file_path) as doc:
                    text = ''.join(page.get_text("text") + '\n' for page in doc)
                if not text.strip():
                    raise ValueError("PyMuPDF returned empty text")
            except Exception as e:
                # Deliberately broad: any PyMuPDF failure triggers the OCR fallback.
                print(f"PyMuPDF failed, trying OCR fallback: {str(e)}")
                text = extract_text_from_pdf_with_ocr(file_path)

        elif file_ext in ['.png', '.jpg', '.jpeg']:
            text = extract_text_with_ocr(file_path)

        # Clean up the extracted text
        text = clean_extracted_text(text)

        if not text.strip():
            raise ValueError("No text could be extracted from the file")

        return text

    except Exception as e:
        raise gr.Error(f"Text extraction error: {str(e)}") from e
|
111 |
+
|
112 |
+
def extract_text_from_pdf_with_ocr(file_path: str) -> str:
    """Extract text from a PDF by rasterising each page and running OCR on it.

    Used as a fallback when the PDF has no usable text layer.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The OCR output of every page, each followed by a newline.

    Raises:
        ValueError: If opening, rendering or OCR fails. The underlying
            exception is attached as ``__cause__``.
    """
    page_texts = []
    try:
        # Context managers release the PDF handle and each decoded page image
        # even if OCR raises part-way through the document.
        with fitz.open(file_path) as doc:
            for page in doc:
                pix = page.get_pixmap()
                with Image.open(io.BytesIO(pix.tobytes())) as img:
                    page_texts.append(pytesseract.image_to_string(img) + '\n')
    except Exception as e:
        raise ValueError(f"PDF OCR failed: {str(e)}") from e
    return ''.join(page_texts)
|
124 |
+
|
125 |
def extract_text_with_ocr(file_path: str) -> str:
    """Extract text from an image file using Tesseract OCR with preprocessing.

    The image is converted to grayscale and binarised with a fixed threshold
    of 128 before OCR.

    Args:
        file_path: Path of the image file on disk.

    Returns:
        The text recognised by Tesseract.

    Raises:
        ValueError: If the image cannot be opened or OCR fails. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        # Opening via a context manager closes the underlying file handle;
        # convert()/point() return new in-memory images, so they stay usable
        # for the OCR call inside the block.
        with Image.open(file_path) as image:
            # Preprocess image for better OCR results
            grayscale = image.convert('L')
            binarised = grayscale.point(lambda x: 0 if x < 128 else 255, '1')

            # Custom Tesseract configuration
            custom_config = r'--oem 3 --psm 6'
            return pytesseract.image_to_string(binarised, config=custom_config)
    except Exception as e:
        raise ValueError(f"OCR processing failed: {str(e)}") from e
|
140 |
+
|
141 |
+
def clean_extracted_text(text: str) -> str:
    """Clean and normalise text extracted from a PDF or OCR.

    Collapses every run of whitespace (including newlines) into a single
    space, trims the ends, and then replaces a small set of characters that
    extraction commonly gets wrong.

    Args:
        text: Raw extracted text.

    Returns:
        The normalised text.
    """
    # Remove multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    # Fix common OCR errors. Non-ASCII keys are written as escapes so that an
    # editor or copy/paste cannot silently normalise them into their ASCII
    # look-alikes, which would turn the entry into a no-op.
    replacements = {
        '|': 'I',
        '\u2018': "'",   # left single quotation mark
        '\u2019': "'",   # right single quotation mark
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
        '\ufb01': 'fi',  # "fi" ligature
        '\ufb02': 'fl',  # "fl" ligature
    }

    for wrong, right in replacements.items():
        text = text.replace(wrong, right)

    return text
|
161 |
+
|
162 |
+
def remove_sensitive_info(text: str) -> str:
    """Redact potentially sensitive identifiers from transcript text.

    Replacements are applied in this order:

    1. ``ddd-dd-dddd`` (US social security number format) -> ``[REDACTED]``
    2. Standalone runs of 6-9 digits (assumed student IDs) -> ``[ID]``
    3. Email addresses -> ``[EMAIL]``

    Args:
        text: Transcript text to scrub.

    Returns:
        The text with matching spans replaced by placeholders.
    """
    # Remove social security numbers
    text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[REDACTED]', text)
    # Remove student IDs (assuming 6-9 digit numbers)
    text = re.sub(r'\b\d{6,9}\b', '[ID]', text)
    # Remove email addresses. The TLD class is [A-Za-z]; a "|" inside a
    # character class is a literal pipe, not alternation, and would let the
    # match run on into pipe-delimited text that follows the address.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]', text)
    return text
|
171 |
+
|
172 |
+
def extract_json_from_response(content: str) -> str:
    """Extract the JSON payload from a chat-completion message.

    Handles three shapes of response:

    * a fenced block tagged ``json`` (matched case-insensitively), which is
      preferred even if another fenced block appears earlier;
    * any other fenced block, whose language tag line (if any) is dropped;
    * no fence at all, in which case the content is returned unchanged.

    An unterminated fence is tolerated: everything after the opening fence
    is returned.

    Args:
        content: The raw message text returned by the API.

    Returns:
        The candidate JSON string (not yet parsed or validated).
    """
    import re  # local import keeps this block self-contained

    # Handle markdown code blocks tagged as JSON, regardless of tag casing.
    tagged = re.search(r'```json\b(.*?)(?:```|\Z)', content, re.DOTALL | re.IGNORECASE)
    if tagged:
        return tagged.group(1).strip()

    # Any other fence: a language tag is only consumed when it is followed by
    # a newline, so content that starts right after the fence is not eaten.
    fenced = re.search(r'```(?:[A-Za-z0-9_+-]*[ \t]*\n)?(.*?)(?:```|\Z)', content, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()

    # Sometimes the response is pure JSON
    return content
|
182 |
+
|
183 |
+
def validate_parsed_data(data: Dict) -> Dict:
    """Validate and normalise the transcript structure parsed from the API.

    The dictionary is modified in place and also returned. Because the data
    comes from a language model, missing keys, JSON ``null`` values and
    wrongly typed entries are all tolerated:

    * ``grade_level`` defaults to ``'Unknown'`` when missing or null.
    * ``gpa`` is replaced by ``{'weighted': 'N/A', 'unweighted': 'N/A'}``
      when missing or not a dict.
    * ``courses`` becomes ``[]`` when missing or not a list; entries that
      are not dicts are dropped.
    * Each course ``grade`` that is not null is converted to an upper-cased,
      stripped string; numeric ``credits`` are converted to strings.

    Args:
        data: The object produced by ``json.loads`` on the API response.

    Returns:
        The same dictionary, normalised.

    Raises:
        ValueError: If ``data`` is not a dictionary.
    """
    # Ensure required fields exist
    if not isinstance(data, dict):
        raise ValueError("Invalid data format")

    # Set default structure if missing (or explicitly null)
    if data.get('grade_level') is None:
        data['grade_level'] = 'Unknown'

    # Downstream code calls .get() on gpa, so anything but a dict is unusable.
    if not isinstance(data.get('gpa'), dict):
        data['gpa'] = {'weighted': 'N/A', 'unweighted': 'N/A'}

    raw_courses = data.get('courses')
    if not isinstance(raw_courses, list):
        raw_courses = []
    courses = [course for course in raw_courses if isinstance(course, dict)]
    data['courses'] = courses

    # Clean course data
    for course in courses:
        grade = course.get('grade')
        if grade is not None:
            # str() first: the model may return numeric grades such as 95.
            course['grade'] = str(grade).upper().strip()

        # Ensure numeric credits are strings
        credit_value = course.get('credits')
        if isinstance(credit_value, (int, float)):
            course['credits'] = str(credit_value)

    return data
|
209 |
|
|
|
210 |
def parse_transcript_with_deepseek(text: str) -> Dict:
|
211 |
+
"""Improved DeepSeek API integration with better error handling."""
|
212 |
if not DEEPSEEK_API_KEY:
|
213 |
raise gr.Error("DeepSeek API key not configured")
|
214 |
|
215 |
+
# Pre-process the text to remove sensitive information
|
216 |
+
text = remove_sensitive_info(text)
|
217 |
+
|
218 |
+
# Create a more robust prompt with examples
|
219 |
prompt = f"""
|
220 |
+
Analyze this academic transcript and extract structured information. Follow these rules:
|
221 |
+
1. Extract data even if partially visible
|
222 |
+
2. Guess missing values when reasonable
|
223 |
+
3. Return empty if completely missing
|
224 |
+
|
225 |
+
Required JSON structure:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
{{
|
227 |
"grade_level": "11",
|
228 |
"gpa": {{
|
|
|
237 |
"credits": "1.0",
|
238 |
"year": "2023-2024",
|
239 |
"grade_level": "11"
|
240 |
+
}}
|
|
|
241 |
]
|
242 |
}}
|
243 |
+
|
244 |
+
Transcript Text:
|
245 |
+
{text[:15000]} # Limit to first 15k chars to avoid token limits
|
246 |
"""
|
247 |
|
248 |
headers = {
|
|
|
258 |
}
|
259 |
|
260 |
try:
|
261 |
+
response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload, timeout=30)
|
262 |
response.raise_for_status()
|
263 |
result = response.json()
|
264 |
|
|
|
265 |
content = result['choices'][0]['message']['content']
|
266 |
|
267 |
+
# Extract JSON from response (handling markdown code blocks)
|
268 |
+
json_str = extract_json_from_response(content)
|
|
|
|
|
|
|
269 |
|
270 |
+
# Validate and clean the parsed data
|
271 |
+
parsed_data = validate_parsed_data(json.loads(json_str))
|
272 |
+
|
273 |
+
return parsed_data
|
274 |
+
|
275 |
+
except requests.exceptions.RequestException as e:
|
276 |
+
raise gr.Error(f"API request failed: {str(e)}")
|
277 |
+
except json.JSONDecodeError as e:
|
278 |
+
raise gr.Error(f"Failed to parse API response: {str(e)}")
|
279 |
except Exception as e:
|
280 |
+
raise gr.Error(f"DeepSeek processing error: {str(e)}")
|
281 |
+
|
282 |
+
def format_transcript_output(data: Dict) -> str:
    """Format parsed transcript data into human-readable text.

    The output is a summary header, the GPA block (when ``gpa`` is present)
    and the course history grouped by grade level (when ``courses`` is
    present). Numeric grade levels are listed first in ascending order,
    followed by non-numeric ones (such as ``'Unknown'``) alphabetically.

    Args:
        data: Parsed transcript with optional ``grade_level``, ``gpa`` and
            ``courses`` keys.

    Returns:
        The formatted multi-line summary.
    """
    output = [
        f"Student Transcript Summary\n{'='*40}",
        f"Current Grade Level: {data.get('grade_level', 'Unknown')}",
    ]

    if 'gpa' in data:
        # Tolerate a null / malformed gpa value instead of crashing on .get().
        gpa = data['gpa'] if isinstance(data['gpa'], dict) else {}
        output.append("\nGPA:")
        output.append(f"- Weighted: {gpa.get('weighted', 'N/A')}")
        output.append(f"- Unweighted: {gpa.get('unweighted', 'N/A')}")

    if 'courses' in data:
        output.append("\nCourse History:\n" + '='*40)

        # Group courses by grade level. Keys are stringified so that 11 and
        # "11" land in the same group and the sort key can rely on str methods.
        courses_by_grade = defaultdict(list)
        for course in data['courses'] or []:
            grade_level = str(course.get('grade_level', 'Unknown'))
            courses_by_grade[grade_level].append(course)

        def _grade_sort_key(level: str):
            # Tuples keep numeric and textual levels comparable: sorting a mix
            # of int and str keys directly raises TypeError in Python 3.
            # isdecimal() (not isdigit()) guarantees int() will succeed.
            if level.isdecimal():
                return (0, int(level), '')
            return (1, 0, level)

        # Sort grades numerically, non-numeric levels last
        for grade in sorted(courses_by_grade, key=_grade_sort_key):
            output.append(f"\nGrade {grade}:\n{'-'*30}")
            for course in courses_by_grade[grade]:
                course_str = f"- {course.get('code', '')} {course.get('name', 'Unnamed course')}"
                if 'grade' in course:
                    course_str += f" (Grade: {course['grade']})"
                if 'credits' in course:
                    course_str += f" | Credits: {course['credits']}"
                if 'year' in course:
                    course_str += f" | Year: {course['year']}"
                output.append(course_str)

    return '\n'.join(output)
|
316 |
|
317 |
def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
|
318 |
+
"""Main function to parse transcript files."""
|
319 |
try:
|
320 |
if not file_obj:
|
321 |
+
raise ValueError("Please upload a file first")
|
322 |
|
323 |
validate_file(file_obj)
|
|
|
|
|
324 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
325 |
|
326 |
+
# Extract text from file
|
327 |
+
text = extract_text_from_file(file_obj.name, file_ext)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
|
329 |
+
# Parse with DeepSeek
|
|
|
|
|
|
|
330 |
parsed_data = parse_transcript_with_deepseek(text)
|
331 |
|
332 |
+
# Format output
|
333 |
+
output_text = format_transcript_output(parsed_data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
|
335 |
# Prepare the data structure for saving
|
336 |
transcript_data = {
|
337 |
"grade_level": parsed_data.get('grade_level', 'Unknown'),
|
338 |
"gpa": parsed_data.get('gpa', {}),
|
339 |
+
"courses": defaultdict(list)
|
340 |
}
|
341 |
|
342 |
+
# Organize courses by grade level for saving
|
343 |
+
for course in parsed_data.get('courses', []):
|
344 |
+
grade_level = course.get('grade_level', 'Unknown')
|
345 |
+
transcript_data["courses"][grade_level].append(course)
|
346 |
+
|
347 |
return output_text, transcript_data
|
348 |
|
349 |
except Exception as e:
|