Dannyar608 committed on
Commit
fcf1816
·
verified ·
1 Parent(s): 9b7ad24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -86
app.py CHANGED
@@ -77,35 +77,152 @@ def validate_file(file_obj) -> None:
77
  if file_size > MAX_FILE_SIZE_MB:
78
  raise gr.Error(f"File too large. Max size: {MAX_FILE_SIZE_MB}MB")
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def extract_text_with_ocr(file_path: str) -> str:
81
- """Extract text from image files using OCR."""
82
  try:
83
  image = Image.open(file_path)
84
- text = pytesseract.image_to_string(image)
 
 
 
 
 
 
 
85
  return text
86
  except Exception as e:
87
- raise gr.Error(f"OCR processing failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- # ========== ENHANCED TRANSCRIPT PARSING WITH DEEPSEEK ==========
90
  def parse_transcript_with_deepseek(text: str) -> Dict:
91
- """Use DeepSeek model to parse transcript text with high accuracy."""
92
  if not DEEPSEEK_API_KEY:
93
  raise gr.Error("DeepSeek API key not configured")
94
 
 
 
 
 
95
  prompt = f"""
96
- Analyze this academic transcript and extract the following information in JSON format:
97
- - Current grade level
98
- - Weighted GPA
99
- - Unweighted GPA
100
- - List of all courses with:
101
- * Course code
102
- * Course name
103
- * Grade received
104
- * Credits earned
105
- * Year/semester taken
106
- * Grade level when taken
107
-
108
- Return the data in this exact JSON structure:
109
  {{
110
  "grade_level": "11",
111
  "gpa": {{
@@ -120,13 +237,12 @@ def parse_transcript_with_deepseek(text: str) -> Dict:
120
  "credits": "1.0",
121
  "year": "2023-2024",
122
  "grade_level": "11"
123
- }},
124
- // more courses...
125
  ]
126
  }}
127
-
128
- Here is the transcript text to analyze:
129
- {text}
130
  """
131
 
132
  headers = {
@@ -142,94 +258,92 @@ def parse_transcript_with_deepseek(text: str) -> Dict:
142
  }
143
 
144
  try:
145
- response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload)
146
  response.raise_for_status()
147
  result = response.json()
148
 
149
- # Extract the JSON content from the response
150
  content = result['choices'][0]['message']['content']
151
 
152
- # Sometimes the response includes markdown code blocks
153
- if '```json' in content:
154
- content = content.split('```json')[1].split('```')[0].strip()
155
- elif '```' in content:
156
- content = content.split('```')[1].split('```')[0].strip()
157
 
158
- return json.loads(content)
 
 
 
 
 
 
 
 
159
  except Exception as e:
160
- raise gr.Error(f"DeepSeek API error: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
163
- """Parse transcript file with DeepSeek enhanced parsing."""
164
  try:
165
  if not file_obj:
166
- raise gr.Error("Please upload a file first")
167
 
168
  validate_file(file_obj)
169
-
170
- text = ''
171
  file_ext = os.path.splitext(file_obj.name)[1].lower()
172
 
173
- try:
174
- if file_ext == '.pdf':
175
- # Try PyMuPDF first for better text extraction
176
- try:
177
- doc = fitz.open(file_obj.name)
178
- for page in doc:
179
- text += page.get_text() + '\n'
180
- except:
181
- # Fallback to PyPDF2
182
- reader = PdfReader(file_obj.name)
183
- for page in reader.pages:
184
- page_text = page.extract_text()
185
- if page_text:
186
- text += page_text + '\n'
187
- elif file_ext in ['.png', '.jpg', '.jpeg']:
188
- text = extract_text_with_ocr(file_obj.name)
189
- except Exception as e:
190
- raise gr.Error(f"Error processing file: {str(e)}")
191
 
192
- if not text.strip():
193
- raise gr.Error("No text could be extracted from the file")
194
-
195
- # Use DeepSeek for enhanced parsing
196
  parsed_data = parse_transcript_with_deepseek(text)
197
 
198
- # Format output text
199
- output_text = f"Student Transcript Summary\n{'='*40}\n"
200
- output_text += f"Current Grade Level: {parsed_data.get('grade_level', 'Unknown')}\n"
201
-
202
- if 'gpa' in parsed_data:
203
- output_text += f"Weighted GPA: {parsed_data['gpa'].get('weighted', 'N/A')}\n"
204
- output_text += f"Unweighted GPA: {parsed_data['gpa'].get('unweighted', 'N/A')}\n\n"
205
-
206
- output_text += "Course History:\n{'='*40}\n"
207
-
208
- # Organize courses by grade level
209
- courses_by_grade = defaultdict(list)
210
- for course in parsed_data.get('courses', []):
211
- grade_level = course.get('grade_level', 'Unknown')
212
- courses_by_grade[grade_level].append(course)
213
-
214
- for grade in sorted(courses_by_grade.keys(), key=lambda x: int(x) if x.isdigit() else x):
215
- output_text += f"\nGrade {grade}:\n{'-'*30}\n"
216
- for course in courses_by_grade[grade]:
217
- output_text += f"- {course.get('code', '')} {course.get('name', 'Unnamed course')}"
218
- if 'grade' in course and course['grade']:
219
- output_text += f" (Grade: {course['grade']})"
220
- if 'credits' in course:
221
- output_text += f" | Credits: {course['credits']}"
222
- if 'year' in course:
223
- output_text += f" | Year: {course['year']}"
224
- output_text += "\n"
225
 
226
  # Prepare the data structure for saving
227
  transcript_data = {
228
  "grade_level": parsed_data.get('grade_level', 'Unknown'),
229
  "gpa": parsed_data.get('gpa', {}),
230
- "courses": dict(courses_by_grade)
231
  }
232
 
 
 
 
 
 
233
  return output_text, transcript_data
234
 
235
  except Exception as e:
 
77
  if file_size > MAX_FILE_SIZE_MB:
78
  raise gr.Error(f"File too large. Max size: {MAX_FILE_SIZE_MB}MB")
79
 
80
+ # ========== ENHANCED TRANSCRIPT PARSING ==========
81
def extract_text_from_file(file_path: str, file_ext: str) -> str:
    """Extract and clean the text of a transcript file.

    PDFs are read with PyMuPDF first; if that raises or yields only
    whitespace, the pages are OCR'd via ``extract_text_from_pdf_with_ocr``.
    PNG/JPEG images go straight to ``extract_text_with_ocr``. The result is
    normalised with ``clean_extracted_text``.

    Args:
        file_path: Path of the file on disk.
        file_ext: Lower-cased extension including the dot (e.g. ``'.pdf'``).

    Returns:
        The cleaned, non-empty extracted text.

    Raises:
        gr.Error: If the extension is unsupported, extraction fails, or no
            text could be extracted.
    """
    try:
        if file_ext == '.pdf':
            # First try PyMuPDF for better text extraction
            try:
                # The context manager closes the document even when a page fails.
                with fitz.open(file_path) as doc:
                    text = ''.join(page.get_text("text") + '\n' for page in doc)
                if not text.strip():
                    raise ValueError("PyMuPDF returned empty text")
            except Exception as e:
                print(f"PyMuPDF failed, trying OCR fallback: {str(e)}")
                text = extract_text_from_pdf_with_ocr(file_path)
        elif file_ext in ['.png', '.jpg', '.jpeg']:
            text = extract_text_with_ocr(file_path)
        else:
            # Report the real cause instead of a generic "no text" error.
            raise ValueError(f"Unsupported file type: {file_ext}")

        # Clean up the extracted text
        text = clean_extracted_text(text)

        if not text.strip():
            raise ValueError("No text could be extracted from the file")

        return text

    except Exception as e:
        # Surface every failure as a UI error, keeping the cause chained.
        raise gr.Error(f"Text extraction error: {str(e)}") from e
111
+
112
def extract_text_from_pdf_with_ocr(file_path: str) -> str:
    """Fallback PDF text extraction: render each page and OCR it.

    Args:
        file_path: Path of the PDF on disk.

    Returns:
        The OCR text of every page, each followed by a newline.

    Raises:
        ValueError: If opening, rendering or OCR fails; the original
            exception is chained as the cause.
    """
    page_texts = []
    try:
        # Context managers release the document and each decoded page image.
        with fitz.open(file_path) as doc:
            for page in doc:
                # NOTE(review): get_pixmap() is called with its defaults; a
                # higher render resolution may improve OCR quality - confirm.
                pix = page.get_pixmap()
                with Image.open(io.BytesIO(pix.tobytes())) as img:
                    page_texts.append(pytesseract.image_to_string(img) + '\n')
    except Exception as e:
        raise ValueError(f"PDF OCR failed: {str(e)}") from e
    return ''.join(page_texts)
124
+
125
  def extract_text_with_ocr(file_path: str) -> str:
126
+ """Extract text from image files using OCR with preprocessing."""
127
  try:
128
  image = Image.open(file_path)
129
+
130
+ # Preprocess image for better OCR results
131
+ image = image.convert('L') # Convert to grayscale
132
+ image = image.point(lambda x: 0 if x < 128 else 255, '1') # Thresholding
133
+
134
+ # Custom Tesseract configuration
135
+ custom_config = r'--oem 3 --psm 6'
136
+ text = pytesseract.image_to_string(image, config=custom_config)
137
  return text
138
  except Exception as e:
139
+ raise ValueError(f"OCR processing failed: {str(e)}")
140
+
141
def clean_extracted_text(text: str) -> str:
    """Clean and normalize extracted text.

    Every run of whitespace (including newlines) is collapsed to a single
    space and the ends are trimmed. Then a fixed set of characters is
    replaced: ``|`` becomes ``I``, curly quotes become straight quotes, and
    the ``fi`` / ``fl`` ligature code points are expanded to two letters.

    Args:
        text: Raw text from PDF extraction or OCR.

    Returns:
        The normalized text.
    """
    # Remove multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    # Fix common OCR errors. The ligatures are written as escapes so they
    # cannot be silently normalised into the no-op 'fi' -> 'fi'.
    replacements = {
        '|': 'I',
        '\u2018': "'",   # left single quotation mark
        '\u2019': "'",   # right single quotation mark
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
        '\ufb01': 'fi',  # "fi" ligature
        '\ufb02': 'fl',  # "fl" ligature
    }

    # All keys are single characters, so one translate() pass is equivalent
    # to the chained replace() calls.
    return text.translate(str.maketrans(replacements))
161
+
162
def remove_sensitive_info(text: str) -> str:
    """Mask potentially sensitive values in transcript text.

    Replacements are applied in this order:
      1. ``ddd-dd-dddd`` (social-security-number shape) -> ``[REDACTED]``
      2. standalone runs of 6-9 digits (assumed student IDs) -> ``[ID]``
      3. email addresses -> ``[EMAIL]``

    Args:
        text: Transcript text to sanitise.

    Returns:
        The text with matching values replaced by placeholders.
    """
    # Remove social security numbers
    text = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[REDACTED]', text)
    # Remove student IDs (assuming 6-9 digit numbers)
    text = re.sub(r'\b\d{6,9}\b', '[ID]', text)
    # Remove email addresses. The TLD class is [A-Za-z]; a "|" inside a
    # character class is a literal pipe, not alternation.
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '[EMAIL]', text)
    return text
171
+
172
def extract_json_from_response(content: str) -> str:
    """Extract the JSON string from an API response message.

    Handles, in order:
      * a fenced block tagged ``json`` (matched case-insensitively),
      * any other fenced block (the first one is used),
      * leading/trailing prose or a stray language tag around a JSON
        object, by trimming to the outermost ``{ ... }`` span.

    Content that already starts with ``{`` or ``[`` once fences are handled
    is returned without further trimming.

    Args:
        content: The message text returned by the model.

    Returns:
        The best-effort JSON text; not guaranteed to be valid JSON.
    """
    # Handle markdown code blocks
    tagged_fence = re.search(r'```json', content, flags=re.IGNORECASE)
    if tagged_fence:
        content = content[tagged_fence.end():].split('```')[0].strip()
    elif '```' in content:
        content = content.split('```')[1].strip()

    candidate = content.strip()
    if candidate[:1] not in ('{', '['):
        # Prose or a language tag such as "javascript" precedes the payload:
        # keep only the outermost JSON object if one is present.
        start = candidate.find('{')
        end = candidate.rfind('}')
        if start != -1 and end > start:
            return candidate[start:end + 1]

    # Sometimes the response is pure JSON
    return content
182
+
183
def validate_parsed_data(data: Dict) -> Dict:
    """Validate and clean the parsed transcript structure in place.

    Missing or ``None`` ``grade_level`` becomes ``'Unknown'``. A missing or
    non-dict ``gpa`` becomes ``{'weighted': 'N/A', 'unweighted': 'N/A'}``.
    A missing or non-list ``courses`` becomes ``[]``. Course entries that
    are not dicts are dropped. Within each course, string grades are
    upper-cased and trimmed, and numeric ``grade``, ``credits`` and
    ``grade_level`` values are converted to strings.

    Args:
        data: The object decoded from the model's JSON response.

    Returns:
        The same ``data`` dict, normalised.

    Raises:
        ValueError: If ``data`` is not a dict.
    """
    # Ensure required fields exist
    if not isinstance(data, dict):
        raise ValueError("Invalid data format")

    # Set default structure if missing (JSON null counts as missing)
    if data.get('grade_level') is None:
        data['grade_level'] = 'Unknown'

    if not isinstance(data.get('gpa'), dict):
        data['gpa'] = {'weighted': 'N/A', 'unweighted': 'N/A'}

    if not isinstance(data.get('courses'), list):
        data['courses'] = []

    # Clean course data
    cleaned_courses = []
    for course in data['courses']:
        if not isinstance(course, dict):
            # Later code calls .get() on every course, so drop malformed ones.
            continue

        grade = course.get('grade')
        if isinstance(grade, str):
            course['grade'] = grade.upper().strip()
        elif isinstance(grade, (int, float)):
            # Numeric grades (e.g. 95) have no .upper(); store them as text.
            course['grade'] = str(grade)

        # Ensure numeric credits are strings
        if isinstance(course.get('credits'), (int, float)):
            course['credits'] = str(course['credits'])

        # Grade levels are grouped and sorted as strings downstream.
        if isinstance(course.get('grade_level'), (int, float)):
            course['grade_level'] = str(course['grade_level'])

        cleaned_courses.append(course)

    data['courses'] = cleaned_courses
    return data
209
 
 
210
  def parse_transcript_with_deepseek(text: str) -> Dict:
211
+ """Improved DeepSeek API integration with better error handling."""
212
  if not DEEPSEEK_API_KEY:
213
  raise gr.Error("DeepSeek API key not configured")
214
 
215
+ # Pre-process the text to remove sensitive information
216
+ text = remove_sensitive_info(text)
217
+
218
+ # Create a more robust prompt with examples
219
  prompt = f"""
220
+ Analyze this academic transcript and extract structured information. Follow these rules:
221
+ 1. Extract data even if partially visible
222
+ 2. Guess missing values when reasonable
223
+ 3. Return empty if completely missing
224
+
225
+ Required JSON structure:
 
 
 
 
 
 
 
226
  {{
227
  "grade_level": "11",
228
  "gpa": {{
 
237
  "credits": "1.0",
238
  "year": "2023-2024",
239
  "grade_level": "11"
240
+ }}
 
241
  ]
242
  }}
243
+
244
+ Transcript Text:
245
+ {text[:15000]} # Limit to first 15k chars to avoid token limits
246
  """
247
 
248
  headers = {
 
258
  }
259
 
260
  try:
261
+ response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload, timeout=30)
262
  response.raise_for_status()
263
  result = response.json()
264
 
 
265
  content = result['choices'][0]['message']['content']
266
 
267
+ # Extract JSON from response (handling markdown code blocks)
268
+ json_str = extract_json_from_response(content)
 
 
 
269
 
270
+ # Validate and clean the parsed data
271
+ parsed_data = validate_parsed_data(json.loads(json_str))
272
+
273
+ return parsed_data
274
+
275
+ except requests.exceptions.RequestException as e:
276
+ raise gr.Error(f"API request failed: {str(e)}")
277
+ except json.JSONDecodeError as e:
278
+ raise gr.Error(f"Failed to parse API response: {str(e)}")
279
  except Exception as e:
280
+ raise gr.Error(f"DeepSeek processing error: {str(e)}")
281
+
282
def format_transcript_output(data: Dict) -> str:
    """Format parsed transcript data into human-readable text.

    The output has a summary header, an optional GPA section and an
    optional course history grouped by grade level. Numeric grade levels
    are listed first in ascending order, followed by non-numeric labels
    (such as ``'Unknown'``) in alphabetical order.

    Args:
        data: Parsed transcript with optional ``grade_level``, ``gpa`` and
            ``courses`` keys.

    Returns:
        The formatted report as one newline-joined string.
    """
    def _grade_sort_key(label: str):
        # Tag each key so ints are only ever compared with ints and strings
        # with strings; comparing 9 with 'Unknown' raises TypeError.
        if label.isdecimal():
            return (0, int(label))
        return (1, label)

    output = [
        f"Student Transcript Summary\n{'='*40}",
        f"Current Grade Level: {data.get('grade_level', 'Unknown')}",
    ]

    gpa = data.get('gpa')
    if isinstance(gpa, dict):
        output.append("\nGPA:")
        output.append(f"- Weighted: {gpa.get('weighted', 'N/A')}")
        output.append(f"- Unweighted: {gpa.get('unweighted', 'N/A')}")

    if 'courses' in data:
        output.append("\nCourse History:\n" + '='*40)

        # Group courses by grade level, keyed by its string form so that
        # 11 and "11" land in the same group.
        courses_by_grade = defaultdict(list)
        for course in data['courses'] or []:
            if not isinstance(course, dict):
                continue
            grade_level = course.get('grade_level')
            label = 'Unknown' if grade_level is None else str(grade_level)
            courses_by_grade[label].append(course)

        # Sort grades numerically, non-numeric labels last
        for grade in sorted(courses_by_grade, key=_grade_sort_key):
            output.append(f"\nGrade {grade}:\n{'-'*30}")
            for course in courses_by_grade[grade]:
                course_str = f"- {course.get('code', '')} {course.get('name', 'Unnamed course')}"
                if 'grade' in course:
                    course_str += f" (Grade: {course['grade']})"
                if 'credits' in course:
                    course_str += f" | Credits: {course['credits']}"
                if 'year' in course:
                    course_str += f" | Year: {course['year']}"
                output.append(course_str)

    return '\n'.join(output)
316
 
317
  def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
318
+ """Main function to parse transcript files."""
319
  try:
320
  if not file_obj:
321
+ raise ValueError("Please upload a file first")
322
 
323
  validate_file(file_obj)
 
 
324
  file_ext = os.path.splitext(file_obj.name)[1].lower()
325
 
326
+ # Extract text from file
327
+ text = extract_text_from_file(file_obj.name, file_ext)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
 
329
+ # Parse with DeepSeek
 
 
 
330
  parsed_data = parse_transcript_with_deepseek(text)
331
 
332
+ # Format output
333
+ output_text = format_transcript_output(parsed_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
  # Prepare the data structure for saving
336
  transcript_data = {
337
  "grade_level": parsed_data.get('grade_level', 'Unknown'),
338
  "gpa": parsed_data.get('gpa', {}),
339
+ "courses": defaultdict(list)
340
  }
341
 
342
+ # Organize courses by grade level for saving
343
+ for course in parsed_data.get('courses', []):
344
+ grade_level = course.get('grade_level', 'Unknown')
345
+ transcript_data["courses"][grade_level].append(course)
346
+
347
  return output_text, transcript_data
348
 
349
  except Exception as e: