Dannyar608 committed on
Commit
9b7ad24
·
verified ·
1 Parent(s): bef81e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -143
app.py CHANGED
@@ -15,15 +15,18 @@ import io
15
  import secrets
16
  import string
17
  from huggingface_hub import HfApi, HfFolder
 
18
 
19
  # ========== CONFIGURATION ==========
20
  PROFILES_DIR = "student_profiles"
21
- ALLOWED_FILE_TYPES = [".pdf", ".png", ".jpg", ".jpeg"] # Added image support
22
  MAX_FILE_SIZE_MB = 5
23
  MIN_AGE = 5
24
  MAX_AGE = 120
25
  SESSION_TOKEN_LENGTH = 32
26
  HF_TOKEN = os.getenv("HF_TOKEN")
 
 
27
 
28
  # Initialize Hugging Face API
29
  if HF_TOKEN:
@@ -83,129 +86,81 @@ def extract_text_with_ocr(file_path: str) -> str:
83
  except Exception as e:
84
  raise gr.Error(f"OCR processing failed: {str(e)}")
85
 
86
- # ========== TRANSCRIPT PARSING ==========
87
def extract_gpa(text: str, gpa_type: str) -> str:
    """Extract a GPA of the given type (e.g. "Weighted") from transcript text.

    Matching is case-insensitive. The label is only recognised when it is not
    the tail of a longer word, so looking up "Weighted" does not pick up the
    value that belongs to "Unweighted".

    Args:
        text: Raw transcript text.
        gpa_type: Label that identifies the GPA, matched literally.

    Returns:
        The GPA formatted with two decimals, "Invalid GPA" when the number
        found is outside 0.0-5.0, or "N/A" when nothing was found.
    """
    # Escape the label so regex metacharacters in it are matched literally.
    label = re.escape(gpa_type)
    # Refuse to start a match in the middle of a word ("Un|weighted").
    not_mid_word = r'(?<![A-Za-z])'
    value = r'([0-5]\.\d{2}|\d\.\d)'

    def _format(raw: str) -> str:
        """Range-check a matched number and render it with two decimals."""
        gpa_float = float(raw)
        if not 0.0 <= gpa_float <= 5.0:
            return "Invalid GPA"
        return f"{gpa_float:.2f}"

    gpa_patterns = [
        rf'{not_mid_word}{label}\s*GPA\s*[:=]?\s*{value}',  # Weighted GPA: 3.50
        rf'{not_mid_word}{label}\s*[:=]?\s*{value}',        # Weighted: 3.50
        rf'GPA\s*\({label}\)\s*[:=]?\s*{value}',            # GPA (Weighted): 3.50
    ]

    for pattern in gpa_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return _format(match.group(1))

    # Fallback: first GPA-like number after the label on the same line.
    # The flags are passed together with a *string* pattern; combining flags
    # with an already compiled pattern raises ValueError in re.search.
    fallback = re.search(
        rf'{not_mid_word}{label}.*?([0-5]\.\d{{1,2}})(?!\d)',
        text,
        re.IGNORECASE,
    )
    if fallback:
        return _format(fallback.group(1))

    return "N/A"
117
 
118
def _clean_course_code(code: str) -> str:
    """Strip a course code and put a space after an "AP"/"DE" marker."""
    code = code.strip()
    for marker in ('AP', 'DE'):
        if marker in code and f'{marker} ' not in code:
            code = code.replace(marker, f'{marker} ')
    return code


def extract_courses_from_table(text: str) -> Dict[str, List[Dict]]:
    """Extract course records from transcript text, grouped by grade level.

    Three patterns are tried in order: a labelled key/value layout, a condensed
    row layout ("2023-2024 11 MATH101 Algebra II A 1.0") and a looser
    "code name grade credits" layout without year or grade level. If none of
    them yields anything, a last-resort pattern collects "code name" pairs
    with an optional grade and credit value.

    Args:
        text: Raw transcript text; whitespace runs are collapsed before matching.

    Returns:
        Mapping of grade level ("Unknown" when not available) to a list of
        dicts with the keys 'code', 'name', 'grade', 'credits' and 'year'.
    """
    code_re = r'[A-Z]+\s*\d+[A-Z]*'
    grade_re = r'[A-F][+-]?|P|F|W|I'

    # A single substitution collapses every whitespace run, newlines included.
    text = re.sub(r'\s+', ' ', text)

    patterns = [
        # Labelled key/value layout
        re.compile(
            r'(?:Year|Term|Semester)[\s:]*(.*?)\s*'      # Year/Semester
            r'(?:Grade|Level)[\s:]*(.*?)\s*'             # Grade level
            r'(?:Course\s*Code|Code)[\s:]*(.*?)\s*'      # Course code
            r'(?:Course\s*Name|Title)[\s:]*(.*?)\s*'     # Course name
            r'(?:Grade|Mark)[\s:]*(.*?)\s*'              # Grade
            r'(?:Credits|Units)[\s:]*(.*?)(?:\s|$)'      # Credits
        ),
        # Condensed row layout
        re.compile(
            r'(\d{4}-\d{4}|\w+\s\d{4})\s+'               # Year range or Semester Year
            r'(\d+)\s+'                                  # Grade level
            rf'({code_re})\s+'                           # Course code
            r'(.+?)\s+'                                  # Course name
            rf'({grade_re})\s+'                          # Grade
            r'(\d+\.?\d*)'                               # Credits
        ),
        # Less structured layout without year / grade level
        re.compile(
            rf'({code_re})\s+'                           # Course code
            r'(.+?)\s+'                                  # Course name
            rf'(?:Grade\s*:\s*)?({grade_re})\s*'         # Grade
            r'(?:Credits\s*:\s*)?(\d+\.?\d*)'            # Credits
        ),
    ]

    courses_by_grade = defaultdict(list)
    seen_keys = set()    # (code, name, year) triples already recorded
    seen_pairs = set()   # (code, name) pairs already recorded, any year

    def _record(code, name, mark, credit_value, year=None, level=None):
        """Normalise one match and store it unless it duplicates an earlier one."""
        code = _clean_course_code(code)
        name = name.strip()
        year = (year or '').strip() or 'Unknown'
        level = (level or '').strip() or 'Unknown'

        key = (code, name, year)
        # A match without a year that repeats an already recorded course is the
        # looser pattern re-matching a row an earlier pattern captured.
        if key in seen_keys or (year == 'Unknown' and (code, name) in seen_pairs):
            return
        seen_keys.add(key)
        seen_pairs.add((code, name))

        courses_by_grade[level].append({
            'code': code,
            'name': name,
            'grade': mark.strip() if mark else None,
            'credits': credit_value if credit_value else '0',
            'year': year,
        })

    for pattern in patterns:
        for match in pattern.finditer(text):
            groups = match.groups()
            if len(groups) == 6:
                year, level, code, name, mark, credit_value = groups
                _record(code, name, mark, credit_value, year, level)
            else:
                code, name, mark, credit_value = groups
                _record(code, name, mark, credit_value)

    if not courses_by_grade:
        # Last resort: "code name [grade] [credits]". The lookahead forces the
        # lazy name group to run up to the next course code or the end of the
        # text; without it the name would always be a single character.
        last_resort = re.compile(
            rf'({code_re})\s+(.+?)'
            rf'(?:\s+({grade_re}))?'
            r'(?:\s+(\d+\.?\d*))?'
            rf'(?=\s+{code_re}\s|\s*$)'
        )
        for match in last_resort.finditer(text):
            code, name, mark, credit_value = match.groups()
            _record(code, name, mark, credit_value)

    return courses_by_grade
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
208
- """Parse transcript file with robust error handling and OCR support."""
209
  try:
210
  if not file_obj:
211
  raise gr.Error("Please upload a file first")
@@ -237,46 +192,45 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
237
  if not text.strip():
238
  raise gr.Error("No text could be extracted from the file")
239
 
240
- # Enhanced GPA extraction
241
- gpa_data = {
242
- 'weighted': extract_gpa(text, 'Weighted'),
243
- 'unweighted': extract_gpa(text, 'Unweighted')
244
- }
245
-
246
- # Extract grade level with multiple fallback patterns
247
- grade_match = (
248
- re.search(r'Current Grade:\s*(\d+)', text) or
249
- re.search(r'Grade\s*:\s*(\d+)', text) or
250
- re.search(r'Grade\s+(\d+)', text) or
251
- re.search(r'Grade\s+Level:\s*(\d+)', text) or
252
- re.search(r'Grade\s*\(?\s*(\d+)\s*\)?', text)
253
- )
254
- grade_level = grade_match.group(1) if grade_match else "Unknown"
255
-
256
- courses_by_grade = extract_courses_from_table(text)
257
 
258
  # Format output text
259
  output_text = f"Student Transcript Summary\n{'='*40}\n"
260
- output_text += f"Current Grade Level: {grade_level}\n"
261
- output_text += f"Weighted GPA: {gpa_data['weighted']}\n"
262
- output_text += f"Unweighted GPA: {gpa_data['unweighted']}\n\n"
 
 
 
263
  output_text += "Course History:\n{'='*40}\n"
264
 
 
 
 
 
 
 
265
  for grade in sorted(courses_by_grade.keys(), key=lambda x: int(x) if x.isdigit() else x):
266
  output_text += f"\nGrade {grade}:\n{'-'*30}\n"
267
  for course in courses_by_grade[grade]:
268
- output_text += f"- {course['code']} {course['name']}"
269
  if 'grade' in course and course['grade']:
270
  output_text += f" (Grade: {course['grade']})"
271
  if 'credits' in course:
272
  output_text += f" | Credits: {course['credits']}"
273
- output_text += f" | Year: {course['year']}\n"
 
 
274
 
275
- return output_text, {
276
- "gpa": gpa_data,
277
- "grade_level": grade_level,
 
278
  "courses": dict(courses_by_grade)
279
  }
 
 
280
 
281
  except Exception as e:
282
  return f"Error processing transcript: {str(e)}", None
@@ -1359,4 +1313,4 @@ app = create_interface()
1359
  # For Hugging Face Spaces deployment
1360
  if __name__ == "__main__":
1361
  app.launch()
1362
-
 
15
  import secrets
16
  import string
17
  from huggingface_hub import HfApi, HfFolder
18
+ import requests # For API calls to DeepSeek
19
 
20
  # ========== CONFIGURATION ==========
21
  PROFILES_DIR = "student_profiles"
22
+ ALLOWED_FILE_TYPES = [".pdf", ".png", ".jpg", ".jpeg"]
23
  MAX_FILE_SIZE_MB = 5
24
  MIN_AGE = 5
25
  MAX_AGE = 120
26
  SESSION_TOKEN_LENGTH = 32
27
  HF_TOKEN = os.getenv("HF_TOKEN")
28
+ DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY") # Read from the DEEPSEEK_API_KEY environment variable; never hard-code the key here
29
+ DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions" # Example endpoint
30
 
31
  # Initialize Hugging Face API
32
  if HF_TOKEN:
 
86
  except Exception as e:
87
  raise gr.Error(f"OCR processing failed: {str(e)}")
88
 
89
+ # ========== ENHANCED TRANSCRIPT PARSING WITH DEEPSEEK ==========
90
def _extract_json_object(content) -> str:
    """Return the ``{...}`` span embedded in a model reply.

    Handles replies wrapped in a Markdown code fence (with or without a
    language tag) as well as replies with text before or after the object.

    Raises:
        ValueError: If the reply is not a string or contains no JSON object.
    """
    if not isinstance(content, str):
        raise ValueError("response content is not text")
    if '```' in content:
        # Keep only the body of the first fenced block.
        content = content.split('```')[1]
    start = content.find('{')
    end = content.rfind('}')
    if start == -1 or end < start:
        raise ValueError("response did not contain a JSON object")
    return content[start:end + 1]


def parse_transcript_with_deepseek(text: str, timeout: float = 60.0) -> Dict:
    """Ask the DeepSeek chat model to turn transcript text into structured data.

    Args:
        text: Plain text extracted from the transcript.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON object from the model. When present, 'gpa' is a dict,
        'courses' is a list of dicts, and 'grade_level' values are strings.

    Raises:
        gr.Error: If the API key is missing, the request fails, or the reply
            cannot be decoded into a JSON object.
    """
    if not DEEPSEEK_API_KEY:
        raise gr.Error("DeepSeek API key not configured")

    # The example below is valid JSON on purpose: a comment or trailing comma
    # in the template invites the model to copy it, which breaks json.loads.
    prompt = f"""
    Analyze this academic transcript and extract the following information in JSON format:
    - Current grade level
    - Weighted GPA
    - Unweighted GPA
    - List of all courses with:
      * Course code
      * Course name
      * Grade received
      * Credits earned
      * Year/semester taken
      * Grade level when taken

    Return the data in this exact JSON structure, with one object per course
    in the "courses" array, and respond with the JSON object only:
    {{
        "grade_level": "11",
        "gpa": {{
            "weighted": "4.2",
            "unweighted": "3.9"
        }},
        "courses": [
            {{
                "code": "MATH101",
                "name": "Algebra II",
                "grade": "A",
                "credits": "1.0",
                "year": "2023-2024",
                "grade_level": "11"
            }}
        ]
    }}

    Here is the transcript text to analyze:
    {text}
    """

    headers = {
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "deepseek-chat",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 2000
    }

    try:
        # Without a timeout a stalled connection would block the request forever.
        response = requests.post(
            DEEPSEEK_API_URL, headers=headers, json=payload, timeout=timeout
        )
        response.raise_for_status()
        content = response.json()['choices'][0]['message']['content']
    except requests.RequestException as exc:
        raise gr.Error(f"DeepSeek API error: {exc}") from exc
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        raise gr.Error(
            f"DeepSeek API error: unexpected response format ({exc})"
        ) from exc

    try:
        parsed = json.loads(_extract_json_object(content))
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        raise gr.Error(
            f"DeepSeek API error: could not parse model output as JSON ({exc})"
        ) from exc

    # Normalise the parts callers index into so a slightly off reply
    # (null GPA, numeric grade levels, stray non-object entries) does not crash them.
    if 'gpa' in parsed and not isinstance(parsed['gpa'], dict):
        parsed['gpa'] = {}
    if 'grade_level' in parsed and parsed['grade_level'] is not None:
        parsed['grade_level'] = str(parsed['grade_level'])
    if 'courses' in parsed:
        raw_courses = parsed['courses'] if isinstance(parsed['courses'], list) else []
        courses = [course for course in raw_courses if isinstance(course, dict)]
        for course in courses:
            if course.get('grade_level') is not None:
                course['grade_level'] = str(course['grade_level'])
        parsed['courses'] = courses

    return parsed
161
 
162
  def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
163
+ """Parse transcript file with DeepSeek enhanced parsing."""
164
  try:
165
  if not file_obj:
166
  raise gr.Error("Please upload a file first")
 
192
  if not text.strip():
193
  raise gr.Error("No text could be extracted from the file")
194
 
195
+ # Use DeepSeek for enhanced parsing
196
+ parsed_data = parse_transcript_with_deepseek(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  # Format output text
199
  output_text = f"Student Transcript Summary\n{'='*40}\n"
200
+ output_text += f"Current Grade Level: {parsed_data.get('grade_level', 'Unknown')}\n"
201
+
202
+ if 'gpa' in parsed_data:
203
+ output_text += f"Weighted GPA: {parsed_data['gpa'].get('weighted', 'N/A')}\n"
204
+ output_text += f"Unweighted GPA: {parsed_data['gpa'].get('unweighted', 'N/A')}\n\n"
205
+
206
  output_text += "Course History:\n{'='*40}\n"
207
 
208
+ # Organize courses by grade level
209
+ courses_by_grade = defaultdict(list)
210
+ for course in parsed_data.get('courses', []):
211
+ grade_level = course.get('grade_level', 'Unknown')
212
+ courses_by_grade[grade_level].append(course)
213
+
214
  for grade in sorted(courses_by_grade.keys(), key=lambda x: int(x) if x.isdigit() else x):
215
  output_text += f"\nGrade {grade}:\n{'-'*30}\n"
216
  for course in courses_by_grade[grade]:
217
+ output_text += f"- {course.get('code', '')} {course.get('name', 'Unnamed course')}"
218
  if 'grade' in course and course['grade']:
219
  output_text += f" (Grade: {course['grade']})"
220
  if 'credits' in course:
221
  output_text += f" | Credits: {course['credits']}"
222
+ if 'year' in course:
223
+ output_text += f" | Year: {course['year']}"
224
+ output_text += "\n"
225
 
226
+ # Prepare the data structure for saving
227
+ transcript_data = {
228
+ "grade_level": parsed_data.get('grade_level', 'Unknown'),
229
+ "gpa": parsed_data.get('gpa', {}),
230
  "courses": dict(courses_by_grade)
231
  }
232
+
233
+ return output_text, transcript_data
234
 
235
  except Exception as e:
236
  return f"Error processing transcript: {str(e)}", None
 
1313
  # For Hugging Face Spaces deployment
1314
  if __name__ == "__main__":
1315
  app.launch()
1316
+