Dannyar608 committed on
Commit e881a6a · verified · 1 Parent(s): fcf1816

Update app.py

Files changed (1)
  1. app.py +143 -85
app.py CHANGED
@@ -15,7 +15,9 @@ import io
 import secrets
 import string
 from huggingface_hub import HfApi, HfFolder
-import requests # For API calls to DeepSeek
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import time
 
 # ========== CONFIGURATION ==========
 PROFILES_DIR = "student_profiles"
@@ -25,14 +27,45 @@ MIN_AGE = 5
 MAX_AGE = 120
 SESSION_TOKEN_LENGTH = 32
 HF_TOKEN = os.getenv("HF_TOKEN")
-DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY") # Add your DeepSeek API key here
-DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions" # Example endpoint
 
 # Initialize Hugging Face API
 if HF_TOKEN:
     hf_api = HfApi(token=HF_TOKEN)
     HfFolder.save_token(HF_TOKEN)
 
+# ========== DEEPSEEK MODEL LOADING ==========
+def load_deepseek_model():
+    """Load the DeepSeek model with progress tracking"""
+    progress = gr.Progress()
+    progress(0, desc="Loading DeepSeek model...")
+
+    try:
+        start_time = time.time()
+        tokenizer = AutoTokenizer.from_pretrained(
+            "deepseek-ai/DeepSeek-V3",
+            trust_remote_code=True
+        )
+        progress(0.3, desc="Loading tokenizer...")
+
+        model = AutoModelForCausalLM.from_pretrained(
+            "deepseek-ai/DeepSeek-V3",
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
+        progress(0.9, desc="Loading model weights...")
+
+        load_time = time.time() - start_time
+        print(f"DeepSeek model loaded in {load_time:.2f} seconds")
+        return model, tokenizer
+
+    except Exception as e:
+        print(f"Error loading DeepSeek model: {str(e)}")
+        return None, None
+
+# Load model at startup
+model, tokenizer = load_deepseek_model()
+
 # ========== UTILITY FUNCTIONS ==========
 def generate_session_token() -> str:
     """Generate a random session token for user identification."""
@@ -77,7 +110,7 @@ def validate_file(file_obj) -> None:
     if file_size > MAX_FILE_SIZE_MB:
         raise gr.Error(f"File too large. Max size: {MAX_FILE_SIZE_MB}MB")
 
-# ========== ENHANCED TRANSCRIPT PARSING ==========
+# ========== TEXT EXTRACTION FUNCTIONS ==========
 def extract_text_from_file(file_path: str, file_ext: str) -> str:
     """Enhanced text extraction with better error handling and fallbacks."""
     text = ""
@@ -169,60 +202,29 @@ def remove_sensitive_info(text: str) -> str:
     text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
     return text
 
-def extract_json_from_response(content: str) -> str:
-    """Extract JSON string from API response."""
-    # Handle markdown code blocks
-    if '```json' in content:
-        content = content.split('```json')[1].split('```')[0].strip()
-    elif '```' in content:
-        content = content.split('```')[1].split('```')[0].strip()
-
-    # Sometimes the response is pure JSON
-    return content
-
-def validate_parsed_data(data: Dict) -> Dict:
-    """Validate and clean the parsed data structure."""
-    # Ensure required fields exist
-    if not isinstance(data, dict):
-        raise ValueError("Invalid data format")
-
-    # Set default structure if missing
-    if 'grade_level' not in data:
-        data['grade_level'] = 'Unknown'
-
-    if 'gpa' not in data:
-        data['gpa'] = {'weighted': 'N/A', 'unweighted': 'N/A'}
-
-    if 'courses' not in data:
-        data['courses'] = []
-
-    # Clean course data
-    for course in data['courses']:
-        if 'grade' in course:
-            course['grade'] = course['grade'].upper().strip()
-
-        # Ensure numeric credits are strings
-        if 'credits' in course and isinstance(course['credits'], (int, float)):
-            course['credits'] = str(course['credits'])
-
-    return data
-
+# ========== TRANSCRIPT PARSING ==========
 def parse_transcript_with_deepseek(text: str) -> Dict:
-    """Improved DeepSeek API integration with better error handling."""
-    if not DEEPSEEK_API_KEY:
-        raise gr.Error("DeepSeek API key not configured")
+    """Use local DeepSeek model to parse transcript text"""
+    if model is None or tokenizer is None:
+        raise gr.Error("DeepSeek model failed to load. Please try again later.")
 
-    # Pre-process the text to remove sensitive information
-    text = remove_sensitive_info(text)
+    # Pre-process the text
+    text = remove_sensitive_info(text[:15000]) # Limit to first 15k chars
 
-    # Create a more robust prompt with examples
     prompt = f"""
-Analyze this academic transcript and extract structured information. Follow these rules:
-1. Extract data even if partially visible
-2. Guess missing values when reasonable
-3. Return empty if completely missing
-
-Required JSON structure:
+Analyze this academic transcript and extract structured information:
+- Current grade level
+- Weighted GPA (if available)
+- Unweighted GPA (if available)
+- List of all courses with:
+  * Course code
+  * Course name
+  * Grade received
+  * Credits earned
+  * Year/semester taken
+  * Grade level when taken
+
+Return the data in this JSON structure:
 {{
     "grade_level": "11",
     "gpa": {{
@@ -240,44 +242,76 @@ def parse_transcript_with_deepseek(text: str) -> Dict:
         }}
     ]
 }}
-
+
 Transcript Text:
-{text[:15000]} # Limit to first 15k chars to avoid token limits
+{text}
 """
 
-    headers = {
-        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
-        "Content-Type": "application/json"
-    }
-
-    payload = {
-        "model": "deepseek-chat",
-        "messages": [{"role": "user", "content": prompt}],
-        "temperature": 0.1,
-        "max_tokens": 2000
-    }
-
     try:
-        response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload, timeout=30)
-        response.raise_for_status()
-        result = response.json()
-
-        content = result['choices'][0]['message']['content']
-
-        # Extract JSON from response (handling markdown code blocks)
-        json_str = extract_json_from_response(content)
-
-        # Validate and clean the parsed data
-        parsed_data = validate_parsed_data(json.loads(json_str))
-
-        return parsed_data
-
-    except requests.exceptions.RequestException as e:
-        raise gr.Error(f"API request failed: {str(e)}")
-    except json.JSONDecodeError as e:
-        raise gr.Error(f"Failed to parse API response: {str(e)}")
+        # Show progress to user
+        progress = gr.Progress()
+        progress(0, desc="Analyzing transcript...")
+
+        # Tokenize and generate response
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        progress(0.3)
+
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=2000,
+            temperature=0.1,
+            do_sample=True
+        )
+        progress(0.8)
+
+        # Decode the response
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        progress(0.9)
+
+        # Extract the JSON content from the response
+        if '```json' in response:
+            json_str = response.split('```json')[1].split('```')[0].strip()
+        elif '```' in response:
+            json_str = response.split('```')[1].split('```')[0].strip()
+        else:
+            json_str = response
+
+        # Parse and validate the JSON
+        parsed_data = json.loads(json_str)
+        progress(1.0)
+
+        return validate_parsed_data(parsed_data)
+
+    except torch.cuda.OutOfMemoryError:
+        raise gr.Error("The model ran out of memory. Try with a smaller transcript or upgrade your GPU.")
     except Exception as e:
-        raise gr.Error(f"DeepSeek processing error: {str(e)}")
+        raise gr.Error(f"Error processing transcript: {str(e)}")
+
+def validate_parsed_data(data: Dict) -> Dict:
+    """Validate and clean the parsed data structure."""
+    if not isinstance(data, dict):
+        raise ValueError("Invalid data format")
+
+    # Set default structure if missing
+    if 'grade_level' not in data:
+        data['grade_level'] = 'Unknown'
+
+    if 'gpa' not in data:
+        data['gpa'] = {'weighted': 'N/A', 'unweighted': 'N/A'}
+
+    if 'courses' not in data:
+        data['courses'] = []
+
+    # Clean course data
+    for course in data['courses']:
+        if 'grade' in course:
+            course['grade'] = course['grade'].upper().strip()
+
+        # Ensure numeric credits are strings
+        if 'credits' in course and isinstance(course['credits'], (int, float)):
+            course['credits'] = str(course['credits'])
+
+    return data
 
 def format_transcript_output(data: Dict) -> str:
     """Format the parsed data into human-readable text."""
@@ -326,10 +360,10 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
     # Extract text from file
     text = extract_text_from_file(file_obj.name, file_ext)
 
-    # Parse with DeepSeek
+    # Use DeepSeek for parsing
    parsed_data = parse_transcript_with_deepseek(text)
 
-    # Format output
+    # Format output text
    output_text = format_transcript_output(parsed_data)
 
    # Prepare the data structure for saving
@@ -339,7 +373,7 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
         "courses": defaultdict(list)
     }
 
-    # Organize courses by grade level for saving
+    # Organize courses by grade level
     for course in parsed_data.get('courses', []):
         grade_level = course.get('grade_level', 'Unknown')
         transcript_data["courses"][grade_level].append(course)
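Since `transcript_data["courses"]` is a `defaultdict(list)`, appending under a grade level that has not been seen yet creates the empty list automatically. A standalone example of the same grouping pattern, with made-up course data:

from collections import defaultdict

courses_by_grade = defaultdict(list)
sample_courses = [
    {"name": "Algebra II", "grade_level": "10"},
    {"name": "Chemistry", "grade_level": "10"},
    {"name": "US History", "grade_level": "11"},
]
for course in sample_courses:
    courses_by_grade[course.get("grade_level", "Unknown")].append(course)

print({grade: len(items) for grade, items in courses_by_grade.items()})
# -> {'10': 2, '11': 1}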
@@ -1043,6 +1077,13 @@ def create_interface():
         background-color: #ffebee;
         color: #c62828;
     }
+    .model-loading {
+        padding: 15px;
+        margin: 15px 0;
+        border-radius: 4px;
+        background-color: #fff3e0;
+        color: #e65100;
+    }
     """
 
     gr.Markdown("""
@@ -1051,6 +1092,12 @@ def create_interface():
     Complete each step to get customized learning recommendations.
     """)
 
+    # Model loading status
+    model_status = gr.HTML(
+        value="<div class='model-loading'>Loading AI model... (This may take a few minutes)</div>" if model is None else "",
+        visible=model is None
+    )
+
     # Progress tracker - now with dynamic styling
     with gr.Row():
         with gr.Column(scale=1):
@@ -1101,6 +1148,9 @@ def create_interface():
     transcript_data = gr.State()
 
     def process_transcript_and_update(file_obj, current_tab_status):
+        if model is None:
+            return "Error: AI model failed to load. Please try again later.", None, current_tab_status, gr.update(), gr.update(), gr.update()
+
         output_text, data = parse_transcript(file_obj)
         if "Error" not in output_text:
             new_status = current_tab_status.copy()
@@ -1418,6 +1468,14 @@ def create_interface():
         inputs=[gr.State(4), tab_completed],
         outputs=[tabs, nav_message, quiz_alert]
     )
+
+    # Check model loading status periodically
+    def check_model_status():
+        if model is not None and tokenizer is not None:
+            return gr.update(visible=False)
+        return gr.update(visible=True)
+
+    app.load(check_model_status, None, model_status, every=1)
 
     return app
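Note on this hunk: in Gradio, `every=1` re-runs the function roughly once per second after the page loads. Because `model` and `tokenizer` are assigned once at import and never reassigned, this particular poll can never change its answer; the pattern is still useful when the polled state does change. A self-contained sketch of that pattern (a hypothetical uptime example, unrelated to this app's logic):

import time
import gradio as gr

START = time.time()

def uptime() -> str:
    """Report elapsed seconds since the process started."""
    return f"Up for {time.time() - START:.0f}s"

with gr.Blocks() as demo:
    status = gr.Textbox(label="Status")
    # Re-run uptime() every second once the page has loaded.
    demo.load(uptime, None, status, every=1)

demo.launch()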
 
 