Update app.py

app.py CHANGED
@@ -15,7 +15,9 @@ import io
 import secrets
 import string
 from huggingface_hub import HfApi, HfFolder
-import …
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import time
 
 # ========== CONFIGURATION ==========
 PROFILES_DIR = "student_profiles"
@@ -25,14 +27,45 @@ MIN_AGE = 5
 MAX_AGE = 120
 SESSION_TOKEN_LENGTH = 32
 HF_TOKEN = os.getenv("HF_TOKEN")
-DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")  # Add your DeepSeek API key here
-DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"  # Example endpoint
 
 # Initialize Hugging Face API
 if HF_TOKEN:
     hf_api = HfApi(token=HF_TOKEN)
     HfFolder.save_token(HF_TOKEN)
 
+# ========== DEEPSEEK MODEL LOADING ==========
+def load_deepseek_model():
+    """Load the DeepSeek model with progress tracking"""
+    progress = gr.Progress()
+    progress(0, desc="Loading DeepSeek model...")
+
+    try:
+        start_time = time.time()
+        tokenizer = AutoTokenizer.from_pretrained(
+            "deepseek-ai/DeepSeek-V3",
+            trust_remote_code=True
+        )
+        progress(0.3, desc="Loading tokenizer...")
+
+        model = AutoModelForCausalLM.from_pretrained(
+            "deepseek-ai/DeepSeek-V3",
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
+        progress(0.9, desc="Loading model weights...")
+
+        load_time = time.time() - start_time
+        print(f"DeepSeek model loaded in {load_time:.2f} seconds")
+        return model, tokenizer
+
+    except Exception as e:
+        print(f"Error loading DeepSeek model: {str(e)}")
+        return None, None
+
+# Load model at startup
+model, tokenizer = load_deepseek_model()
+
 # ========== UTILITY FUNCTIONS ==========
 def generate_session_token() -> str:
     """Generate a random session token for user identification."""
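A note on this hunk: `deepseek-ai/DeepSeek-V3` is a mixture-of-experts checkpoint with roughly 671B total parameters, so `from_pretrained` with `torch_dtype=torch.float16` implies over a terabyte of weights; on typical Space hardware this call will most likely fail and leave `model, tokenizer = None, None`. Also, `gr.Progress()` is designed to be injected into a running Gradio event handler, so the progress calls made here at import time likely report to nothing. A minimal sketch of a lighter-weight loader, assuming a smaller chat checkpoint (the model id below is illustrative, not from this commit) and that `bitsandbytes` is installed:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    MODEL_ID = "deepseek-ai/deepseek-llm-7b-chat"  # illustrative smaller checkpoint, not from the commit

    def load_quantized_model(model_id: str = MODEL_ID):
        # 4-bit NF4 quantization brings a 7B model's footprint to roughly 4 GB
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
        )
        return model, tokenizer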
@@ -77,7 +110,7 @@ def validate_file(file_obj) -> None:
     if file_size > MAX_FILE_SIZE_MB:
         raise gr.Error(f"File too large. Max size: {MAX_FILE_SIZE_MB}MB")
 
-# ==========
+# ========== TEXT EXTRACTION FUNCTIONS ==========
 def extract_text_from_file(file_path: str, file_ext: str) -> str:
     """Enhanced text extraction with better error handling and fallbacks."""
     text = ""
@@ -169,60 +202,29 @@ def remove_sensitive_info(text: str) -> str:
     text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
     return text
 
-def …(content: str) -> str:
-    """Extract JSON string from API response."""
-    # Handle markdown code blocks
-    if '```json' in content:
-        content = content.split('```json')[1].split('```')[0].strip()
-    elif '```' in content:
-        content = content.split('```')[1].split('```')[0].strip()
-
-    # Sometimes the response is pure JSON
-    return content
-
-def validate_parsed_data(data: Dict) -> Dict:
-    """Validate and clean the parsed data structure."""
-    # Ensure required fields exist
-    if not isinstance(data, dict):
-        raise ValueError("Invalid data format")
-
-    # Set default structure if missing
-    if 'grade_level' not in data:
-        data['grade_level'] = 'Unknown'
-
-    if 'gpa' not in data:
-        data['gpa'] = {'weighted': 'N/A', 'unweighted': 'N/A'}
-
-    if 'courses' not in data:
-        data['courses'] = []
-
-    # Clean course data
-    for course in data['courses']:
-        if 'grade' in course:
-            course['grade'] = course['grade'].upper().strip()
-
-        # Ensure numeric credits are strings
-        if 'credits' in course and isinstance(course['credits'], (int, float)):
-            course['credits'] = str(course['credits'])
-
-    return data
-
+# ========== TRANSCRIPT PARSING ==========
 def parse_transcript_with_deepseek(text: str) -> Dict:
-    """…
-    if …
-        raise gr.Error("DeepSeek …
+    """Use local DeepSeek model to parse transcript text"""
+    if model is None or tokenizer is None:
+        raise gr.Error("DeepSeek model failed to load. Please try again later.")
 
-    # Pre-process the text
-    text = remove_sensitive_info(text)
+    # Pre-process the text
+    text = remove_sensitive_info(text[:15000])  # Limit to first 15k chars
 
-    # Create a more robust prompt with examples
     prompt = f"""
-    Analyze this academic transcript and extract structured information
-    …
+    Analyze this academic transcript and extract structured information:
+    - Current grade level
+    - Weighted GPA (if available)
+    - Unweighted GPA (if available)
+    - List of all courses with:
+      * Course code
+      * Course name
+      * Grade received
+      * Credits earned
+      * Year/semester taken
+      * Grade level when taken
+
+    Return the data in this JSON structure:
     {{
         "grade_level": "11",
         "gpa": {{
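One detail in the new prompt: it is an f-string, so the braces of the JSON skeleton must be doubled (`{{` / `}}`) to come out as literal braces, while `{text}` stays single-braced so the transcript is interpolated. A self-contained illustration of the same escaping rule (the sample strings are made up):

    text = "Student: J. Doe, Grade 11 ..."
    prompt = f"""Return JSON like {{"grade_level": "11"}} for: {text}"""
    print(prompt)
    # Return JSON like {"grade_level": "11"} for: Student: J. Doe, Grade 11 ...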
@@ -240,44 +242,76 @@ def parse_transcript_with_deepseek(text: str) -> Dict:
         }}
     ]
 }}
-
+
 Transcript Text:
-{text…
+{text}
 """
 
-    headers = {
-        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
-        "Content-Type": "application/json"
-    }
-
-    payload = {
-        "model": "deepseek-chat",
-        "messages": [{"role": "user", "content": prompt}],
-        "temperature": 0.1,
-        "max_tokens": 2000
-    }
-
     try:
-        …
+        # Show progress to user
+        progress = gr.Progress()
+        progress(0, desc="Analyzing transcript...")
+
+        # Tokenize and generate response
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        progress(0.3)
+
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=2000,
+            temperature=0.1,
+            do_sample=True
+        )
+        progress(0.8)
 
-        …
-        # Extract JSON from response
-        …
+        # Decode the response
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        progress(0.9)
 
-        # …
-        parsed_data = …
+        # Extract the JSON content from the response
+        if '```json' in response:
+            json_str = response.split('```json')[1].split('```')[0].strip()
+        elif '```' in response:
+            json_str = response.split('```')[1].split('```')[0].strip()
+        else:
+            json_str = response
 
-        return parsed_data
+        # Parse and validate the JSON
+        parsed_data = json.loads(json_str)
+        progress(1.0)
+
+        return validate_parsed_data(parsed_data)
 
-    except …
-        raise gr.Error(…
-    except json.JSONDecodeError as e:
-        raise gr.Error(f"Failed to parse API response: {str(e)}")
+    except torch.cuda.OutOfMemoryError:
+        raise gr.Error("The model ran out of memory. Try with a smaller transcript or upgrade your GPU.")
     except Exception as e:
-        raise gr.Error(f"…
+        raise gr.Error(f"Error processing transcript: {str(e)}")
+
+def validate_parsed_data(data: Dict) -> Dict:
+    """Validate and clean the parsed data structure."""
+    if not isinstance(data, dict):
+        raise ValueError("Invalid data format")
+
+    # Set default structure if missing
+    if 'grade_level' not in data:
+        data['grade_level'] = 'Unknown'
+
+    if 'gpa' not in data:
+        data['gpa'] = {'weighted': 'N/A', 'unweighted': 'N/A'}
+
+    if 'courses' not in data:
+        data['courses'] = []
+
+    # Clean course data
+    for course in data['courses']:
+        if 'grade' in course:
+            course['grade'] = course['grade'].upper().strip()
+
+        # Ensure numeric credits are strings
+        if 'credits' in course and isinstance(course['credits'], (int, float)):
+            course['credits'] = str(course['credits'])
+
+    return data
 
 def format_transcript_output(data: Dict) -> str:
     """Format the parsed data into human-readable text."""
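Two behaviors of this hunk are worth knowing. First, `model.generate` returns the prompt tokens followed by the completion, so `tokenizer.decode(outputs[0], ...)` re-decodes the entire prompt as well; the fenced-block split then only works because the prompt itself contains no backticks. A common variant (a sketch, not what this commit does) decodes just the newly generated tokens:

    # outputs[0] is prompt tokens followed by the new tokens
    prompt_len = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

Second, `json.loads` failures are no longer caught by a dedicated `json.JSONDecodeError` branch as in the removed code; they now fall through to the generic `except Exception` and surface as "Error processing transcript: ...".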
@@ -326,10 +360,10 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
     # Extract text from file
     text = extract_text_from_file(file_obj.name, file_ext)
 
-    # …
+    # Use DeepSeek for parsing
     parsed_data = parse_transcript_with_deepseek(text)
 
-    # Format output
+    # Format output text
     output_text = format_transcript_output(parsed_data)
 
     # Prepare the data structure for saving
@@ -339,7 +373,7 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
         "courses": defaultdict(list)
     }
 
-    # Organize courses by grade level
+    # Organize courses by grade level
     for course in parsed_data.get('courses', []):
         grade_level = course.get('grade_level', 'Unknown')
         transcript_data["courses"][grade_level].append(course)
@@ -1043,6 +1077,13 @@ def create_interface():
         background-color: #ffebee;
         color: #c62828;
     }
+    .model-loading {
+        padding: 15px;
+        margin: 15px 0;
+        border-radius: 4px;
+        background-color: #fff3e0;
+        color: #e65100;
+    }
     """
 
     gr.Markdown("""
@@ -1051,6 +1092,12 @@ def create_interface():
     Complete each step to get customized learning recommendations.
     """)
 
+    # Model loading status
+    model_status = gr.HTML(
+        value="<div class='model-loading'>Loading AI model... (This may take a few minutes)</div>" if model is None else "",
+        visible=model is None
+    )
+
     # Progress tracker - now with dynamic styling
     with gr.Row():
         with gr.Column(scale=1):
@@ -1101,6 +1148,9 @@ def create_interface():
     transcript_data = gr.State()
 
     def process_transcript_and_update(file_obj, current_tab_status):
+        if model is None:
+            return "Error: AI model failed to load. Please try again later.", None, current_tab_status, gr.update(), gr.update(), gr.update()
+
         output_text, data = parse_transcript(file_obj)
         if "Error" not in output_text:
             new_status = current_tab_status.copy()
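The early-return guard added here must produce exactly one value per component wired as an output of the event that calls `process_transcript_and_update`; `gr.update()` is the "leave this component unchanged" placeholder, so the six-slot tuple implies six output components in the listener (which this diff's context does not show). A hypothetical, self-contained illustration of the arity rule, with names that are not from the commit:

    import gradio as gr

    def guarded_handler(file_obj, status):
        # Six wired outputs -> every return path needs six values;
        # gr.update() leaves the corresponding component untouched.
        if file_obj is None:
            return "No file uploaded.", None, status, gr.update(), gr.update(), gr.update()
        return "Parsed!", {"ok": True}, status, gr.update(), gr.update(), gr.update()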
@@ -1418,6 +1468,14 @@ def create_interface():
         inputs=[gr.State(4), tab_completed],
         outputs=[tabs, nav_message, quiz_alert]
     )
+
+    # Check model loading status periodically
+    def check_model_status():
+        if model is not None and tokenizer is not None:
+            return gr.update(visible=False)
+        return gr.update(visible=True)
+
+    app.load(check_model_status, None, model_status, every=1)
 
     return app
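A note on the polling added at the end: `app.load(check_model_status, None, model_status, every=1)` re-runs the check every second, but `load_deepseek_model()` runs synchronously at import, so `model` and `tokenizer` already hold their final values before the UI is served and the banner's visibility never changes afterwards. Polling only becomes meaningful if loading moves off the main thread; a minimal sketch of that variant, assuming the module-level names from this diff:

    import threading

    model, tokenizer = None, None  # placeholders until the background load finishes

    def _load_model_async():
        global model, tokenizer
        model, tokenizer = load_deepseek_model()

    # Start loading without blocking app startup; the 1-second poll in
    # check_model_status hides the banner once both names are populated.
    threading.Thread(target=_load_model_async, daemon=True).start()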