Dannyar608 committed on
Commit
9abe9f0
·
verified ·
1 Parent(s): 40dbed7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -146
app.py CHANGED
@@ -5,113 +5,129 @@ import os
5
  import re
6
  from PyPDF2 import PdfReader
7
  from collections import defaultdict
8
-
9
- # ========== TRANSCRIPT PARSING FUNCTIONS ==========
10
- def extract_courses_with_grade_levels(text):
11
- # First extract the current grade level
12
- grade_level_pattern = r"(Grade|Year)\s*[:]?\s*(\d+|Freshman|Sophomore|Junior|Senior)"
13
- grade_match = re.search(grade_level_pattern, text, re.IGNORECASE)
14
- current_grade_level = grade_match.group(2) if grade_match else "Unknown"
15
-
16
- # Improved course pattern to better match course codes and names
17
- course_pattern = r"""
18
- (?:^|\n)
19
- (?: (Grade|Year)\s*[:]?\s*(\d+|Freshman|Sophomore|Junior|Senior)\s*[\n-]* )? # Optional grade level context
20
- (
21
- (?:[A-Z]{2,}\s?\d{3}[A-Z]?\b) # Course codes like MATH101 or CS 201A
22
- |
23
- (?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+) # Course names like "Calculus I" or "World History"
24
- )
25
- \s*
26
- (?: [:\-]?\s* ([A-F][+-]?|\d{2,3}%)? )? # Optional grade
27
- \s*
28
- (?: [:\-]?\s* (\d\.\d{1,2})? )? # Optional credit hours
29
- """
30
-
31
- courses_by_grade = defaultdict(list)
32
- current_grade = current_grade_level
33
-
34
- for match in re.finditer(course_pattern, text, re.VERBOSE | re.MULTILINE):
35
- grade_context, grade_level, course, grade, credits = match.groups()
36
-
37
- if grade_context:
38
- current_grade = grade_level
39
-
40
- if course:
41
- course_info = {
42
- "course": course.strip(),
43
- "grade_level": current_grade
44
- }
45
- if grade:
46
- course_info["grade"] = grade.strip()
47
- if credits:
48
- course_info["credits"] = credits.strip()
49
- courses_by_grade[current_grade].append(course_info)
50
-
51
- return dict(courses_by_grade)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  def parse_transcript(file):
54
- if file.name.endswith('.csv'):
55
- df = pd.read_csv(file)
56
- elif file.name.endswith('.xlsx'):
57
- df = pd.read_excel(file)
58
- elif file.name.endswith('.pdf'):
59
  text = ''
60
  reader = PdfReader(file)
61
  for page in reader.pages:
62
- page_text = page.extract_text()
63
- if page_text:
64
- text += page_text + '\n'
65
-
66
- # Grade level extraction
67
- grade_match = re.search(r'(Grade|Year)[\s:]*(\d+|Freshman|Sophomore|Junior|Senior)', text, re.IGNORECASE)
68
- grade_level = grade_match.group(2) if grade_match else "Unknown"
69
-
70
- # Enhanced GPA extraction
71
- gpa_data = {'weighted': "N/A", 'unweighted': "N/A"}
72
- gpa_patterns = [
73
- r'Weighted GPA[\s:]*(\d\.\d{1,2})',
74
- r'GPA \(Weighted\)[\s:]*(\d\.\d{1,2})',
75
- r'Cumulative GPA \(Weighted\)[\s:]*(\d\.\d{1,2})',
76
- r'Unweighted GPA[\s:]*(\d\.\d{1,2})',
77
- r'GPA \(Unweighted\)[\s:]*(\d\.\d{1,2})',
78
- r'Cumulative GPA \(Unweighted\)[\s:]*(\d\.\d{1,2})',
79
- r'GPA[\s:]*(\d\.\d{1,2})'
80
- ]
81
- for pattern in gpa_patterns:
82
- for match in re.finditer(pattern, text, re.IGNORECASE):
83
- gpa_value = match.group(1)
84
- if 'weighted' in pattern.lower():
85
- gpa_data['weighted'] = gpa_value
86
- elif 'unweighted' in pattern.lower():
87
- gpa_data['unweighted'] = gpa_value
88
- else:
89
- if gpa_data['unweighted'] == "N/A":
90
- gpa_data['unweighted'] = gpa_value
91
- if gpa_data['weighted'] == "N/A":
92
- gpa_data['weighted'] = gpa_value
93
-
94
- courses_by_grade = extract_courses_with_grade_levels(text)
95
-
96
- output_text = f"Grade Level: {grade_level}\n\n"
97
- if gpa_data['weighted'] != "N/A" or gpa_data['unweighted'] != "N/A":
98
- output_text += "GPA Information:\n"
99
- if gpa_data['unweighted'] != "N/A":
100
- output_text += f"- Unweighted GPA: {gpa_data['unweighted']}\n"
101
- if gpa_data['weighted'] != "N/A":
102
- output_text += f"- Weighted GPA: {gpa_data['weighted']}\n"
103
  else:
104
- output_text += "No GPA information found\n"
105
-
 
 
 
 
 
106
  return output_text, {
107
  "gpa": gpa_data,
108
- "grade_level": grade_level,
109
- "courses": courses_by_grade
110
  }
 
 
 
 
111
  else:
112
  return "Unsupported file format", None
113
 
114
- # For CSV/XLSX fallback
115
  gpa = "N/A"
116
  for col in ['GPA', 'Grade Point Average', 'Cumulative GPA']:
117
  if col in df.columns:
@@ -136,31 +152,6 @@ def parse_transcript(file):
136
  "courses": courses
137
  }
138
 
139
- # ... [keep all other functions the same until transcript_display] ...
140
-
141
- def transcript_display(transcript_dict):
142
- if not transcript_dict:
143
- return "No transcript uploaded."
144
-
145
- if isinstance(transcript_dict, dict) and "courses" in transcript_dict:
146
- if isinstance(transcript_dict["courses"], dict):
147
- display = "### Course History\n\n"
148
- for grade_level, courses in transcript_dict["courses"].items():
149
- display += f"**Grade {grade_level}**\n"
150
- for course in courses:
151
- display += f"- {course.get('course', 'N/A')}"
152
- if 'grade' in course:
153
- display += f" (Grade: {course['grade']})"
154
- if 'credits' in course:
155
- display += f" | Credits: {course['credits']}"
156
- display += "\n"
157
- display += "\n"
158
- return display
159
- elif isinstance(transcript_dict["courses"], list):
160
- return "### Courses\n" + "\n".join([f"- {course}" for course in transcript_dict["courses"]])
161
-
162
- return "No course information available in the expected format."
163
-
164
  # ========== LEARNING STYLE QUIZ ==========
165
  learning_style_questions = [
166
  "When you study for a test, you prefer to:",
@@ -278,8 +269,10 @@ def learning_style_quiz(*answers):
278
 
279
  return result
280
 
281
- # ========== SAVE STUDENT PROFILE FUNCTION ==========
282
- def save_profile(name, age, interests, transcript, learning_style, movie, movie_reason, show, show_reason, book, book_reason, character, character_reason, blog):
 
 
283
  # Convert age to int if it's a numpy number (from gradio Number input)
284
  age = int(age) if age else 0
285
 
@@ -326,22 +319,43 @@ def save_profile(name, age, interests, transcript, learning_style, movie, movie_
326
  return markdown_summary
327
 
328
  def transcript_display(transcript_dict):
329
- if not transcript_dict:
330
- return "No transcript uploaded."
331
- if isinstance(transcript_dict, dict) and "courses" in transcript_dict:
332
- if isinstance(transcript_dict["courses"], dict):
333
- display = ""
334
- for grade_level, courses in transcript_dict["courses"].items():
335
- display += f"\n**Grade {grade_level}**\n"
336
- for course in courses:
337
- display += f"- {course['course']}"
 
 
 
338
  if 'grade' in course:
339
  display += f" (Grade: {course['grade']})"
 
 
340
  display += "\n"
341
- return display
342
- elif isinstance(transcript_dict["courses"], list):
343
- return "\n".join([f"- {course}" for course in transcript_dict["courses"]])
344
- return "No course information available"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
  # ========== AI TEACHING ASSISTANT ==========
347
  def load_profile():
@@ -432,22 +446,25 @@ def generate_response(message, history):
432
  # ========== GRADIO INTERFACE ==========
433
  with gr.Blocks() as app:
434
  with gr.Tab("Step 1: Upload Transcript"):
435
- transcript_file = gr.File(label="Upload your transcript (CSV, Excel, or PDF)")
436
- transcript_output = gr.Textbox(label="Transcript Output")
 
437
  transcript_data = gr.State()
438
- transcript_file.change(fn=parse_transcript, inputs=transcript_file, outputs=[transcript_output, transcript_data])
 
 
 
 
439
 
440
  with gr.Tab("Step 2: Learning Style Quiz"):
441
  gr.Markdown("### Learning Style Quiz (20 Questions)")
442
  quiz_components = []
443
  for i, (question, options) in enumerate(zip(learning_style_questions, learning_style_options)):
444
- quiz_components.append(
445
- gr.Radio(options, label=f"{i+1}. {question}")
446
- )
447
 
448
- learning_output = gr.Textbox(label="Learning Style Result", lines=10)
449
  gr.Button("Submit Quiz").click(
450
- learning_style_quiz,
451
  inputs=quiz_components,
452
  outputs=learning_output
453
  )
@@ -471,7 +488,6 @@ with gr.Blocks() as app:
471
  with gr.Tab("Step 4: Save & Review"):
472
  output_summary = gr.Markdown()
473
  save_btn = gr.Button("Save Profile")
474
-
475
  save_btn.click(
476
  fn=save_profile,
477
  inputs=[name, age, interests, transcript_data, learning_output,
@@ -480,7 +496,6 @@ with gr.Blocks() as app:
480
  outputs=output_summary
481
  )
482
 
483
- # AI Teaching Assistant Tab
484
  with gr.Tab("🤖 AI Teaching Assistant"):
485
  gr.Markdown("## Your Personalized Learning Assistant")
486
  chatbot = gr.ChatInterface(
@@ -494,5 +509,5 @@ with gr.Blocks() as app:
494
  )
495
 
496
  if __name__ == "__main__":
497
- app.launch()
498
 
 
5
  import re
6
  from PyPDF2 import PdfReader
7
  from collections import defaultdict
8
# Initialize the optional NER model.
# The import lives inside the ``try`` so that a missing ``transformers``
# package (or a failed model load) leaves ``ner_pipeline`` as None and the
# app can still start; callers check ``ner_pipeline`` for truthiness before
# using it.
try:
    from transformers import pipeline

    ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")
except Exception as e:  # deliberate best-effort: any load failure disables NER
    print(f"Could not load NER model: {e}")
    ner_pipeline = None
16
+
17
+ # ========== IMPROVED TRANSCRIPT PARSING ==========
18
def extract_gpa(text):
    """Pull weighted and unweighted GPA values out of transcript text.

    Args:
        text: Raw transcript text. ``None`` or an empty string is accepted.

    Returns:
        A dict with the keys ``'weighted'`` and ``'unweighted'``. Each value
        is the matched GPA as a string (for example ``"3.85"``) or ``"N/A"``
        when nothing was found. When several labelled values of the same kind
        are present, the last one matched wins.
    """
    gpa_data = {'weighted': "N/A", 'unweighted': "N/A"}
    if not text:
        return gpa_data

    value = r'[\s:]*(\d\.\d{1,2})'
    # Each pattern carries its kind explicitly. Classifying by searching the
    # pattern text for "weighted" is wrong because "unweighted" contains it.
    labelled_patterns = [
        # The look-behind stops "Weighted GPA" matching inside "Unweighted GPA".
        ('weighted', r'(?<![A-Za-z])Weighted GPA' + value),
        ('weighted', r'GPA \(Weighted\)' + value),
        ('weighted', r'Cumulative GPA \(Weighted\)' + value),
        ('unweighted', r'Unweighted GPA' + value),
        ('unweighted', r'GPA \(Unweighted\)' + value),
        ('unweighted', r'Cumulative GPA \(Unweighted\)' + value),
    ]

    # Offsets of values that already belong to a labelled GPA, so the generic
    # fallback below does not re-read them as an unlabelled GPA.
    claimed = set()
    for kind, pattern in labelled_patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            gpa_data[kind] = match.group(1)
            claimed.add(match.start(1))

    # An unlabelled "GPA: x.xx" fills whichever kinds are still unknown.
    for match in re.finditer(r'GPA' + value, text, re.IGNORECASE):
        if match.start(1) in claimed:
            continue
        for kind in ('unweighted', 'weighted'):
            if gpa_data[kind] == "N/A":
                gpa_data[kind] = match.group(1)

    return gpa_data
42
+
43
def extract_courses_with_regex(text):
    """Find course entries at the start of lines in transcript text.

    Two line shapes are recognised: a course code such as ``MATH101`` or
    ``CS 201A``, and a multi-word title such as ``World History``. Either may
    be followed on the same line by a letter grade (``A``-``F`` with optional
    ``+``/``-``) or a percentage such as ``95%``.

    Args:
        text: Raw transcript text. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of ``{'name': str, 'grade': str | None}`` dicts. All course-code
        matches come first, followed by all title matches.
    """
    if not text:
        return []

    # A letter grade must not be the first letter of a following word, so
    # "MATH101 Algebra" has no grade rather than the grade "A".
    grade = r'([A-F][+-]?(?![A-Za-z])|\d{2,3}%)'
    # [ \t] is used instead of \s so a name or grade never spills onto the
    # next line. The former third pattern (department prefix only) had no
    # grade group, which made group(2) raise IndexError, and it only ever
    # matched lines the course-code pattern already matches, so it is dropped.
    patterns = [
        r'(?:^|\n)([A-Z]{2,}[ \t]*-?[ \t]*\d{3}[A-Z]?\b)(?:[ \t]*' + grade + r')?',
        r'(?:^|\n)([A-Z][a-z]+(?:[ \t]+[A-Z]?[a-z]+)+)(?:[ \t]*[:\-]?[ \t]*' + grade + r')?',
    ]

    courses = []
    for pattern in patterns:
        for match in re.finditer(pattern, text, re.MULTILINE):
            found_grade = match.group(2)
            courses.append({
                'name': match.group(1).strip(),
                'grade': found_grade.strip() if found_grade else None,
            })
    return courses
56
+
57
def extract_grade_levels(text):
    """Split transcript text into sections, one per grade/year/term header.

    A header is the word ``Grade``, ``Year`` or ``Term`` (any case), an
    optional colon, then a number or one of Freshman/Sophomore/Junior/Senior.
    Each section runs from its header to the start of the next header, or to
    the end of the text. Text before the first header is not part of any
    section.

    Args:
        text: Raw transcript text. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of ``{'grade': str, 'text': str}`` dicts in document order.
    """
    if not text:
        return []

    # The leading \b keeps words that merely end in a keyword ("Midterm 2",
    # "Upgrade 3") from being treated as section headers.
    grade_pattern = r'\b(?:Grade|Year|Term)\s*[:]?\s*(\d+|Freshman|Sophomore|Junior|Senior)\b'
    headers = list(re.finditer(grade_pattern, text, re.IGNORECASE))

    starts = [header.start() for header in headers]
    ends = starts[1:] + [len(text)]
    return [
        {'grade': header.group(1), 'text': text[start:end]}
        for header, start, end in zip(headers, starts, ends)
    ]
69
 
70
  def parse_transcript(file):
71
+ if file.name.endswith('.pdf'):
 
 
 
 
72
  text = ''
73
  reader = PdfReader(file)
74
  for page in reader.pages:
75
+ text += page.extract_text() + '\n'
76
+
77
+ # Try both NER and regex approaches
78
+ courses = []
79
+ if ner_pipeline:
80
+ try:
81
+ entities = ner_pipeline(text)
82
+ current_course = {}
83
+ for entity in entities:
84
+ if entity['word'].startswith('##'):
85
+ current_course['name'] = current_course.get('name', '') + entity['word'][2:]
86
+ elif entity['entity'] in ['B-ORG', 'I-ORG']: # Using ORG as proxy for courses
87
+ if 'name' in current_course:
88
+ courses.append(current_course)
89
+ current_course = {'name': entity['word']}
90
+ elif entity['entity'] == 'GRADE' and current_course:
91
+ current_course['grade'] = entity['word']
92
+ if current_course:
93
+ courses.append(current_course)
94
+ except Exception as e:
95
+ print(f"NER failed: {e}")
96
+
97
+ # Fallback to regex if NER didn't find courses
98
+ if not courses:
99
+ courses = extract_courses_with_regex(text)
100
+
101
+ # Organize by grade level
102
+ grade_sections = extract_grade_levels(text)
103
+ courses_by_grade = defaultdict(list)
104
+
105
+ if grade_sections:
106
+ for section in grade_sections:
107
+ section_courses = extract_courses_with_regex(section['text'])
108
+ for course in section_courses:
109
+ course['term'] = section['grade']
110
+ courses_by_grade[section['grade']].append(course)
 
 
 
 
 
111
  else:
112
+ courses_by_grade["All"] = courses
113
+
114
+ gpa_data = extract_gpa(text)
115
+
116
+ output_text = "Transcript parsed successfully\n"
117
+ output_text += f"Found {len(courses)} courses across {len(courses_by_grade)} grade levels\n"
118
+
119
  return output_text, {
120
  "gpa": gpa_data,
121
+ "courses": dict(courses_by_grade)
 
122
  }
123
+ elif file.name.endswith('.csv'):
124
+ df = pd.read_csv(file)
125
+ elif file.name.endswith('.xlsx'):
126
+ df = pd.read_excel(file)
127
  else:
128
  return "Unsupported file format", None
129
 
130
+ # Fallback for CSV/Excel
131
  gpa = "N/A"
132
  for col in ['GPA', 'Grade Point Average', 'Cumulative GPA']:
133
  if col in df.columns:
 
152
  "courses": courses
153
  }
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  # ========== LEARNING STYLE QUIZ ==========
156
  learning_style_questions = [
157
  "When you study for a test, you prefer to:",
 
269
 
270
  return result
271
 
272
+ # ========== SAVE STUDENT PROFILE ==========
273
+ def save_profile(name, age, interests, transcript, learning_style,
274
+ movie, movie_reason, show, show_reason,
275
+ book, book_reason, character, character_reason, blog):
276
  # Convert age to int if it's a numpy number (from gradio Number input)
277
  age = int(age) if age else 0
278
 
 
319
  return markdown_summary
320
 
321
def _format_course_line(course, with_term):
    """Render one course entry as a single Markdown bullet line."""
    if not isinstance(course, dict):
        return f"- {str(course)}\n"
    # Fall back to the older 'course' key so previously stored records still
    # show their name.
    line = f"- {course.get('name', course.get('course', 'N/A'))}"
    # Parsers may store grade=None when no grade was found; skip it rather
    # than printing "(Grade: None)".
    if course.get('grade'):
        line += f" (Grade: {course['grade']})"
    if with_term and 'term' in course:
        line += f" | Term: {course['term']}"
    return line + "\n"


def transcript_display(transcript_dict):
    """Format parsed transcript data as a Markdown summary.

    Args:
        transcript_dict: Mapping with a ``"courses"`` entry and an optional
            ``"gpa"`` entry. ``"courses"`` may be a dict of grade level to a
            list of courses, or a flat list of courses. Each course is either
            a dict (``name``, optional ``grade`` and ``term``) or any other
            value, which is shown via ``str()``. ``"gpa"`` may be a dict with
            ``weighted``/``unweighted`` keys or a single value.

    Returns:
        The Markdown text, or ``"No course information available"`` when the
        input is empty or has no ``"courses"`` key.
    """
    if not transcript_dict or "courses" not in transcript_dict:
        return "No course information available"

    parts = ["### Course History\n\n"]
    courses_by_grade = transcript_dict["courses"]

    if isinstance(courses_by_grade, dict):
        for grade, courses in courses_by_grade.items():
            parts.append(f"**{grade}**\n")
            parts.extend(_format_course_line(course, True) for course in courses)
            parts.append("\n")
    elif isinstance(courses_by_grade, list):
        # The flat-list form carries no per-term grouping, so no term suffix.
        parts.extend(_format_course_line(course, False) for course in courses_by_grade)

    if 'gpa' in transcript_dict:
        gpa = transcript_dict['gpa']
        parts.append("\n**GPA Information**\n")
        if isinstance(gpa, dict):
            parts.append(f"- Unweighted: {gpa.get('unweighted', 'N/A')}\n")
            parts.append(f"- Weighted: {gpa.get('weighted', 'N/A')}\n")
        else:
            # A non-dict GPA (a single value) has no .get(); print it directly.
            parts.append(f"- GPA: {gpa if gpa is not None else 'N/A'}\n")

    return "".join(parts)
359
 
360
  # ========== AI TEACHING ASSISTANT ==========
361
  def load_profile():
 
446
  # ========== GRADIO INTERFACE ==========
447
  with gr.Blocks() as app:
448
  with gr.Tab("Step 1: Upload Transcript"):
449
+ gr.Markdown("### Upload your transcript (PDF recommended for best results)")
450
+ transcript_file = gr.File(label="Transcript file", file_types=[".pdf", ".csv", ".xlsx"])
451
+ transcript_output = gr.Textbox(label="Parsing Results")
452
  transcript_data = gr.State()
453
+ transcript_file.change(
454
+ fn=parse_transcript,
455
+ inputs=transcript_file,
456
+ outputs=[transcript_output, transcript_data]
457
+ )
458
 
459
  with gr.Tab("Step 2: Learning Style Quiz"):
460
  gr.Markdown("### Learning Style Quiz (20 Questions)")
461
  quiz_components = []
462
  for i, (question, options) in enumerate(zip(learning_style_questions, learning_style_options)):
463
+ quiz_components.append(gr.Radio(options, label=f"{i+1}. {question}"))
 
 
464
 
465
+ learning_output = gr.Textbox(label="Your Learning Style", lines=10)
466
  gr.Button("Submit Quiz").click(
467
+ fn=learning_style_quiz,
468
  inputs=quiz_components,
469
  outputs=learning_output
470
  )
 
488
  with gr.Tab("Step 4: Save & Review"):
489
  output_summary = gr.Markdown()
490
  save_btn = gr.Button("Save Profile")
 
491
  save_btn.click(
492
  fn=save_profile,
493
  inputs=[name, age, interests, transcript_data, learning_output,
 
496
  outputs=output_summary
497
  )
498
 
 
499
  with gr.Tab("🤖 AI Teaching Assistant"):
500
  gr.Markdown("## Your Personalized Learning Assistant")
501
  chatbot = gr.ChatInterface(
 
509
  )
510
 
511
  if __name__ == "__main__":
512
+ app.launch()
513