Spaces:

Dannyar608
/

Final_project

Runtime error

App Files Files Community

Dannyar608 committed on Apr 29

Commit

9abe9f0

verified ·

1 Parent(s): 40dbed7

Update app.py

Browse files

Files changed (1) hide show

app.py +161 -146

app.py CHANGED Viewed

@@ -5,113 +5,129 @@ import os
 import re
 from PyPDF2 import PdfReader
 from collections import defaultdict
-# ========== TRANSCRIPT PARSING FUNCTIONS ==========
-def extract_courses_with_grade_levels(text):
-    # First extract the current grade level
-    grade_level_pattern = r"(Grade|Year)\s*[:]?\s*(\d+|Freshman|Sophomore|Junior|Senior)"
-    grade_match = re.search(grade_level_pattern, text, re.IGNORECASE)
-    current_grade_level = grade_match.group(2) if grade_match else "Unknown"
-    # Improved course pattern to better match course codes and names
-    course_pattern = r"""
-        (?:^|\n)
-        (?: (Grade|Year)\s*[:]?\s*(\d+|Freshman|Sophomore|Junior|Senior)\s*[\n-]* )?  # Optional grade level context
-        (
-            (?:[A-Z]{2,}\s?\d{3}[A-Z]?\b)  # Course codes like MATH101 or CS 201A
-            |
-            (?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)  # Course names like "Calculus I" or "World History"
-        )
-        \s*
-        (?: [:\-]?\s* ([A-F][+-]?|\d{2,3}%)? )?  # Optional grade
-        \s*
-        (?: [:\-]?\s* (\d\.\d{1,2})? )?  # Optional credit hours
-    """
-    courses_by_grade = defaultdict(list)
-    current_grade = current_grade_level
-    for match in re.finditer(course_pattern, text, re.VERBOSE | re.MULTILINE):
-        grade_context, grade_level, course, grade, credits = match.groups()
-        if grade_context:
-            current_grade = grade_level
-        if course:
-            course_info = {
-                "course": course.strip(),
-                "grade_level": current_grade
-            }
-            if grade:
-                course_info["grade"] = grade.strip()
-            if credits:
-                course_info["credits"] = credits.strip()
-            courses_by_grade[current_grade].append(course_info)
-    return dict(courses_by_grade)
 def parse_transcript(file):
-    if file.name.endswith('.csv'):
-        df = pd.read_csv(file)
-    elif file.name.endswith('.xlsx'):
-        df = pd.read_excel(file)
-    elif file.name.endswith('.pdf'):
         text = ''
         reader = PdfReader(file)
         for page in reader.pages:
-            page_text = page.extract_text()
-            if page_text:
-                text += page_text + '\n'
-        # Grade level extraction
-        grade_match = re.search(r'(Grade|Year)[\s:]*(\d+|Freshman|Sophomore|Junior|Senior)', text, re.IGNORECASE)
-        grade_level = grade_match.group(2) if grade_match else "Unknown"
-        # Enhanced GPA extraction
-        gpa_data = {'weighted': "N/A", 'unweighted': "N/A"}
-        gpa_patterns = [
-            r'Weighted GPA[\s:]*(\d\.\d{1,2})',
-            r'GPA \(Weighted\)[\s:]*(\d\.\d{1,2})',
-            r'Cumulative GPA \(Weighted\)[\s:]*(\d\.\d{1,2})',
-            r'Unweighted GPA[\s:]*(\d\.\d{1,2})',
-            r'GPA \(Unweighted\)[\s:]*(\d\.\d{1,2})',
-            r'Cumulative GPA \(Unweighted\)[\s:]*(\d\.\d{1,2})',
-            r'GPA[\s:]*(\d\.\d{1,2})'
-        ]
-        for pattern in gpa_patterns:
-            for match in re.finditer(pattern, text, re.IGNORECASE):
-                gpa_value = match.group(1)
-                if 'weighted' in pattern.lower():
-                    gpa_data['weighted'] = gpa_value
-                elif 'unweighted' in pattern.lower():
-                    gpa_data['unweighted'] = gpa_value
-                else:
-                    if gpa_data['unweighted'] == "N/A":
-                        gpa_data['unweighted'] = gpa_value
-                    if gpa_data['weighted'] == "N/A":
-                        gpa_data['weighted'] = gpa_value
-        courses_by_grade = extract_courses_with_grade_levels(text)
-        output_text = f"Grade Level: {grade_level}\n\n"
-        if gpa_data['weighted'] != "N/A" or gpa_data['unweighted'] != "N/A":
-            output_text += "GPA Information:\n"
-            if gpa_data['unweighted'] != "N/A":
-                output_text += f"- Unweighted GPA: {gpa_data['unweighted']}\n"
-            if gpa_data['weighted'] != "N/A":
-                output_text += f"- Weighted GPA: {gpa_data['weighted']}\n"
         else:
-            output_text += "No GPA information found\n"
         return output_text, {
             "gpa": gpa_data,
-            "grade_level": grade_level,
-            "courses": courses_by_grade
         }
     else:
         return "Unsupported file format", None
-    # For CSV/XLSX fallback
     gpa = "N/A"
     for col in ['GPA', 'Grade Point Average', 'Cumulative GPA']:
         if col in df.columns:
@@ -136,31 +152,6 @@ def parse_transcript(file):
         "courses": courses
     }
-# ... [keep all other functions the same until transcript_display] ...
-def transcript_display(transcript_dict):
-    if not transcript_dict:
-        return "No transcript uploaded."
-    if isinstance(transcript_dict, dict) and "courses" in transcript_dict:
-        if isinstance(transcript_dict["courses"], dict):
-            display = "### Course History\n\n"
-            for grade_level, courses in transcript_dict["courses"].items():
-                display += f"**Grade {grade_level}**\n"
-                for course in courses:
-                    display += f"- {course.get('course', 'N/A')}"
-                    if 'grade' in course:
-                        display += f" (Grade: {course['grade']})"
-                    if 'credits' in course:
-                        display += f" | Credits: {course['credits']}"
-                    display += "\n"
-                display += "\n"
-            return display
-        elif isinstance(transcript_dict["courses"], list):
-            return "### Courses\n" + "\n".join([f"- {course}" for course in transcript_dict["courses"]])
-    return "No course information available in the expected format."
 # ========== LEARNING STYLE QUIZ ==========
 learning_style_questions = [
     "When you study for a test, you prefer to:",
@@ -278,8 +269,10 @@ def learning_style_quiz(*answers):
     return result
-# ========== SAVE STUDENT PROFILE FUNCTION ==========
-def save_profile(name, age, interests, transcript, learning_style, movie, movie_reason, show, show_reason, book, book_reason, character, character_reason, blog):
     # Convert age to int if it's a numpy number (from gradio Number input)
     age = int(age) if age else 0
@@ -326,22 +319,43 @@ def save_profile(name, age, interests, transcript, learning_style, movie, movie_
     return markdown_summary
 def transcript_display(transcript_dict):
-    if not transcript_dict:
-        return "No transcript uploaded."
-    if isinstance(transcript_dict, dict) and "courses" in transcript_dict:
-        if isinstance(transcript_dict["courses"], dict):
-            display = ""
-            for grade_level, courses in transcript_dict["courses"].items():
-                display += f"\n**Grade {grade_level}**\n"
-                for course in courses:
-                    display += f"- {course['course']}"
                     if 'grade' in course:
                         display += f" (Grade: {course['grade']})"
                     display += "\n"
-            return display
-        elif isinstance(transcript_dict["courses"], list):
-            return "\n".join([f"- {course}" for course in transcript_dict["courses"]])
-    return "No course information available"
 # ========== AI TEACHING ASSISTANT ==========
 def load_profile():
@@ -432,22 +446,25 @@ def generate_response(message, history):
 # ========== GRADIO INTERFACE ==========
 with gr.Blocks() as app:
     with gr.Tab("Step 1: Upload Transcript"):
-        transcript_file = gr.File(label="Upload your transcript (CSV, Excel, or PDF)")
-        transcript_output = gr.Textbox(label="Transcript Output")
         transcript_data = gr.State()
-        transcript_file.change(fn=parse_transcript, inputs=transcript_file, outputs=[transcript_output, transcript_data])
     with gr.Tab("Step 2: Learning Style Quiz"):
         gr.Markdown("### Learning Style Quiz (20 Questions)")
         quiz_components = []
         for i, (question, options) in enumerate(zip(learning_style_questions, learning_style_options)):
-            quiz_components.append(
-                gr.Radio(options, label=f"{i+1}. {question}")
-            )
-        learning_output = gr.Textbox(label="Learning Style Result", lines=10)
         gr.Button("Submit Quiz").click(
-            learning_style_quiz,
             inputs=quiz_components,
             outputs=learning_output
         )
@@ -471,7 +488,6 @@ with gr.Blocks() as app:
     with gr.Tab("Step 4: Save & Review"):
         output_summary = gr.Markdown()
         save_btn = gr.Button("Save Profile")
         save_btn.click(
             fn=save_profile,
             inputs=[name, age, interests, transcript_data, learning_output,
@@ -480,7 +496,6 @@ with gr.Blocks() as app:
             outputs=output_summary
         )
-    # AI Teaching Assistant Tab
     with gr.Tab("🤖 AI Teaching Assistant"):
         gr.Markdown("## Your Personalized Learning Assistant")
         chatbot = gr.ChatInterface(
@@ -494,5 +509,5 @@ with gr.Blocks() as app:
         )
 if __name__ == "__main__":
-    app.launch()

 import re
 from PyPDF2 import PdfReader
 from collections import defaultdict
# Initialize the optional NER model.
# The import lives inside the ``try`` so that a missing ``transformers``
# package degrades to regex-only parsing (``ner_pipeline = None``) instead of
# aborting the whole app at import time.
try:
    from transformers import pipeline

    ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")
except Exception as e:  # ImportError, or any model download/load failure
    print(f"Could not load NER model: {e}")
    ner_pipeline = None
+# ========== IMPROVED TRANSCRIPT PARSING ==========
def extract_gpa(text):
    """Extract weighted and unweighted GPA values from transcript text.

    Args:
        text: Plain text of the transcript.

    Returns:
        A dict with the keys ``'weighted'`` and ``'unweighted'``. Each value
        is the matched GPA as a string (e.g. ``"3.85"``) or ``"N/A"`` when
        nothing was found. When several labelled values of the same kind
        occur, the last one wins. An unlabelled ``GPA: x.xx`` fills whichever
        slots are still ``"N/A"``.
    """
    gpa_data = {'weighted': "N/A", 'unweighted': "N/A"}

    # Each pattern carries its kind explicitly. Testing the pattern text for
    # the substring "weighted" cannot work because "unweighted" contains it.
    labelled_patterns = (
        # \b keeps "Weighted GPA" from matching inside "Unweighted GPA".
        ('weighted', r'\bWeighted GPA[\s:]*(\d\.\d{1,2})'),
        ('weighted', r'GPA \(Weighted\)[\s:]*(\d\.\d{1,2})'),
        ('weighted', r'Cumulative GPA \(Weighted\)[\s:]*(\d\.\d{1,2})'),
        ('unweighted', r'Unweighted GPA[\s:]*(\d\.\d{1,2})'),
        ('unweighted', r'GPA \(Unweighted\)[\s:]*(\d\.\d{1,2})'),
        ('unweighted', r'Cumulative GPA \(Unweighted\)[\s:]*(\d\.\d{1,2})'),
    )
    generic_pattern = r'GPA[\s:]*(\d\.\d{1,2})'

    labelled_spans = []
    for kind, pattern in labelled_patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            gpa_data[kind] = match.group(1)
            labelled_spans.append(match.span())

    for match in re.finditer(generic_pattern, text, re.IGNORECASE):
        # Skip the "GPA: x.xx" tail of a labelled match, otherwise a weighted
        # value would also be reported as the unweighted one (and vice versa).
        inside_labelled = any(
            start <= match.start() and match.end() <= end
            for start, end in labelled_spans
        )
        if inside_labelled:
            continue
        for kind in ('unweighted', 'weighted'):
            if gpa_data[kind] == "N/A":
                gpa_data[kind] = match.group(1)

    return gpa_data
def extract_courses_with_regex(text):
    """Find course entries that start a line of transcript text.

    Two line shapes are recognised:

    * a course code such as ``MATH101``, ``CS 201A`` or ``ENG-305``;
    * a multi-word title such as ``World History``.

    Each may be followed on the same line by a letter grade (``A``-``F`` with
    an optional ``+``/``-``) or a percentage (``95%``).

    Args:
        text: Plain text of the transcript (or of one grade-level section).

    Returns:
        A list of ``{'name': str, 'grade': str | None}`` dicts: all
        course-code matches first, then all title matches, each in order of
        appearance.
    """
    # A grade must be a standalone token, so the leading letter of a following
    # word ("MATH101 Art") is not mistaken for a grade.
    grade = r'([A-F][+-]?(?![\w+-])|\d{2,3}%)'
    # Only horizontal whitespace is allowed inside a match, so a grade or
    # title is never pulled in from the next line.
    patterns = [
        r'^([A-Z]{2,}[ \t]*-?[ \t]*\d{3}[A-Z]?\b)[ \t]*' + grade + r'?',
        r'^([A-Z][a-z]+(?:[ \t]+[A-Z]?[a-z]+)+)[ \t]*[:\-]?[ \t]*' + grade + r'?',
    ]
    # NOTE: an earlier third pattern with a single capture group was dropped.
    # Reading group(2) from it raised IndexError on any course-code line, and
    # everything it matched is already covered by the first pattern.

    courses = []
    for pattern in patterns:
        for match in re.finditer(pattern, text, re.MULTILINE):
            name, found_grade = match.groups()
            courses.append({'name': name.strip(), 'grade': found_grade})
    return courses
def extract_grade_levels(text):
    """Split transcript text into sections, one per grade-level header.

    A header is the word ``Grade``, ``Year`` or ``Term`` (any case), an
    optional colon, and either a number or one of ``Freshman``,
    ``Sophomore``, ``Junior``, ``Senior``. Each section runs from its header
    to the start of the next header, or to the end of the text.

    Args:
        text: Plain text of the transcript.

    Returns:
        A list of ``{'grade': str, 'text': str}`` dicts in order of
        appearance. The list is empty when no header is found.
    """
    # The leading \b stops the keyword from matching inside a longer word,
    # e.g. "Midterm 2" or "Midyear 1" being read as a Term/Year header.
    grade_pattern = (
        r'\b(?:Grade|Year|Term)\s*[:]?\s*'
        r'(\d+|Freshman|Sophomore|Junior|Senior)\b'
    )
    headers = list(re.finditer(grade_pattern, text, re.IGNORECASE))
    starts = [header.start() for header in headers]
    # Every section ends where the next begins; the last ends with the text.
    ends = starts[1:] + [len(text)]
    return [
        {'grade': header.group(1), 'text': text[start:end]}
        for header, start, end in zip(headers, starts, ends)
    ]
 def parse_transcript(file):
+    if file.name.endswith('.pdf'):
         text = ''
         reader = PdfReader(file)
         for page in reader.pages:
+            text += page.extract_text() + '\n'
+        # Try both NER and regex approaches
+        courses = []
+        if ner_pipeline:
+            try:
+                entities = ner_pipeline(text)
+                current_course = {}
+                for entity in entities:
+                    if entity['word'].startswith('##'):
+                        current_course['name'] = current_course.get('name', '') + entity['word'][2:]
+                    elif entity['entity'] in ['B-ORG', 'I-ORG']:  # Using ORG as proxy for courses
+                        if 'name' in current_course:
+                            courses.append(current_course)
+                        current_course = {'name': entity['word']}
+                    elif entity['entity'] == 'GRADE' and current_course:
+                        current_course['grade'] = entity['word']
+                if current_course:
+                    courses.append(current_course)
+            except Exception as e:
+                print(f"NER failed: {e}")
+        # Fallback to regex if NER didn't find courses
+        if not courses:
+            courses = extract_courses_with_regex(text)
+        # Organize by grade level
+        grade_sections = extract_grade_levels(text)
+        courses_by_grade = defaultdict(list)
+        if grade_sections:
+            for section in grade_sections:
+                section_courses = extract_courses_with_regex(section['text'])
+                for course in section_courses:
+                    course['term'] = section['grade']
+                    courses_by_grade[section['grade']].append(course)
         else:
+            courses_by_grade["All"] = courses
+        gpa_data = extract_gpa(text)
+        output_text = "Transcript parsed successfully\n"
+        output_text += f"Found {len(courses)} courses across {len(courses_by_grade)} grade levels\n"
         return output_text, {
             "gpa": gpa_data,
+            "courses": dict(courses_by_grade)
         }
+    elif file.name.endswith('.csv'):
+        df = pd.read_csv(file)
+    elif file.name.endswith('.xlsx'):
+        df = pd.read_excel(file)
     else:
         return "Unsupported file format", None
+    # Fallback for CSV/Excel
     gpa = "N/A"
     for col in ['GPA', 'Grade Point Average', 'Cumulative GPA']:
         if col in df.columns:
         "courses": courses
     }
 # ========== LEARNING STYLE QUIZ ==========
 learning_style_questions = [
     "When you study for a test, you prefer to:",
     return result
+# ========== SAVE STUDENT PROFILE ==========
+def save_profile(name, age, interests, transcript, learning_style,
+                movie, movie_reason, show, show_reason,
+                book, book_reason, character, character_reason, blog):
     # Convert age to int if it's a numpy number (from gradio Number input)
     age = int(age) if age else 0
     return markdown_summary
 def transcript_display(transcript_dict):
+    if not transcript_dict or "courses" not in transcript_dict:
+        return "No course information available"
+    display = "### Course History\n\n"
+    courses_by_grade = transcript_dict["courses"]
+    if isinstance(courses_by_grade, dict):
+        for grade, courses in courses_by_grade.items():
+            display += f"**{grade}**\n"
+            for course in courses:
+                if isinstance(course, dict):
+                    display += f"- {course.get('name', 'N/A')}"
                     if 'grade' in course:
                         display += f" (Grade: {course['grade']})"
+                    if 'term' in course:
+                        display += f" | Term: {course['term']}"
                     display += "\n"
+                else:
+                    display += f"- {str(course)}\n"
+            display += "\n"
+    elif isinstance(courses_by_grade, list):
+        for course in courses_by_grade:
+            if isinstance(course, dict):
+                display += f"- {course.get('name', 'N/A')}"
+                if 'grade' in course:
+                    display += f" (Grade: {course['grade']})"
+                display += "\n"
+            else:
+                display += f"- {str(course)}\n"
+    if 'gpa' in transcript_dict:
+        gpa = transcript_dict['gpa']
+        display += "\n**GPA Information**\n"
+        display += f"- Unweighted: {gpa.get('unweighted', 'N/A')}\n"
+        display += f"- Weighted: {gpa.get('weighted', 'N/A')}\n"
+    return display
 # ========== AI TEACHING ASSISTANT ==========
 def load_profile():
 # ========== GRADIO INTERFACE ==========
 with gr.Blocks() as app:
     with gr.Tab("Step 1: Upload Transcript"):
+        gr.Markdown("### Upload your transcript (PDF recommended for best results)")
+        transcript_file = gr.File(label="Transcript file", file_types=[".pdf", ".csv", ".xlsx"])
+        transcript_output = gr.Textbox(label="Parsing Results")
         transcript_data = gr.State()
+        transcript_file.change(
+            fn=parse_transcript,
+            inputs=transcript_file,
+            outputs=[transcript_output, transcript_data]
+        )
     with gr.Tab("Step 2: Learning Style Quiz"):
         gr.Markdown("### Learning Style Quiz (20 Questions)")
         quiz_components = []
         for i, (question, options) in enumerate(zip(learning_style_questions, learning_style_options)):
+            quiz_components.append(gr.Radio(options, label=f"{i+1}. {question}"))
+        learning_output = gr.Textbox(label="Your Learning Style", lines=10)
         gr.Button("Submit Quiz").click(
+            fn=learning_style_quiz,
             inputs=quiz_components,
             outputs=learning_output
         )
     with gr.Tab("Step 4: Save & Review"):
         output_summary = gr.Markdown()
         save_btn = gr.Button("Save Profile")
         save_btn.click(
             fn=save_profile,
             inputs=[name, age, interests, transcript_data, learning_output,
             outputs=output_summary
         )
     with gr.Tab("🤖 AI Teaching Assistant"):
         gr.Markdown("## Your Personalized Learning Assistant")
         chatbot = gr.ChatInterface(
         )
 if __name__ == "__main__":
+    app.launch()