Spaces:

Dannyar608
/

Final_project

Running

App Files Community

Dannyar608 committed on 5 days ago

Commit

9b7ad24

verified ·

1 Parent(s): bef81e2

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -143

app.py CHANGED Viewed

@@ -15,15 +15,18 @@ import io
 import secrets
 import string
 from huggingface_hub import HfApi, HfFolder
 # ========== CONFIGURATION ==========
 PROFILES_DIR = "student_profiles"
-ALLOWED_FILE_TYPES = [".pdf", ".png", ".jpg", ".jpeg"]  # Added image support
 MAX_FILE_SIZE_MB = 5
 MIN_AGE = 5
 MAX_AGE = 120
 SESSION_TOKEN_LENGTH = 32
 HF_TOKEN = os.getenv("HF_TOKEN")
 # Initialize Hugging Face API
 if HF_TOKEN:
@@ -83,129 +86,81 @@ def extract_text_with_ocr(file_path: str) -> str:
     except Exception as e:
         raise gr.Error(f"OCR processing failed: {str(e)}")
-# ========== TRANSCRIPT PARSING ==========
-def extract_gpa(text: str, gpa_type: str) -> str:
-    """More robust GPA extraction with multiple patterns."""
-    gpa_patterns = [
-        rf'{gpa_type}\s*GPA\s*[:=]?\s*([0-5]\.\d{{2}}|\d\.\d)',  # Weighted GPA: 3.50
-        rf'{gpa_type}\s*GPA\s+([0-5]\.\d{{2}}|\d\.\d)',          # Weighted GPA 3.50
-        rf'{gpa_type}\s*[:=]?\s*([0-5]\.\d{{2}}|\d\.\d)',        # Weighted: 3.50
-        rf'GPA\s*\({gpa_type}\)\s*[:=]?\s*([0-5]\.\d{{2}}|\d\.\d)', # GPA (Weighted): 3.50
-        rf'{gpa_type}\s*[=:]?\s*([0-5]\.\d{{2}}|\d\.\d)',        # Weighted=3.50
-        rf'{gpa_type}\s*[=:]?\s*(\d\.\d{{2}})'                   # Weighted:3.50
-    ]
-    for pattern in gpa_patterns:
-        match = re.search(pattern, text, re.IGNORECASE)
-        if match:
-            gpa_value = match.group(1)
-            try:
-                gpa_float = float(gpa_value)
-                if not 0.0 <= gpa_float <= 5.0:
-                    return "Invalid GPA"
-                return f"{gpa_float:.2f}"
-            except ValueError:
-                continue
-    # Fallback to looking for any GPA-like number near the term
-    fallback_pattern = re.compile(rf'(?:{gpa_type}.*?)([0-5]\.\d{{1,2}})(?!\d)')
-    match = re.search(fallback_pattern, text, re.IGNORECASE)
-    if match:
-        return match.group(1)
-    return "N/A"
-def extract_courses_from_table(text: str) -> Dict[str, List[Dict]]:
-    """Enhanced course extraction with better pattern matching."""
-    # Normalize text for better matching
-    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces
-    text = text.replace('\n', ' ')     # Replace newlines
-    # More robust patterns
-    patterns = [
-        # Pattern for standard table format
-        re.compile(
-            r'(?:Year|Term|Semester)[\s:]*(.*?)\s*'  # Year/Semester
-            r'(?:Grade|Level)[\s:]*(.*?)\s*'         # Grade level
-            r'(?:Course\s*Code|Code)[\s:]*(.*?)\s*'  # Course code
-            r'(?:Course\s*Name|Title)[\s:]*(.*?)\s*' # Course name
-            r'(?:Grade|Mark)[\s:]*(.*?)\s*'          # Grade
-            r'(?:Credits|Units)[\s:]*(.*?)(?:\s|$)'  # Credits
-        ),
-        # Pattern for condensed format
-        re.compile(
-            r'(\d{4}-\d{4}|\w+\s\d{4})\s+'  # Year range or Semester Year
-            r'(\d+)\s+'                     # Grade level
-            r'([A-Z]+\s*\d+[A-Z]*)\s+'      # Course code
-            r'(.+?)\s+'                      # Course name
-            r'([A-F][+-]?|P|F|W|I)\s+'      # Grade
-            r'(\d+\.?\d*)'                   # Credits
-        ),
-        # Fallback pattern for less structured data
-        re.compile(
-            r'([A-Z]+\s*\d+[A-Z]*)\s+'      # Course code
-            r'(.+?)\s+'                      # Course name
-            r'(?:Grade\s*:\s*)?([A-F][+-]?|P|F|W|I)\s*'  # Grade
-            r'(?:Credits\s*:\s*)?(\d+\.?\d*)'            # Credits
-        )
-    ]
-    courses_by_grade = defaultdict(list)
-    extracted_courses = set()  # To avoid duplicates
-    for pattern in patterns:
-        for match in re.finditer(pattern, text):
-            if len(match.groups()) == 6:
-                year, grade, code, name, grade_mark, credits = match.groups()
-            else:
-                # Handle shorter patterns
-                code, name, grade_mark, credits = match.groups()[:4]
-                year = "Unknown"
-                grade = "Unknown"
-            # Create unique identifier to avoid duplicates
-            course_id = f"{code}_{name}_{year}"
-            if course_id in extracted_courses:
-                continue
-            extracted_courses.add(course_id)
-            # Clean and format data
-            code = code.strip()
-            name = name.strip()
-            if 'AP' in code and 'AP ' not in code:
-                code = code.replace('AP', 'AP ')
-            if 'DE' in code and 'DE ' not in code:
-                code = code.replace('DE', 'DE ')
-            course_info = {
-                'code': code,
-                'name': name,
-                'grade': grade_mark.strip() if grade_mark else None,
-                'credits': credits if credits else '0',
-                'year': year.strip() if year else 'Unknown'
-            }
-            courses_by_grade[grade.strip() if grade else 'Unknown'].append(course_info)
-    # If no courses found with patterns, try a more aggressive approach
-    if not courses_by_grade:
-        # Look for anything that looks like a course code followed by description
-        fallback_pattern = re.compile(r'([A-Z]+\s*\d+[A-Z]*)\s+(.+?)(?:\s+([A-F][+-]?|P|F|W|I))?(?:\s+(\d+\.?\d*))?')
-        for match in re.finditer(fallback_pattern, text):
-            code, name, grade_mark, credits = match.groups()
-            course_info = {
-                'code': code.strip(),
-                'name': name.strip(),
-                'grade': grade_mark.strip() if grade_mark else None,
-                'credits': credits if credits else '0',
-                'year': 'Unknown'
-            }
-            courses_by_grade['Unknown'].append(course_info)
-    return courses_by_grade
 def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
-    """Parse transcript file with robust error handling and OCR support."""
     try:
         if not file_obj:
             raise gr.Error("Please upload a file first")
@@ -237,46 +192,45 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
         if not text.strip():
             raise gr.Error("No text could be extracted from the file")
-        # Enhanced GPA extraction
-        gpa_data = {
-            'weighted': extract_gpa(text, 'Weighted'),
-            'unweighted': extract_gpa(text, 'Unweighted')
-        }
-        # Extract grade level with multiple fallback patterns
-        grade_match = (
-            re.search(r'Current Grade:\s*(\d+)', text) or
-            re.search(r'Grade\s*:\s*(\d+)', text) or
-            re.search(r'Grade\s+(\d+)', text) or
-            re.search(r'Grade\s+Level:\s*(\d+)', text) or
-            re.search(r'Grade\s*\(?\s*(\d+)\s*\)?', text)
-        )
-        grade_level = grade_match.group(1) if grade_match else "Unknown"
-        courses_by_grade = extract_courses_from_table(text)
         # Format output text
         output_text = f"Student Transcript Summary\n{'='*40}\n"
-        output_text += f"Current Grade Level: {grade_level}\n"
-        output_text += f"Weighted GPA: {gpa_data['weighted']}\n"
-        output_text += f"Unweighted GPA: {gpa_data['unweighted']}\n\n"
         output_text += "Course History:\n{'='*40}\n"
         for grade in sorted(courses_by_grade.keys(), key=lambda x: int(x) if x.isdigit() else x):
             output_text += f"\nGrade {grade}:\n{'-'*30}\n"
             for course in courses_by_grade[grade]:
-                output_text += f"- {course['code']} {course['name']}"
                 if 'grade' in course and course['grade']:
                     output_text += f" (Grade: {course['grade']})"
                 if 'credits' in course:
                     output_text += f" | Credits: {course['credits']}"
-                output_text += f" | Year: {course['year']}\n"
-        return output_text, {
-            "gpa": gpa_data,
-            "grade_level": grade_level,
             "courses": dict(courses_by_grade)
         }
     except Exception as e:
         return f"Error processing transcript: {str(e)}", None
@@ -1359,4 +1313,4 @@ app = create_interface()
 # For Hugging Face Spaces deployment
 if __name__ == "__main__":
     app.launch()

 import secrets
 import string
 from huggingface_hub import HfApi, HfFolder
+import requests  # For API calls to DeepSeek
 # ========== CONFIGURATION ==========
 PROFILES_DIR = "student_profiles"
+ALLOWED_FILE_TYPES = [".pdf", ".png", ".jpg", ".jpeg"]
 MAX_FILE_SIZE_MB = 5
 MIN_AGE = 5
 MAX_AGE = 120
 SESSION_TOKEN_LENGTH = 32
 HF_TOKEN = os.getenv("HF_TOKEN")
+DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")  # Read from the DEEPSEEK_API_KEY environment variable; never hard-code the key in this file
+DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"  # Chat-completions endpoint the transcript parser POSTs to
 # Initialize Hugging Face API
 if HF_TOKEN:
     except Exception as e:
         raise gr.Error(f"OCR processing failed: {str(e)}")
+# ========== ENHANCED TRANSCRIPT PARSING WITH DEEPSEEK ==========
+def parse_transcript_with_deepseek(text: str, timeout: float = 60.0) -> Dict:
+    """Ask the DeepSeek chat-completions API to turn transcript text into structured data.
+    Args:
+        text: Transcript text to analyze; it is embedded verbatim in the prompt.
+        timeout: Seconds to wait for the HTTP request before giving up.
+    Returns:
+        The JSON object decoded from the model's reply. The prompt asks for the keys
+        "grade_level", "gpa" and "courses", but only "is a JSON object" is enforced here.
+    Raises:
+        gr.Error: If no API key is configured, the request fails or times out, or the
+            reply cannot be decoded into a JSON object.
+    """
+    if not DEEPSEEK_API_KEY:
+        raise gr.Error("DeepSeek API key not configured")
+    # The example below must itself be valid JSON (no comments, no trailing commas):
+    # the model is told to copy the structure exactly and the reply goes to json.loads.
+    prompt = f"""
+    Analyze this academic transcript and extract the following information in JSON format:
+    - Current grade level
+    - Weighted GPA
+    - Unweighted GPA
+    - List of all courses with:
+      * Course code
+      * Course name
+      * Grade received
+      * Credits earned
+      * Year/semester taken
+      * Grade level when taken
+    Return the data in this exact JSON structure:
+    {{
+        "grade_level": "11",
+        "gpa": {{
+            "weighted": "4.2",
+            "unweighted": "3.9"
+        }},
+        "courses": [
+            {{
+                "code": "MATH101",
+                "name": "Algebra II",
+                "grade": "A",
+                "credits": "1.0",
+                "year": "2023-2024",
+                "grade_level": "11"
+            }}
+        ]
+    }}
+    Add one object to "courses" for every course on the transcript. Respond with valid JSON only (no comments).
+    Here is the transcript text to analyze:
+    {text}
+    """
+    headers = {
+        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
+        "Content-Type": "application/json"
+    }
+    payload = {
+        "model": "deepseek-chat",
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": 0.1,
+        "max_tokens": 2000
+    }
+    try:
+        # requests waits indefinitely unless a timeout is given; bound it so the UI cannot hang.
+        response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload, timeout=timeout)
+        response.raise_for_status()
+        content = response.json()['choices'][0]['message']['content']
+        # The model sometimes wraps its answer in a markdown code fence; keep only the fenced body.
+        if '```json' in content:
+            content = content.split('```json', 1)[1]
+        elif '```' in content:
+            content = content.split('```', 1)[1]
+        content = content.split('```', 1)[0].strip()
+        parsed = json.loads(content)
+    except requests.RequestException as e:
+        # Connection failures, timeouts and non-2xx statuses.
+        raise gr.Error(f"DeepSeek API error: {str(e)}") from e
+    except (KeyError, IndexError, TypeError, ValueError) as e:
+        # Missing 'choices'/'message' fields, non-string content, or undecodable JSON.
+        raise gr.Error(f"DeepSeek API error: unexpected response format ({e})") from e
+    if not isinstance(parsed, dict):
+        # Callers use .get() on the result, so anything but an object is unusable.
+        raise gr.Error("DeepSeek API error: expected a JSON object in the response")
+    return parsed
 def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
+    """Parse transcript file with DeepSeek enhanced parsing."""
     try:
         if not file_obj:
             raise gr.Error("Please upload a file first")
         if not text.strip():
             raise gr.Error("No text could be extracted from the file")
+        # Use DeepSeek for enhanced parsing
+        parsed_data = parse_transcript_with_deepseek(text)
         # Format output text
         output_text = f"Student Transcript Summary\n{'='*40}\n"
+        output_text += f"Current Grade Level: {parsed_data.get('grade_level', 'Unknown')}\n"
+        if 'gpa' in parsed_data:
+            output_text += f"Weighted GPA: {parsed_data['gpa'].get('weighted', 'N/A')}\n"
+            output_text += f"Unweighted GPA: {parsed_data['gpa'].get('unweighted', 'N/A')}\n\n"
         output_text += "Course History:\n{'='*40}\n"
+        # Organize courses by grade level
+        courses_by_grade = defaultdict(list)
+        for course in parsed_data.get('courses', []):
+            grade_level = course.get('grade_level', 'Unknown')
+            courses_by_grade[grade_level].append(course)
         for grade in sorted(courses_by_grade.keys(), key=lambda x: int(x) if x.isdigit() else x):
             output_text += f"\nGrade {grade}:\n{'-'*30}\n"
             for course in courses_by_grade[grade]:
+                output_text += f"- {course.get('code', '')} {course.get('name', 'Unnamed course')}"
                 if 'grade' in course and course['grade']:
                     output_text += f" (Grade: {course['grade']})"
                 if 'credits' in course:
                     output_text += f" | Credits: {course['credits']}"
+                if 'year' in course:
+                    output_text += f" | Year: {course['year']}"
+                output_text += "\n"
+        # Prepare the data structure for saving
+        transcript_data = {
+            "grade_level": parsed_data.get('grade_level', 'Unknown'),
+            "gpa": parsed_data.get('gpa', {}),
             "courses": dict(courses_by_grade)
         }
+        return output_text, transcript_data
     except Exception as e:
         return f"Error processing transcript: {str(e)}", None
 # For Hugging Face Spaces deployment
 if __name__ == "__main__":
     app.launch()