Dannyar608 committed on
Commit e881a6a · verified · 1 Parent(s): fcf1816

Update app.py

Files changed (1)
  1. app.py +143 -85
app.py CHANGED
@@ -15,7 +15,9 @@ import io
 import secrets
 import string
 from huggingface_hub import HfApi, HfFolder
-import requests # For API calls to DeepSeek
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import time
 
 # ========== CONFIGURATION ==========
 PROFILES_DIR = "student_profiles"
@@ -25,14 +27,45 @@ MIN_AGE = 5
 MAX_AGE = 120
 SESSION_TOKEN_LENGTH = 32
 HF_TOKEN = os.getenv("HF_TOKEN")
-DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY") # Add your DeepSeek API key here
-DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions" # Example endpoint
 
 # Initialize Hugging Face API
 if HF_TOKEN:
     hf_api = HfApi(token=HF_TOKEN)
     HfFolder.save_token(HF_TOKEN)
 
+# ========== DEEPSEEK MODEL LOADING ==========
+def load_deepseek_model():
+    """Load the DeepSeek model with progress tracking"""
+    progress = gr.Progress()
+    progress(0, desc="Loading DeepSeek model...")
+
+    try:
+        start_time = time.time()
+        tokenizer = AutoTokenizer.from_pretrained(
+            "deepseek-ai/DeepSeek-V3",
+            trust_remote_code=True
+        )
+        progress(0.3, desc="Loading tokenizer...")
+
+        model = AutoModelForCausalLM.from_pretrained(
+            "deepseek-ai/DeepSeek-V3",
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+            device_map="auto"
+        )
+        progress(0.9, desc="Loading model weights...")
+
+        load_time = time.time() - start_time
+        print(f"DeepSeek model loaded in {load_time:.2f} seconds")
+        return model, tokenizer
+
+    except Exception as e:
+        print(f"Error loading DeepSeek model: {str(e)}")
+        return None, None
+
+# Load model at startup
+model, tokenizer = load_deepseek_model()
+
 # ========== UTILITY FUNCTIONS ==========
 def generate_session_token() -> str:
     """Generate a random session token for user identification."""
@@ -77,7 +110,7 @@ def validate_file(file_obj) -> None:
     if file_size > MAX_FILE_SIZE_MB:
         raise gr.Error(f"File too large. Max size: {MAX_FILE_SIZE_MB}MB")
 
-# ========== ENHANCED TRANSCRIPT PARSING ==========
+# ========== TEXT EXTRACTION FUNCTIONS ==========
 def extract_text_from_file(file_path: str, file_ext: str) -> str:
     """Enhanced text extraction with better error handling and fallbacks."""
     text = ""
@@ -169,60 +202,29 @@ def remove_sensitive_info(text: str) -> str:
     text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
     return text
 
-def extract_json_from_response(content: str) -> str:
-    """Extract JSON string from API response."""
-    # Handle markdown code blocks
-    if '```json' in content:
-        content = content.split('```json')[1].split('```')[0].strip()
-    elif '```' in content:
-        content = content.split('```')[1].split('```')[0].strip()
-
-    # Sometimes the response is pure JSON
-    return content
-
-def validate_parsed_data(data: Dict) -> Dict:
-    """Validate and clean the parsed data structure."""
-    # Ensure required fields exist
-    if not isinstance(data, dict):
-        raise ValueError("Invalid data format")
-
-    # Set default structure if missing
-    if 'grade_level' not in data:
-        data['grade_level'] = 'Unknown'
-
-    if 'gpa' not in data:
-        data['gpa'] = {'weighted': 'N/A', 'unweighted': 'N/A'}
-
-    if 'courses' not in data:
-        data['courses'] = []
-
-    # Clean course data
-    for course in data['courses']:
-        if 'grade' in course:
-            course['grade'] = course['grade'].upper().strip()
-
-        # Ensure numeric credits are strings
-        if 'credits' in course and isinstance(course['credits'], (int, float)):
-            course['credits'] = str(course['credits'])
-
-    return data
-
+# ========== TRANSCRIPT PARSING ==========
 def parse_transcript_with_deepseek(text: str) -> Dict:
-    """Improved DeepSeek API integration with better error handling."""
-    if not DEEPSEEK_API_KEY:
-        raise gr.Error("DeepSeek API key not configured")
+    """Use local DeepSeek model to parse transcript text"""
+    if model is None or tokenizer is None:
+        raise gr.Error("DeepSeek model failed to load. Please try again later.")
 
-    # Pre-process the text to remove sensitive information
-    text = remove_sensitive_info(text)
+    # Pre-process the text
+    text = remove_sensitive_info(text[:15000]) # Limit to first 15k chars
 
-    # Create a more robust prompt with examples
     prompt = f"""
-Analyze this academic transcript and extract structured information. Follow these rules:
-1. Extract data even if partially visible
-2. Guess missing values when reasonable
-3. Return empty if completely missing
-
-Required JSON structure:
+Analyze this academic transcript and extract structured information:
+- Current grade level
+- Weighted GPA (if available)
+- Unweighted GPA (if available)
+- List of all courses with:
+  * Course code
+  * Course name
+  * Grade received
+  * Credits earned
+  * Year/semester taken
+  * Grade level when taken
+
+Return the data in this JSON structure:
 {{
     "grade_level": "11",
     "gpa": {{
@@ -240,44 +242,76 @@ def parse_transcript_with_deepseek(text: str) -> Dict:
         }}
     ]
 }}
-
+
 Transcript Text:
-{text[:15000]} # Limit to first 15k chars to avoid token limits
+{text}
 """
 
-    headers = {
-        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
-        "Content-Type": "application/json"
-    }
-
-    payload = {
-        "model": "deepseek-chat",
-        "messages": [{"role": "user", "content": prompt}],
-        "temperature": 0.1,
-        "max_tokens": 2000
-    }
-
     try:
-        response = requests.post(DEEPSEEK_API_URL, headers=headers, json=payload, timeout=30)
-        response.raise_for_status()
-        result = response.json()
-
-        content = result['choices'][0]['message']['content']
-
-        # Extract JSON from response (handling markdown code blocks)
-        json_str = extract_json_from_response(content)
-
-        # Validate and clean the parsed data
-        parsed_data = validate_parsed_data(json.loads(json_str))
-
-        return parsed_data
-
-    except requests.exceptions.RequestException as e:
-        raise gr.Error(f"API request failed: {str(e)}")
-    except json.JSONDecodeError as e:
-        raise gr.Error(f"Failed to parse API response: {str(e)}")
+        # Show progress to user
+        progress = gr.Progress()
+        progress(0, desc="Analyzing transcript...")
+
+        # Tokenize and generate response
+        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        progress(0.3)
+
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=2000,
+            temperature=0.1,
+            do_sample=True
+        )
+        progress(0.8)
+
+        # Decode the response
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        progress(0.9)
+
+        # Extract the JSON content from the response
+        if '```json' in response:
+            json_str = response.split('```json')[1].split('```')[0].strip()
+        elif '```' in response:
+            json_str = response.split('```')[1].split('```')[0].strip()
+        else:
+            json_str = response
+
+        # Parse and validate the JSON
+        parsed_data = json.loads(json_str)
+        progress(1.0)
+
+        return validate_parsed_data(parsed_data)
+
+    except torch.cuda.OutOfMemoryError:
+        raise gr.Error("The model ran out of memory. Try with a smaller transcript or upgrade your GPU.")
     except Exception as e:
-        raise gr.Error(f"DeepSeek processing error: {str(e)}")
+        raise gr.Error(f"Error processing transcript: {str(e)}")
+
+def validate_parsed_data(data: Dict) -> Dict:
+    """Validate and clean the parsed data structure."""
+    if not isinstance(data, dict):
+        raise ValueError("Invalid data format")
+
+    # Set default structure if missing
+    if 'grade_level' not in data:
+        data['grade_level'] = 'Unknown'
+
+    if 'gpa' not in data:
+        data['gpa'] = {'weighted': 'N/A', 'unweighted': 'N/A'}
+
+    if 'courses' not in data:
+        data['courses'] = []
+
+    # Clean course data
+    for course in data['courses']:
+        if 'grade' in course:
+            course['grade'] = course['grade'].upper().strip()
+
+        # Ensure numeric credits are strings
+        if 'credits' in course and isinstance(course['credits'], (int, float)):
+            course['credits'] = str(course['credits'])
+
+    return data
 
 def format_transcript_output(data: Dict) -> str:
     """Format the parsed data into human-readable text."""
@@ -326,10 +360,10 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
     # Extract text from file
     text = extract_text_from_file(file_obj.name, file_ext)
 
-    # Parse with DeepSeek
+    # Use DeepSeek for parsing
    parsed_data = parse_transcript_with_deepseek(text)
 
-    # Format output
+    # Format output text
    output_text = format_transcript_output(parsed_data)
 
    # Prepare the data structure for saving
@@ -339,7 +373,7 @@ def parse_transcript(file_obj) -> Tuple[str, Optional[Dict]]:
         "courses": defaultdict(list)
     }
 
-    # Organize courses by grade level for saving
+    # Organize courses by grade level
     for course in parsed_data.get('courses', []):
         grade_level = course.get('grade_level', 'Unknown')
         transcript_data["courses"][grade_level].append(course)
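Since `transcript_data["courses"]` is a `defaultdict(list)`, appending under a grade level that has not been seen yet creates the empty list automatically. A standalone example of the same grouping pattern, with made-up course data:

from collections import defaultdict

courses_by_grade = defaultdict(list)
sample_courses = [
    {"name": "Algebra II", "grade_level": "10"},
    {"name": "Chemistry", "grade_level": "10"},
    {"name": "US History", "grade_level": "11"},
]
for course in sample_courses:
    courses_by_grade[course.get("grade_level", "Unknown")].append(course)

print({grade: len(items) for grade, items in courses_by_grade.items()})
# -> {'10': 2, '11': 1}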
@@ -1043,6 +1077,13 @@ def create_interface():
         background-color: #ffebee;
         color: #c62828;
     }
+    .model-loading {
+        padding: 15px;
+        margin: 15px 0;
+        border-radius: 4px;
+        background-color: #fff3e0;
+        color: #e65100;
+    }
     """
 
     gr.Markdown("""
@@ -1051,6 +1092,12 @@ def create_interface():
     Complete each step to get customized learning recommendations.
     """)
 
+    # Model loading status
+    model_status = gr.HTML(
+        value="<div class='model-loading'>Loading AI model... (This may take a few minutes)</div>" if model is None else "",
+        visible=model is None
+    )
+
     # Progress tracker - now with dynamic styling
     with gr.Row():
         with gr.Column(scale=1):
@@ -1101,6 +1148,9 @@ def create_interface():
     transcript_data = gr.State()
 
     def process_transcript_and_update(file_obj, current_tab_status):
+        if model is None:
+            return "Error: AI model failed to load. Please try again later.", None, current_tab_status, gr.update(), gr.update(), gr.update()
+
         output_text, data = parse_transcript(file_obj)
         if "Error" not in output_text:
             new_status = current_tab_status.copy()
@@ -1418,6 +1468,14 @@ def create_interface():
         inputs=[gr.State(4), tab_completed],
         outputs=[tabs, nav_message, quiz_alert]
     )
+
+    # Check model loading status periodically
+    def check_model_status():
+        if model is not None and tokenizer is not None:
+            return gr.update(visible=False)
+        return gr.update(visible=True)
+
+    app.load(check_model_status, None, model_status, every=1)
 
     return app
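Note on this hunk: in Gradio, `every=1` re-runs the function roughly once per second after the page loads. Because `model` and `tokenizer` are assigned once at import and never reassigned, this particular poll can never change its answer; the pattern is still useful when the polled state does change. A self-contained sketch of that pattern (a hypothetical uptime example, unrelated to this app's logic):

import time
import gradio as gr

START = time.time()

def uptime() -> str:
    """Report elapsed seconds since the process started."""
    return f"Up for {time.time() - START:.0f}s"

with gr.Blocks() as demo:
    status = gr.Textbox(label="Status")
    # Re-run uptime() every second once the page has loaded.
    demo.load(uptime, None, status, every=1)

demo.launch()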
 
 