Jofthomas committed on
Commit
e946ee0
·
verified ·
1 Parent(s): 3227abd

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +11 -25
main.py CHANGED
@@ -297,9 +297,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
297
  ds_dict = None
298
  df = None
299
  try:
300
- # Try downloading a file first to check existence without loading full dataset if large
301
- # This might not be necessary if load_dataset handles non-existence gracefully
302
- # hf_hub_download(repo_id=HF_DATASET_ID, filename="data/train-00000-of-00001.parquet", repo_type="dataset")
303
  ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
304
  logger.info("Dataset loaded successfully.")
305
  if "train" in ds_dict:
@@ -310,22 +308,22 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
310
 
311
  except Exception as load_error:
312
  logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
313
- # Create an empty DataFrame with the correct schema if loading failed
314
  df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
315
 
316
- # Ensure all expected columns exist, add if they don't
317
  for col, dtype in expected_columns.items():
318
  if col not in df.columns:
319
  logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
320
- # Use appropriate default based on dtype if needed, though concat handles it
321
  df[col] = pd.Series(dtype=dtype)
322
 
323
- # Convert score column to numeric, coercing errors, and fill NaNs
324
  df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
325
- # Ensure other columns have correct types, fill NaNs for string columns
326
  df['username'] = df['username'].astype(str).fillna('')
327
  df['timestamp'] = df['timestamp'].astype(str).fillna('')
328
- df['code'] = df['code'].astype(str).fillna('') # Ensure code column is string
329
 
330
  # 2. Find existing score for the user
331
  existing_entries = df[df['username'] == username]
@@ -337,9 +335,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
337
  max_existing_score = existing_entries['score'].max() # Already numeric
338
  if score > max_existing_score:
339
  logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
340
- # Remove *all* old entries for this user to replace with the single best one
341
- df = df[df['username'] != username].copy() # Use .copy() to avoid SettingWithCopyWarning
342
- # Add new entry with score and code link
343
  new_entry = pd.DataFrame([{
344
  'username': username,
345
  'score': score,
@@ -366,32 +362,23 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
366
  if needs_update:
367
  logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
368
 
369
- # Ensure final DataFrame columns match the expected schema exactly before converting
370
- # Select and order columns just in case
371
  df = df[list(expected_columns.keys())]
372
- # Explicitly cast types again before creating Dataset object
373
  for col, dtype in expected_columns.items():
374
- # Handle potential pandas nullable types if necessary, default to standard types
375
  if dtype == 'str':
376
  df[col] = df[col].astype(str).fillna('')
377
  elif dtype == 'float':
378
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
379
- # Add other type handling if needed
380
 
381
  logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
382
  logger.info(f"Sample data before push:\n{df.head().to_string()}")
383
 
384
- # Create the Dataset object from the final DataFrame
385
  updated_ds = Dataset.from_pandas(df)
386
- # Wrap it in a DatasetDict (standard practice)
387
  final_ds_dict = DatasetDict({'train': updated_ds})
388
 
389
  logger.info(f"Dataset structure to push: {final_ds_dict}")
390
 
391
- # *** UNCOMMENT THIS LINE TO ACTUALLY PUSH THE DATA ***
392
- # final_ds_dict.push_to_hub(HF_DATASET_ID)
393
- # logger.info(f"Successfully pushed updated dataset to '{HF_DATASET_ID}'.")
394
- logger.warning("Dataset push to hub is currently commented out in the code. Uncomment the 'push_to_hub' line to enable leaderboard updates.")
395
  return True
396
  else:
397
  logger.info("No changes needed, dataset not pushed.")
@@ -399,8 +386,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
399
 
400
  except Exception as e:
401
  logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
402
- # Re-raise the exception to be caught by the endpoint handler or calling function
403
- # Adjust the exception type if not using FastAPI's HTTPException
404
  raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
405
 
406
 
 
297
  ds_dict = None
298
  df = None
299
  try:
300
+
 
 
301
  ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
302
  logger.info("Dataset loaded successfully.")
303
  if "train" in ds_dict:
 
308
 
309
  except Exception as load_error:
310
  logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
311
+
312
  df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
313
 
314
+
315
  for col, dtype in expected_columns.items():
316
  if col not in df.columns:
317
  logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
318
+
319
  df[col] = pd.Series(dtype=dtype)
320
 
321
+
322
  df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
323
+
324
  df['username'] = df['username'].astype(str).fillna('')
325
  df['timestamp'] = df['timestamp'].astype(str).fillna('')
326
+ df['code'] = df['code'].astype(str).fillna('')
327
 
328
  # 2. Find existing score for the user
329
  existing_entries = df[df['username'] == username]
 
335
  max_existing_score = existing_entries['score'].max() # Already numeric
336
  if score > max_existing_score:
337
  logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
338
+ df = df[df['username'] != username].copy()
 
 
339
  new_entry = pd.DataFrame([{
340
  'username': username,
341
  'score': score,
 
362
  if needs_update:
363
  logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
364
 
 
 
365
  df = df[list(expected_columns.keys())]
 
366
  for col, dtype in expected_columns.items():
 
367
  if dtype == 'str':
368
  df[col] = df[col].astype(str).fillna('')
369
  elif dtype == 'float':
370
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
 
371
 
372
  logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
373
  logger.info(f"Sample data before push:\n{df.head().to_string()}")
374
 
 
375
  updated_ds = Dataset.from_pandas(df)
 
376
  final_ds_dict = DatasetDict({'train': updated_ds})
377
 
378
  logger.info(f"Dataset structure to push: {final_ds_dict}")
379
 
380
+ final_ds_dict.push_to_hub(HF_DATASET_ID)
381
+ logger.warning("Dataset push to hub is currently commented out in the code.")
 
 
382
  return True
383
  else:
384
  logger.info("No changes needed, dataset not pushed.")
 
386
 
387
  except Exception as e:
388
  logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
389
+
 
390
  raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
391
 
392