Jofthomas committed on
Commit
e946ee0
·
verified ·
1 Parent(s): 3227abd

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +11 -25
main.py CHANGED
@@ -297,9 +297,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
297
  ds_dict = None
298
  df = None
299
  try:
300
- # Try downloading a file first to check existence without loading full dataset if large
301
- # This might not be necessary if load_dataset handles non-existence gracefully
302
- # hf_hub_download(repo_id=HF_DATASET_ID, filename="data/train-00000-of-00001.parquet", repo_type="dataset")
303
  ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
304
  logger.info("Dataset loaded successfully.")
305
  if "train" in ds_dict:
@@ -310,22 +308,22 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
310
 
311
  except Exception as load_error:
312
  logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
313
- # Create an empty DataFrame with the correct schema if loading failed
314
  df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
315
 
316
- # Ensure all expected columns exist, add if they don't
317
  for col, dtype in expected_columns.items():
318
  if col not in df.columns:
319
  logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
320
- # Use appropriate default based on dtype if needed, though concat handles it
321
  df[col] = pd.Series(dtype=dtype)
322
 
323
- # Convert score column to numeric, coercing errors, and fill NaNs
324
  df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
325
- # Ensure other columns have correct types, fill NaNs for string columns
326
  df['username'] = df['username'].astype(str).fillna('')
327
  df['timestamp'] = df['timestamp'].astype(str).fillna('')
328
- df['code'] = df['code'].astype(str).fillna('') # Ensure code column is string
329
 
330
  # 2. Find existing score for the user
331
  existing_entries = df[df['username'] == username]
@@ -337,9 +335,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
337
  max_existing_score = existing_entries['score'].max() # Already numeric
338
  if score > max_existing_score:
339
  logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
340
- # Remove *all* old entries for this user to replace with the single best one
341
- df = df[df['username'] != username].copy() # Use .copy() to avoid SettingWithCopyWarning
342
- # Add new entry with score and code link
343
  new_entry = pd.DataFrame([{
344
  'username': username,
345
  'score': score,
@@ -366,32 +362,23 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
366
  if needs_update:
367
  logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
368
 
369
- # Ensure final DataFrame columns match the expected schema exactly before converting
370
- # Select and order columns just in case
371
  df = df[list(expected_columns.keys())]
372
- # Explicitly cast types again before creating Dataset object
373
  for col, dtype in expected_columns.items():
374
- # Handle potential pandas nullable types if necessary, default to standard types
375
  if dtype == 'str':
376
  df[col] = df[col].astype(str).fillna('')
377
  elif dtype == 'float':
378
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
379
- # Add other type handling if needed
380
 
381
  logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
382
  logger.info(f"Sample data before push:\n{df.head().to_string()}")
383
 
384
- # Create the Dataset object from the final DataFrame
385
  updated_ds = Dataset.from_pandas(df)
386
- # Wrap it in a DatasetDict (standard practice)
387
  final_ds_dict = DatasetDict({'train': updated_ds})
388
 
389
  logger.info(f"Dataset structure to push: {final_ds_dict}")
390
 
391
- # *** UNCOMMENT THIS LINE TO ACTUALLY PUSH THE DATA ***
392
- # final_ds_dict.push_to_hub(HF_DATASET_ID)
393
- # logger.info(f"Successfully pushed updated dataset to '{HF_DATASET_ID}'.")
394
- logger.warning("Dataset push to hub is currently commented out in the code. Uncomment the 'push_to_hub' line to enable leaderboard updates.")
395
  return True
396
  else:
397
  logger.info("No changes needed, dataset not pushed.")
@@ -399,8 +386,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
399
 
400
  except Exception as e:
401
  logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
402
- # Re-raise the exception to be caught by the endpoint handler or calling function
403
- # Adjust the exception type if not using FastAPI's HTTPException
404
  raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
405
 
406
 
 
297
  ds_dict = None
298
  df = None
299
  try:
300
+
 
 
301
  ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
302
  logger.info("Dataset loaded successfully.")
303
  if "train" in ds_dict:
 
308
 
309
  except Exception as load_error:
310
  logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
311
+
312
  df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
313
 
314
+
315
  for col, dtype in expected_columns.items():
316
  if col not in df.columns:
317
  logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
318
+
319
  df[col] = pd.Series(dtype=dtype)
320
 
321
+
322
  df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
323
+
324
  df['username'] = df['username'].astype(str).fillna('')
325
  df['timestamp'] = df['timestamp'].astype(str).fillna('')
326
+ df['code'] = df['code'].astype(str).fillna('')
327
 
328
  # 2. Find existing score for the user
329
  existing_entries = df[df['username'] == username]
 
335
  max_existing_score = existing_entries['score'].max() # Already numeric
336
  if score > max_existing_score:
337
  logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
338
+ df = df[df['username'] != username].copy()
 
 
339
  new_entry = pd.DataFrame([{
340
  'username': username,
341
  'score': score,
 
362
  if needs_update:
363
  logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
364
 
 
 
365
  df = df[list(expected_columns.keys())]
 
366
  for col, dtype in expected_columns.items():
 
367
  if dtype == 'str':
368
  df[col] = df[col].astype(str).fillna('')
369
  elif dtype == 'float':
370
  df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
 
371
 
372
  logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
373
  logger.info(f"Sample data before push:\n{df.head().to_string()}")
374
 
 
375
  updated_ds = Dataset.from_pandas(df)
 
376
  final_ds_dict = DatasetDict({'train': updated_ds})
377
 
378
  logger.info(f"Dataset structure to push: {final_ds_dict}")
379
 
380
+ final_ds_dict.push_to_hub(HF_DATASET_ID)
381
+ logger.warning("Dataset push to hub is currently commented out in the code.")
 
 
382
  return True
383
  else:
384
  logger.info("No changes needed, dataset not pushed.")
 
386
 
387
  except Exception as e:
388
  logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
389
+
 
390
  raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
391
 
392