Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -297,9 +297,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
297 |
ds_dict = None
|
298 |
df = None
|
299 |
try:
|
300 |
-
|
301 |
-
# This might not be necessary if load_dataset handles non-existence gracefully
|
302 |
-
# hf_hub_download(repo_id=HF_DATASET_ID, filename="data/train-00000-of-00001.parquet", repo_type="dataset")
|
303 |
ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
|
304 |
logger.info("Dataset loaded successfully.")
|
305 |
if "train" in ds_dict:
|
@@ -310,22 +308,22 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
310 |
|
311 |
except Exception as load_error:
|
312 |
logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
|
313 |
-
|
314 |
df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
|
315 |
|
316 |
-
|
317 |
for col, dtype in expected_columns.items():
|
318 |
if col not in df.columns:
|
319 |
logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
|
320 |
-
|
321 |
df[col] = pd.Series(dtype=dtype)
|
322 |
|
323 |
-
|
324 |
df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
|
325 |
-
|
326 |
df['username'] = df['username'].astype(str).fillna('')
|
327 |
df['timestamp'] = df['timestamp'].astype(str).fillna('')
|
328 |
-
df['code'] = df['code'].astype(str).fillna('')
|
329 |
|
330 |
# 2. Find existing score for the user
|
331 |
existing_entries = df[df['username'] == username]
|
@@ -337,9 +335,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
337 |
max_existing_score = existing_entries['score'].max() # Already numeric
|
338 |
if score > max_existing_score:
|
339 |
logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
|
340 |
-
|
341 |
-
df = df[df['username'] != username].copy() # Use .copy() to avoid SettingWithCopyWarning
|
342 |
-
# Add new entry with score and code link
|
343 |
new_entry = pd.DataFrame([{
|
344 |
'username': username,
|
345 |
'score': score,
|
@@ -366,32 +362,23 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
366 |
if needs_update:
|
367 |
logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
|
368 |
|
369 |
-
# Ensure final DataFrame columns match the expected schema exactly before converting
|
370 |
-
# Select and order columns just in case
|
371 |
df = df[list(expected_columns.keys())]
|
372 |
-
# Explicitly cast types again before creating Dataset object
|
373 |
for col, dtype in expected_columns.items():
|
374 |
-
# Handle potential pandas nullable types if necessary, default to standard types
|
375 |
if dtype == 'str':
|
376 |
df[col] = df[col].astype(str).fillna('')
|
377 |
elif dtype == 'float':
|
378 |
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
|
379 |
-
# Add other type handling if needed
|
380 |
|
381 |
logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
|
382 |
logger.info(f"Sample data before push:\n{df.head().to_string()}")
|
383 |
|
384 |
-
# Create the Dataset object from the final DataFrame
|
385 |
updated_ds = Dataset.from_pandas(df)
|
386 |
-
# Wrap it in a DatasetDict (standard practice)
|
387 |
final_ds_dict = DatasetDict({'train': updated_ds})
|
388 |
|
389 |
logger.info(f"Dataset structure to push: {final_ds_dict}")
|
390 |
|
391 |
-
|
392 |
-
|
393 |
-
# logger.info(f"Successfully pushed updated dataset to '{HF_DATASET_ID}'.")
|
394 |
-
logger.warning("Dataset push to hub is currently commented out in the code. Uncomment the 'push_to_hub' line to enable leaderboard updates.")
|
395 |
return True
|
396 |
else:
|
397 |
logger.info("No changes needed, dataset not pushed.")
|
@@ -399,8 +386,7 @@ def update_huggingface_dataset(username: str, score: float, code_link: str):
|
|
399 |
|
400 |
except Exception as e:
|
401 |
logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
|
402 |
-
|
403 |
-
# Adjust the exception type if not using FastAPI's HTTPException
|
404 |
raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
|
405 |
|
406 |
|
|
|
297 |
ds_dict = None
|
298 |
df = None
|
299 |
try:
|
300 |
+
|
|
|
|
|
301 |
ds_dict = load_dataset(HF_DATASET_ID, trust_remote_code=True) # Added trust_remote_code=True if needed
|
302 |
logger.info("Dataset loaded successfully.")
|
303 |
if "train" in ds_dict:
|
|
|
308 |
|
309 |
except Exception as load_error:
|
310 |
logger.warning(f"Could not load dataset '{HF_DATASET_ID}' or it's empty/new ({load_error}). Will create structure.")
|
311 |
+
|
312 |
df = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in expected_columns.items()})
|
313 |
|
314 |
+
|
315 |
for col, dtype in expected_columns.items():
|
316 |
if col not in df.columns:
|
317 |
logger.warning(f"Column '{col}' not found in loaded data. Adding it.")
|
318 |
+
|
319 |
df[col] = pd.Series(dtype=dtype)
|
320 |
|
321 |
+
|
322 |
df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0.0)
|
323 |
+
|
324 |
df['username'] = df['username'].astype(str).fillna('')
|
325 |
df['timestamp'] = df['timestamp'].astype(str).fillna('')
|
326 |
+
df['code'] = df['code'].astype(str).fillna('')
|
327 |
|
328 |
# 2. Find existing score for the user
|
329 |
existing_entries = df[df['username'] == username]
|
|
|
335 |
max_existing_score = existing_entries['score'].max() # Already numeric
|
336 |
if score > max_existing_score:
|
337 |
logger.info(f"New score {score} is higher than existing max {max_existing_score} for {username}. Updating entry.")
|
338 |
+
df = df[df['username'] != username].copy()
|
|
|
|
|
339 |
new_entry = pd.DataFrame([{
|
340 |
'username': username,
|
341 |
'score': score,
|
|
|
362 |
if needs_update:
|
363 |
logger.info(f"Preparing to push updated dataset to '{HF_DATASET_ID}'...")
|
364 |
|
|
|
|
|
365 |
df = df[list(expected_columns.keys())]
|
|
|
366 |
for col, dtype in expected_columns.items():
|
|
|
367 |
if dtype == 'str':
|
368 |
df[col] = df[col].astype(str).fillna('')
|
369 |
elif dtype == 'float':
|
370 |
df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0) # Ensure float conversion
|
|
|
371 |
|
372 |
logger.info(f"Final DataFrame columns and types:\n{df.dtypes}")
|
373 |
logger.info(f"Sample data before push:\n{df.head().to_string()}")
|
374 |
|
|
|
375 |
updated_ds = Dataset.from_pandas(df)
|
|
|
376 |
final_ds_dict = DatasetDict({'train': updated_ds})
|
377 |
|
378 |
logger.info(f"Dataset structure to push: {final_ds_dict}")
|
379 |
|
380 |
+
final_ds_dict.push_to_hub(HF_DATASET_ID)
|
381 |
+
logger.warning("Dataset push to hub is currently commented out in the code.")
|
|
|
|
|
382 |
return True
|
383 |
else:
|
384 |
logger.info("No changes needed, dataset not pushed.")
|
|
|
386 |
|
387 |
except Exception as e:
|
388 |
logger.error(f"Error interacting with Hugging Face dataset '{HF_DATASET_ID}': {e}", exc_info=True)
|
389 |
+
|
|
|
390 |
raise HTTPException(status_code=500, detail=f"Failed to update Hugging Face dataset: {e}")
|
391 |
|
392 |
|