Update main.py
main.py CHANGED
@@ -9,6 +9,9 @@ from datetime import datetime, timezone
 import logging
 import uvicorn # To run the app
 
+tool_threshold = 3
+step_threshold = 5
+
 # --- Configuration ---
 HF_DATASET_ID = "agents-course/unit4-students-scores"
 # Ensure you have write access to this dataset repository on Hugging Face
@@ -17,26 +20,38 @@ HF_DATASET_ID = "agents-course/unit4-students-scores"
 # --- Logging Setup ---
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-
-
-
-
-
-
-
-
-#
-
-
-
-#
-
-
-
-
-
-
-#
+filtered_dataset = None
+def load_filtered_dataset():  # renamed: the original 'def load_dataset()' shadowed datasets.load_dataset and would call itself
+    global filtered_dataset
+    tempo_filtered = []
+    dataset = load_dataset("gaia-benchmark/GAIA", "2023_level1", trust_remote_code=True)
+    for question in dataset['validation']:
+        metadata = question.get('Annotator Metadata')  # Use .get() for safety
+
+        if metadata:  # Check if 'Annotator Metadata' exists
+            num_tools_str = metadata.get('Number of tools')
+            num_steps_str = metadata.get('Number of steps')
+
+            # Check if both numbers exist before trying to convert
+            if num_tools_str is not None and num_steps_str is not None:
+                try:
+                    # Convert values to integers for comparison
+                    num_tools = int(num_tools_str)
+                    num_steps = int(num_steps_str)
+
+                    # Apply the filter conditions
+                    if num_tools < tool_threshold and num_steps < step_threshold:
+                        print(f"MATCH FOUND (Task ID: {question.get('task_id', 'N/A')}) - Tools: {num_tools}, Steps: {num_steps}")
+                        print(question)  # Print the matching question dictionary
+                        print("------------------------------------------------------------------")
+                        tempo_filtered.append(question)  # Add to the filtered list
+                    # else: # Optional: Handle items that don't match the filter
+                    #     print(f"Skipping Task ID: {question.get('task_id', 'N/A')} - Tools: {num_tools}, Steps: {num_steps}")
+                except ValueError:
+                    # Handle cases where the tool/step count is not a valid integer
+                    print(f"Skipping Task ID: {question.get('task_id', 'N/A')} - Could not convert tool/step count to integer.")
+                    print("------------------------------------------------------------------")
+    filtered_dataset = tempo_filtered  # assign the global (original wrote to 'filtered_data', a dead local)
 
 # Prepare data structures for the API
 questions_for_api: List[Dict[str, str]] = []
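Note: the hunk above loads GAIA level-1 questions eagerly and keeps only those whose annotator metadata reports fewer than tool_threshold tools and step_threshold steps. A minimal standalone sketch of the same filter, assuming the datasets library is installed and access to the gated gaia-benchmark/GAIA repo has already been granted (function and variable names below are illustrative, not from main.py):

    from datasets import load_dataset

    def filter_gaia(tool_threshold: int = 3, step_threshold: int = 5) -> list:
        # Gated dataset: requires accepting the terms on the Hub and a cached login
        ds = load_dataset("gaia-benchmark/GAIA", "2023_level1", trust_remote_code=True)
        filtered = []
        for q in ds["validation"]:
            meta = q.get("Annotator Metadata") or {}
            try:
                tools = int(meta.get("Number of tools", ""))
                steps = int(meta.get("Number of steps", ""))
            except ValueError:
                continue  # missing or non-numeric counts: skip the row
            if tools < tool_threshold and steps < step_threshold:
                filtered.append(q)
        return filtered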
@@ -169,7 +184,8 @@ def update_huggingface_dataset(username: str, score: float):
 # Ensure the schema matches if columns were added/modified.
 # Use 'train' split convention.
 updated_ds = DatasetDict({'train': Dataset.from_pandas(df)})
-updated_ds.push_to_hub(HF_DATASET_ID)
+print(updated_ds)  # fixed: the original commit had 'pritn'
+#updated_ds.push_to_hub(HF_DATASET_ID) # Token should be picked up from env or login
 logger.info("Dataset push successful.")
 return True
 else:
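Note: this hunk swaps the actual upload for a debug print and comments out push_to_hub, so the "Dataset push successful." log line no longer reflects a real push. For reference, a sketch of the pattern the disabled line would perform, assuming a write token is exposed as the HF_TOKEN environment variable (the sample row is invented for illustration):

    import os
    import pandas as pd
    from datasets import Dataset, DatasetDict

    df = pd.DataFrame([{"username": "example-user", "score": 0.5}])  # illustrative data
    updated_ds = DatasetDict({"train": Dataset.from_pandas(df)})
    # Without an explicit token, push_to_hub falls back to the cached CLI login
    updated_ds.push_to_hub("agents-course/unit4-students-scores", token=os.getenv("HF_TOKEN"))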
@@ -311,4 +327,5 @@ if __name__ == "__main__":
 local_port = int(os.getenv("PORT", "8000"))
 logger.info(f"Running Uvicorn locally on port: {local_port}")
 # Note: host='127.0.0.1' is usually fine for local runs outside docker
+load_filtered_dataset()  # call site renamed to match the loader above
 uvicorn.run(app, host="127.0.0.1", port=local_port, log_level="info")
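Note: calling the loader only inside the __main__ block means it runs for local launches but not when uvicorn is started externally (e.g. by a Docker CMD on Spaces). One alternative, sketched here on the assumption that app is the FastAPI instance defined earlier in main.py, is a startup hook:

    @app.on_event("startup")
    def _init_data() -> None:
        # Runs once per server start, however uvicorn is launched
        load_filtered_dataset()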