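# NOTE(review): "Spaces: Running / Running" appears to be status-banner residue
# from the page this file was captured from; it is not part of the program.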
from dataclasses import dataclass | |
import json | |
from pathlib import Path | |
import pandas as pd | |
import numpy as np | |
from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames | |
# Fix NumPy's global RNG state so the generated dummy values are repeatable
# from one run of this script to the next.
np.random.seed(seed=0)
@dataclass
class DatasetInfoRow:
    """Describe one dataset used when generating the dummy result files.

    The ``@dataclass`` decorator is required: the script entry point builds
    instances with keyword arguments, which only works when an ``__init__``
    is generated from the annotated fields below.
    """

    # Dataset identifier; written to the "dataset" column of the results.
    name: str
    # Problem-type value; the script passes ``ProblemTypes`` constants here.
    problem_type: str
    # Number of rows recorded for the dataset in the properties file.
    num_rows: int
    # Number of features recorded for the dataset.
    num_features: int
def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]):
    """Write random placeholder results for one method and return them.

    One row is produced per entry in ``datasets``. Each row holds the dataset
    name, the method name, a random raw error from ``np.random.rand()``, random
    integer fit and inference times from ``np.random.randint(1, 100)``, and the
    dataset's problem type and feature count.

    Two files are written under ``results/<method>/`` (relative to the parent
    of this file's directory): ``all_results.csv`` with the rows, and
    ``config.json`` with the method name and ``model_type``.

    Args:
        method: Method name; used as the "model" value and as the output
            directory name.
        model_type: Value stored under ``MethodTypes.col_name`` in config.json.
        datasets: Dataset descriptions to generate one result row each for.

    Returns:
        The DataFrame that was written to ``all_results.csv``.
    """
    # The random draws happen in the same order for every dataset
    # (error, fit time, inference time), so seeded runs stay reproducible.
    records = [
        {
            "dataset": info.name,
            "model": method,
            f"eval_metrics/{MetricNames.raw_error}": np.random.rand(),
            f"eval_metrics/{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"eval_metrics/{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
            ProblemTypes.col_name: info.problem_type,
            DatasetInfo.num_features: info.num_features,
        }
        for info in datasets
    ]
    frame = pd.DataFrame(records)

    # Each method gets its own directory below the shared "results" folder.
    out_dir = Path(__file__).parent.parent / "results" / method
    out_dir.mkdir(parents=True, exist_ok=True)
    frame.to_csv(out_dir / "all_results.csv", index=False)

    config = {
        "model": method,
        MethodTypes.col_name: model_type,
    }
    (out_dir / "config.json").write_text(json.dumps(config))

    return frame
if __name__ == "__main__":
    # Sample datasets as (name, problem type, feature count, row count).
    datasets = [
        DatasetInfoRow(name=name, problem_type=problem_type, num_features=num_features, num_rows=num_rows)
        for name, problem_type, num_features, num_rows in (
            ("airline", ProblemTypes.regression, 12, 10),
            ("electricity", ProblemTypes.classification, 2, 1020),
            ("solar-energy", ProblemTypes.multi_classification, 3, 100),
            ("traffic", ProblemTypes.multi_classification, 12, 10000),
            ("volcano", ProblemTypes.regression, 12, 100),
        )
    ]

    # Methods to generate placeholder results for, paired with their type.
    methods = (
        ("AutoGluon (best)", MethodTypes.automl),
        ("CatBoost", MethodTypes.boosted_tree),
        ("TabPFN", MethodTypes.foundational),
        ("TabPFN-v2", MethodTypes.foundational),
        ("KNN", MethodTypes.other),
    )
    for method_name, kind in methods:
        generate_dummy_data(method=method_name, model_type=kind, datasets=datasets)

    # Write one row per dataset describing its properties. This goes next to
    # the per-method result folders that the loop above has just created.
    properties = pd.DataFrame(
        [
            {
                DatasetInfo.col_name: entry.name,
                ProblemTypes.col_name: entry.problem_type,
                DatasetInfo.num_rows: entry.num_rows,
                DatasetInfo.num_features: entry.num_features,
            }
            for entry in datasets
        ]
    )
    properties_path = Path(__file__).parent.parent / "results" / "dataset_properties.csv"
    properties.to_csv(properties_path, index=False)