# TabArena-WIP / src/generate_dummy_data.py
# Script to generate dummy benchmark data (added by geoalgo, commit d19e46b,
# "edit readme, add script to generate dummy data").
from dataclasses import dataclass
import json
from pathlib import Path
import pandas as pd
import numpy as np
from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames
np.random.seed(0)  # fix the global NumPy RNG seed so the generated dummy data is reproducible
@dataclass
class DatasetInfoRow:
    """Static properties of one benchmark dataset used when generating dummy results."""

    # Dataset identifier, e.g. "airline"; written to the "dataset" column.
    name: str
    # One of the ProblemTypes constants (regression / classification / ...).
    problem_type: str
    # Number of rows in the dataset.
    num_rows: int
    # Number of feature columns.
    num_features: int
def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]) -> pd.DataFrame:
    """Generate random evaluation results for one method and write them under ``results/<method>/``.

    Two files are written:
      * ``results/<method>/all_results.csv`` — one row per dataset with random metric values
      * ``results/<method>/config.json`` — ``{"model": method, <method-type column>: model_type}``

    Args:
        method: Display name of the method; also used as the output folder name.
        model_type: Method-type label stored in config.json (e.g. a ``MethodTypes`` constant).
        datasets: Dataset descriptors; one result row is produced per dataset.

    Returns:
        The DataFrame that was written to ``all_results.csv``.
    """
    # Schema: dataset, model, eval_metrics/<raw-error>, eval_metrics/<fit-time>,
    # eval_metrics/<inference-time>, <problem-type column>, <num-features column>.
    rows = [
        {
            "dataset": dataset.name,
            "model": method,
            # Random values stand in for real evaluation metrics.
            f"eval_metrics/{MetricNames.raw_error}": np.random.rand(),
            f"eval_metrics/{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"eval_metrics/{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
            ProblemTypes.col_name: dataset.problem_type,
            DatasetInfo.num_features: dataset.num_features,
        }
        for dataset in datasets
    ]
    df = pd.DataFrame(rows)

    result_path = Path(__file__).parent.parent / "results" / method
    result_path.mkdir(parents=True, exist_ok=True)  # create results/<method>/ if missing
    df.to_csv(result_path / "all_results.csv", index=False)
    with open(result_path / "config.json", "w") as f:
        json.dump(
            {
                "model": method,
                MethodTypes.col_name: model_type,
            },
            f,
        )
    return df
if __name__ == "__main__":
    # Dummy dataset descriptors covering each problem type.
    datasets = [
        DatasetInfoRow(name="airline", problem_type=ProblemTypes.regression, num_features=12, num_rows=10),
        DatasetInfoRow(name="electricity", problem_type=ProblemTypes.classification, num_features=2, num_rows=1020),
        DatasetInfoRow(name="solar-energy", problem_type=ProblemTypes.multi_classification, num_features=3, num_rows=100),
        DatasetInfoRow(name="traffic", problem_type=ProblemTypes.multi_classification, num_features=12, num_rows=10000),
        DatasetInfoRow(name="volcano", problem_type=ProblemTypes.regression, num_features=12, num_rows=100),
    ]
    # (display name, method-type) pairs to generate dummy results for.
    methods = [
        ("AutoGluon (best)", MethodTypes.automl),
        ("CatBoost", MethodTypes.boosted_tree),
        ("TabPFN", MethodTypes.foundational),
        ("TabPFN-v2", MethodTypes.foundational),
        ("KNN", MethodTypes.other),
    ]
    for method, method_type in methods:
        generate_dummy_data(
            method=method,
            datasets=datasets,
            model_type=method_type,
        )

    # Write the per-dataset properties table next to the per-method results.
    row_datasets = [
        {
            DatasetInfo.col_name: dataset.name,
            ProblemTypes.col_name: dataset.problem_type,
            DatasetInfo.num_rows: dataset.num_rows,
            DatasetInfo.num_features: dataset.num_features,
        }
        for dataset in datasets
    ]
    pd.DataFrame(row_datasets).to_csv(
        Path(__file__).parent.parent / "results" / "dataset_properties.csv", index=False
    )