"""Generate dummy benchmark results for local development.

Run as a script, this writes one ``all_results.csv`` and one ``config.json``
per method under ``<parent of this file's directory>/results/<method>/``, plus
a ``dataset_properties.csv`` directly under ``results/``.
"""

from dataclasses import dataclass
import json
from pathlib import Path

import numpy as np
import pandas as pd

from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames, DatasetInfoRow

# Fixed seed so repeated runs draw the same dummy numbers.
np.random.seed(0)

# Root directory that receives every generated file.
RESULTS_DIR = Path(__file__).parent.parent / "results"


def generate_dummy_data(
    method: str, model_type: str, datasets: list[DatasetInfoRow]
) -> pd.DataFrame:
    """Write random results for one method and return them as a DataFrame.

    One row is produced per dataset with the columns ``dataset``, ``model``
    and the three metric columns named by ``MetricNames.raw_error``,
    ``MetricNames.fit_time_per_1K_rows`` and
    ``MetricNames.inference_time_per_1K_rows``.

    Side effects: creates ``results/<method>/`` if needed and writes
    ``all_results.csv`` (the rows) and ``config.json`` (the method name and
    its type, keyed by ``MethodTypes.col_name``) into it.

    Args:
        method: Method name; used as the ``model`` value and as the output
            sub-directory name.
        model_type: Value stored under ``MethodTypes.col_name`` in
            ``config.json``.
        datasets: Datasets to generate a row for; only ``.name`` is read.

    Returns:
        The DataFrame that was written to ``all_results.csv``.
    """
    # Draw order (error, fit time, inference time, dataset by dataset) is kept
    # stable so the seeded output does not change between versions.
    # The metric keys stay as f-strings so each key is the formatted string
    # form of the constant, whatever its underlying type is.
    rows = [
        {
            "dataset": dataset.name,
            "model": method,
            # Uniform float in [0, 1).
            f"{MetricNames.raw_error}": np.random.rand(),
            # Integers in [1, 100) -- numpy's upper bound is exclusive.
            f"{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
        }
        for dataset in datasets
    ]
    df = pd.DataFrame(rows)

    result_path = RESULTS_DIR / method
    result_path.mkdir(parents=True, exist_ok=True)
    df.to_csv(result_path / "all_results.csv", index=False)

    config = {
        "model": method,
        MethodTypes.col_name: model_type,
    }
    # Explicit encoding so the file does not depend on the platform default.
    with open(result_path / "config.json", "w", encoding="utf-8") as f:
        json.dump(config, f)

    return df


def main() -> None:
    """Generate dummy results for a fixed set of methods and datasets."""
    datasets = [
        DatasetInfoRow(name="airline", problem_type=ProblemTypes.regression, num_features=12, num_rows=10),
        DatasetInfoRow(name="electricity", problem_type=ProblemTypes.classification, num_features=2, num_rows=1020),
        DatasetInfoRow(name="solar-energy", problem_type=ProblemTypes.multi_classification, num_features=3, num_rows=100),
        DatasetInfoRow(name="traffic", problem_type=ProblemTypes.multi_classification, num_features=12, num_rows=10000),
        DatasetInfoRow(name="volcano", problem_type=ProblemTypes.regression, num_features=12, num_rows=100),
    ]

    methods = [
        ("AutoGluon (best)", MethodTypes.automl),
        ("CatBoost", MethodTypes.boosted_tree),
        ("TabPFN", MethodTypes.foundational),
        ("TabPFN-v2", MethodTypes.foundational),
        ("KNN", MethodTypes.other),
    ]

    for method, method_type in methods:
        generate_dummy_data(
            method=method,
            datasets=datasets,
            model_type=method_type,
        )

    # One row per dataset: name, problem type, number of rows, number of features.
    row_datasets = [
        {
            DatasetInfo.col_name: dataset.name,
            ProblemTypes.col_name: dataset.problem_type,
            DatasetInfo.num_rows: dataset.num_rows,
            DatasetInfo.num_features: dataset.num_features,
        }
        for dataset in datasets
    ]
    # Do not rely on the per-method loop having created the directory.
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(row_datasets).to_csv(RESULTS_DIR / "dataset_properties.csv", index=False)


if __name__ == "__main__":
    main()