from dataclasses import dataclass
import json
from pathlib import Path

import pandas as pd
import numpy as np

from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames, DatasetInfoRow

# Fixed seed so the generated dummy metrics are reproducible across runs.
np.random.seed(0)


def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]) -> pd.DataFrame:
    """Write random per-dataset results and a config file for *method*.

    Creates ``results/<method>/all_results.csv`` (one row per dataset with
    columns dataset, model, raw-error and fit/inference-time metrics) and
    ``results/<method>/config.json`` recording the model name and its
    method-type family.

    Args:
        method: Model name; used as the ``model`` column value and as the
            results sub-directory name.
        model_type: Method-family value stored under ``MethodTypes.col_name``
            in ``config.json``.
        datasets: Dataset descriptors; one random result row per entry.

    Returns:
        The DataFrame that was written to ``all_results.csv``.
    """
    # Target schema (see also the full leaderboard schema):
    # dataset,model,eval_metrics/normalized-error,eval_metrics/rank,eval_metrics/ELO,
    # eval_metrics/fit_time_per_1K_rows,eval_metrics/inference_time_per_1K_rows,domain,num_variates
    rows = []
    for dataset in datasets:
        rows.append({
            "dataset": dataset.name,
            "model": method,
            f"{MetricNames.raw_error}": np.random.rand(),
            f"{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
        })
    df = pd.DataFrame(rows)

    result_path = Path(__file__).parent.parent / "results" / method
    csv_path = result_path / "all_results.csv"
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(csv_path, index=False)

    with open(result_path / "config.json", "w") as f:
        json.dump(
            {
                "model": method,
                MethodTypes.col_name: model_type,
            },
            f,
        )
    return df


def get_model_type(model: str):
    """Map a model name to its method-type family via substring matching.

    Patterns are tried in dict insertion order and the FIRST substring match
    wins (e.g. "FT" matches "FT_TRANSFORMER" before the more specific key is
    reached — both map to ``finetuned``, so ordering is currently benign).
    Unknown names fall back to ``MethodTypes.other``.
    """
    pattern_families = {
        "tuned": MethodTypes.automl,
        "tuned + ensemble": MethodTypes.automl,
        "FT": MethodTypes.finetuned,
        "AutoGluon": MethodTypes.automl,
        "Autosklearn2": MethodTypes.automl,
        "CAT": MethodTypes.tree,
        "EBM": MethodTypes.other,
        "FASTAI": MethodTypes.finetuned,
        "FT_TRANSFORMER": MethodTypes.finetuned,
        "GBM": MethodTypes.tree,
        "KNN": MethodTypes.other,
        "REALMLP": MethodTypes.finetuned,
        "RF": MethodTypes.tree,
        "XGB": MethodTypes.tree,
        "XT": MethodTypes.tree,
    }
    for pattern, family in pattern_families.items():
        if pattern in model:
            return family
    return MethodTypes.other


if __name__ == "__main__":
    # Import raw TabRepo result dumps and re-export them, one directory per
    # model, in the schema the rest of this project expects.
    tabrepo_results_root = Path("~/Downloads/tabrepo_temp_results").expanduser()
    results_root = Path(__file__).parent.parent / "results"
    results_root.mkdir(exist_ok=True)

    df_datasets = pd.read_csv(tabrepo_results_root / "dataset_properties.csv")
    df_datasets.to_csv(results_root / "dataset_properties.csv", index=False)

    df_models = pd.read_csv(tabrepo_results_root / "all_results.csv")
    # For now discard tuned and ensemble variants.
    df_models = df_models[~df_models.model.str.contains("tuned")]

    for model in df_models.loc[:, "model"].unique():
        result_path = results_root / model
        result_path.mkdir(exist_ok=True)
        model_type = get_model_type(model)

        df_all_results = df_models.loc[df_models.model == model].copy()
        # Target columns: dataset,model,raw-error,fit-time-per-1K-rows,inference-time-per-1K-rows
        df_all_results.rename(columns={
            "metric_error": "raw-error",
            # TODO divide by number of rows and multiply by 1K
            "time_train_s": "fit-time-per-1K-rows",
            "time_infer_s": "inference-time-per-1K-rows",
        }, inplace=True)
        df_all_results.to_csv(result_path / "all_results.csv", index=False)

        with open(result_path / "config.json", "w") as f:
            json.dump(
                {
                    "model": model,
                    MethodTypes.col_name: model_type,
                },
                f,
            )