File size: 3,661 Bytes
9daaf0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from dataclasses import dataclass
import json
from pathlib import Path
import pandas as pd
import numpy as np
from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames, DatasetInfoRow

# Seed NumPy's global RNG at import time so the random values drawn by
# generate_dummy_data (np.random.rand / np.random.randint) are repeatable
# from one run to the next.
np.random.seed(0)


def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]):
    """Write random placeholder results for ``method`` and return them.

    One row is produced per entry of ``datasets`` with the columns
    ``dataset`` (taken from the entry's ``name``), ``model`` (always
    ``method``) and the three metric columns named by
    ``MetricNames.raw_error``, ``MetricNames.fit_time_per_1K_rows`` and
    ``MetricNames.inference_time_per_1K_rows``.

    Two files are written to ``results/<method>/``, where ``results`` sits
    next to the directory containing this file:

    * ``all_results.csv`` - the generated rows, without the index column.
    * ``config.json`` - the method name and ``model_type`` (stored under the
      ``MethodTypes.col_name`` key).

    Values come from NumPy's global RNG, so the output depends on its state.

    Args:
        method: Method name; also used as the output sub-directory name.
        model_type: Value stored in ``config.json`` under ``MethodTypes.col_name``.
        datasets: Entries to generate one row each for; only ``name`` is read.

    Returns:
        The generated ``pandas.DataFrame``.
    """
    # Per row the draws happen in this order: error in [0, 1), then the two
    # integer timings in [1, 100).
    records = [
        {
            "dataset": info.name,
            "model": method,
            f"{MetricNames.raw_error}": np.random.rand(),
            f"{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
        }
        for info in datasets
    ]
    frame = pd.DataFrame(records)

    out_dir = Path(__file__).parent.parent / "results" / method
    out_dir.mkdir(parents=True, exist_ok=True)
    frame.to_csv(out_dir / "all_results.csv", index=False)

    with open(out_dir / "config.json", "w") as handle:
        config = {
            "model": method,
            MethodTypes.col_name: model_type,
        }
        handle.write(json.dumps(config))

    return frame


def get_model_type(model: str):
    """Return the ``MethodTypes`` family for a model name.

    The name is checked against a fixed, ordered list of case-sensitive
    substrings; the family of the first substring found in ``model`` is
    returned. If none is found, ``MethodTypes.other`` is returned.

    Args:
        model: Model name to classify.

    Returns:
        The matching ``MethodTypes`` member, or ``MethodTypes.other``.
    """
    # Order matters: the first hit wins. "tuned + ensemble" and
    # "FT_TRANSFORMER" can never be reached because the shorter "tuned" and
    # "FT" patterns appear earlier; they map to the same families anyway.
    ordered_patterns = (
        ("tuned", MethodTypes.automl),
        ("tuned + ensemble", MethodTypes.automl),
        ("FT", MethodTypes.finetuned),
        ("AutoGluon", MethodTypes.automl),
        ("Autosklearn2", MethodTypes.automl),
        ("CAT", MethodTypes.tree),
        ("EBM", MethodTypes.other),
        ("FASTAI", MethodTypes.finetuned),
        ("FT_TRANSFORMER", MethodTypes.finetuned),
        ("GBM", MethodTypes.tree),
        ("KNN", MethodTypes.other),
        ("REALMLP", MethodTypes.finetuned),
        ("RF", MethodTypes.tree),
        ("XGB", MethodTypes.tree),
        ("XT", MethodTypes.tree),
    )

    return next(
        (family for needle, family in ordered_patterns if needle in model),
        MethodTypes.other,
    )


if __name__ == "__main__":
    # Convert a local TabRepo results download into the per-model layout
    # under "results/": one sub-directory per model holding all_results.csv
    # and config.json.
    source_dir = Path("~/Downloads/tabrepo_temp_results").expanduser()
    target_dir = Path(__file__).parent.parent / "results"
    target_dir.mkdir(exist_ok=True)

    # Dataset properties are passed through: read, then re-written without
    # the index column.
    pd.read_csv(source_dir / "dataset_properties.csv").to_csv(
        target_dir / "dataset_properties.csv", index=False
    )

    all_models = pd.read_csv(source_dir / "all_results.csv")

    # For now discard tuned and ensemble
    is_tuned = all_models["model"].str.contains("tuned")
    all_models = all_models[~is_tuned]

    # Target header: dataset,model,raw-error,fit-time-per-1K-rows,inference-time-per-1K-rows
    # ("dataset" and "model" already carry their final names).
    column_names = {
        "metric_error": "raw-error",
        # TODO divide by number of rows and multiply by 1K
        "time_train_s": "fit-time-per-1K-rows",
        "time_infer_s": "inference-time-per-1K-rows",
    }

    for model_name in all_models["model"].unique():
        model_dir = target_dir / model_name
        model_dir.mkdir(exist_ok=True)
        model_type = get_model_type(model_name)

        # rename() returns a new frame, so the shared table is left untouched.
        per_model = all_models[all_models["model"] == model_name].rename(
            columns=column_names
        )
        per_model.to_csv(model_dir / "all_results.csv", index=False)

        config = {
            "model": model_name,
            MethodTypes.col_name: model_type,
        }
        (model_dir / "config.json").write_text(json.dumps(config))