File size: 2,936 Bytes
d19e46b
 
 
 
 
091c806
d19e46b
 
 
 
 
 
 
 
 
 
 
 
 
091c806
 
 
d19e46b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from dataclasses import dataclass
import json
from pathlib import Path
import pandas as pd
import numpy as np
from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames, DatasetInfoRow

# Seed NumPy's global random generator so the np.random draws made by this
# script produce the same dummy values on every run.
np.random.seed(0)


def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]):
    """Fabricate random results for one method, persist them, and return them.

    One row is produced per entry of ``datasets``. Each row holds the columns
    ``dataset`` and ``model`` plus the three metric columns named by
    ``MetricNames.raw_error``, ``MetricNames.fit_time_per_1K_rows`` and
    ``MetricNames.inference_time_per_1K_rows``.

    The table is written to ``results/<method>/all_results.csv`` (``results``
    sits two levels above this file), and a ``config.json`` recording the
    method name and its type is written next to it.

    Args:
        method: Method name; stored in the ``model`` column and used as the
            name of the output directory.
        model_type: Value stored in ``config.json`` under
            ``MethodTypes.col_name``.
        datasets: Datasets to generate a row for; only ``name`` is read.

    Returns:
        The generated ``pandas.DataFrame``.
    """
    # Per row the draws happen in a fixed order: one uniform float for the
    # error, then two integers from [1, 100) for the timings.
    records = [
        {
            "dataset": info.name,
            "model": method,
            f"{MetricNames.raw_error}": np.random.rand(),
            f"{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
        }
        for info in datasets
    ]
    frame = pd.DataFrame(records)

    # Every artefact of this method lives in its own directory.
    output_dir = Path(__file__).parent.parent / "results" / method
    output_dir.mkdir(parents=True, exist_ok=True)
    frame.to_csv(output_dir / "all_results.csv", index=False)

    with open(output_dir / "config.json", "w") as handle:
        config = {
            "model": method,
            MethodTypes.col_name: model_type,
        }
        handle.write(json.dumps(config))

    return frame


if __name__ == "__main__":
    # Toy datasets for which placeholder results are generated.
    datasets = [
        DatasetInfoRow(name="airline", problem_type=ProblemTypes.regression, num_features=12, num_rows=10),
        DatasetInfoRow(name="electricity", problem_type=ProblemTypes.classification, num_features=2, num_rows=1020),
        DatasetInfoRow(name="solar-energy", problem_type=ProblemTypes.multi_classification, num_features=3, num_rows=100),
        DatasetInfoRow(name="traffic", problem_type=ProblemTypes.multi_classification, num_features=12, num_rows=10000),
        DatasetInfoRow(name="volcano", problem_type=ProblemTypes.regression, num_features=12, num_rows=100),
    ]

    # (method name, method type) pairs; each pair gets its own results folder.
    methods = [
        ("AutoGluon (best)", MethodTypes.automl),
        ("CatBoost", MethodTypes.boosted_tree),
        ("TabPFN", MethodTypes.foundational),
        ("TabPFN-v2", MethodTypes.foundational),
        ("KNN", MethodTypes.other),
    ]
    for method_name, method_type in methods:
        generate_dummy_data(method=method_name, datasets=datasets, model_type=method_type)

    # One row per dataset: its name, problem type, row count and feature count.
    row_datasets = [
        {
            DatasetInfo.col_name: info.name,
            ProblemTypes.col_name: info.problem_type,
            DatasetInfo.num_rows: info.num_rows,
            DatasetInfo.num_features: info.num_features,
        }
        for info in datasets
    ]

    properties_path = Path(__file__).parent.parent / "results" / "dataset_properties.csv"
    pd.DataFrame(row_datasets).to_csv(properties_path, index=False)