Spaces:
Building
Building
File size: 2,936 Bytes
d19e46b 091c806 d19e46b 091c806 d19e46b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
from dataclasses import dataclass
import json
from pathlib import Path
import pandas as pd
import numpy as np
from src.constants import MethodTypes, ProblemTypes, DatasetInfo, MetricNames, DatasetInfoRow
# Seed NumPy's global RNG so every run of this script draws the same values.
np.random.seed(0)
def generate_dummy_data(method: str, model_type: str, datasets: list[DatasetInfoRow]) -> pd.DataFrame:
    """Write random placeholder results for one method and return them.

    One row is produced per dataset with these columns: ``dataset``,
    ``model``, the raw-error metric, the fit time per 1K rows and the
    inference time per 1K rows (column names come from ``MetricNames``).
    The raw error is drawn with ``np.random.rand()``; both timings are drawn
    with ``np.random.randint(1, 100)``.

    Two files are written under
    ``Path(__file__).parent.parent / "results" / method``:

    * ``all_results.csv`` -- the generated rows, without the index column.
    * ``config.json`` -- the method name and its ``model_type``.

    Args:
        method: Method name; stored in the ``model`` column and used as the
            name of the output directory.
        model_type: Value stored in ``config.json`` under
            ``MethodTypes.col_name``.
        datasets: Datasets to generate a row for; only ``name`` is read.

    Returns:
        The DataFrame that was written to ``all_results.csv``.
    """
    rows = []
    for dataset in datasets:
        # Keep the draw order (error, fit time, inference time) stable so the
        # seeded output does not change between versions of this script.
        rows.append({
            "dataset": dataset.name,
            "model": method,
            f"{MetricNames.raw_error}": np.random.rand(),
            f"{MetricNames.fit_time_per_1K_rows}": np.random.randint(1, 100),
            f"{MetricNames.inference_time_per_1K_rows}": np.random.randint(1, 100),
        })
    df = pd.DataFrame(rows)

    result_path = Path(__file__).parent.parent / "results" / method
    result_path.mkdir(parents=True, exist_ok=True)
    df.to_csv(result_path / "all_results.csv", index=False)

    config = {
        "model": method,
        MethodTypes.col_name: model_type,
    }
    # Explicit encoding so the file does not depend on the platform locale.
    with open(result_path / "config.json", "w", encoding="utf-8") as f:
        json.dump(config, f)
    return df
if __name__ == "__main__":
    # (name, problem type, number of features, number of rows) per toy dataset.
    dataset_specs = [
        ("airline", ProblemTypes.regression, 12, 10),
        ("electricity", ProblemTypes.classification, 2, 1020),
        ("solar-energy", ProblemTypes.multi_classification, 3, 100),
        ("traffic", ProblemTypes.multi_classification, 12, 10000),
        ("volcano", ProblemTypes.regression, 12, 100),
    ]
    datasets = [
        DatasetInfoRow(name=name, problem_type=problem_type, num_features=num_features, num_rows=num_rows)
        for name, problem_type, num_features, num_rows in dataset_specs
    ]

    # (method name, method type) pairs to generate dummy results for.
    methods = [
        ("AutoGluon (best)", MethodTypes.automl),
        ("CatBoost", MethodTypes.boosted_tree),
        ("TabPFN", MethodTypes.foundational),
        ("TabPFN-v2", MethodTypes.foundational),
        ("KNN", MethodTypes.other),
    ]
    for method_name, method_kind in methods:
        generate_dummy_data(method=method_name, datasets=datasets, model_type=method_kind)

    # One row per dataset: name, problem type, number of rows, number of features.
    dataset_rows = [
        {
            DatasetInfo.col_name: info.name,
            ProblemTypes.col_name: info.problem_type,
            DatasetInfo.num_rows: info.num_rows,
            DatasetInfo.num_features: info.num_features,
        }
        for info in datasets
    ]
    results_dir = Path(__file__).parent.parent / "results"
    properties_df = pd.DataFrame(dataset_rows)
    properties_df.to_csv(results_dir / "dataset_properties.csv", index=False)