import gradio as gr
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

def do_train(n_samples, random_state, min_estimators, max_estimators):
    # Gradio sliders can deliver floats; cast to the integers that
    # `range()` and scikit-learn expect. The parameter order matches the
    # `inputs` lists of the `.change` handlers below.
    n_samples = int(n_samples)
    min_estimators = int(min_estimators)
    max_estimators = int(max_estimators)
    RANDOM_STATE = int(random_state)

    # Generate a binary classification dataset.
    X, y = make_classification(
        n_samples=n_samples,
        n_features=25,
        n_clusters_per_class=1,
        n_informative=15,
        random_state=RANDOM_STATE,
    )

    # NOTE: Setting the `warm_start` construction parameter to `True` disables
    # support for parallelized ensembles but is necessary for tracking the OOB
    # error trajectory during training.
    ensemble_clfs = [
        (
            "RandomForestClassifier, max_features='sqrt'",
            RandomForestClassifier(
                warm_start=True,
                oob_score=True,
                max_features="sqrt",
                random_state=RANDOM_STATE,
            ),
        ),
        (
            "RandomForestClassifier, max_features='log2'",
            RandomForestClassifier(
                warm_start=True,
                max_features="log2",
                oob_score=True,
                random_state=RANDOM_STATE,
            ),
        ),
        (
            "RandomForestClassifier, max_features=None",
            RandomForestClassifier(
                warm_start=True,
                max_features=None,
                oob_score=True,
                random_state=RANDOM_STATE,
            ),
        ),
    ]

    # Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
    error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

    # Explore the slider-selected range of `n_estimators` values in steps of 5.
    # Guard against a maximum below the minimum (the sliders do not enforce it).
    max_estimators = max(max_estimators, min_estimators)

    for label, clf in ensemble_clfs:
        for i in range(min_estimators, max_estimators + 1, 5):
            clf.set_params(n_estimators=i)
            clf.fit(X, y)

            # Record the OOB error for each `n_estimators=i` setting.
            oob_error = 1 - clf.oob_score_
            error_rate[label].append((i, oob_error))

    # Generate the "OOB error rate" vs. "n_estimators" plot.
    fig, ax = plt.subplots()
    for label, clf_err in error_rate.items():
        xs, ys = zip(*clf_err)
        ax.plot(xs, ys, label=label)

    ax.set_xlim(min_estimators, max_estimators)
    ax.set_xlabel("n_estimators")
    ax.set_ylabel("OOB error rate")
    ax.legend(loc="upper right")
    return fig
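
# Illustrative sketch (a hypothetical helper, not called by the app) of what
# `1 - clf.oob_score_` above measures: each sample is scored only by the trees
# whose bootstrap sample left it out, using the aggregated votes stored in the
# scikit-learn attribute `oob_decision_function_` (available when the forest
# is fitted with `oob_score=True`).
def oob_error_from_votes(clf, y):
    """Recompute the OOB error from per-sample out-of-bag votes.

    Assumes `clf` is a fitted RandomForestClassifier with `oob_score=True`
    and enough trees that every sample was out-of-bag at least once
    (otherwise some rows of `oob_decision_function_` are undefined).
    """
    import numpy as np

    votes = clf.oob_decision_function_  # shape: (n_samples, n_classes)
    oob_pred = clf.classes_.take(np.argmax(votes, axis=1))
    return float(np.mean(oob_pred != y))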

model_card = """
## Description
The ``RandomForestClassifier`` is trained using bootstrap aggregation, where each new tree is fit on a bootstrap sample of the training observations $z_i = (x_i, y_i)$.
The out-of-bag (OOB) error for each $z_i$ is the average error computed using predictions from only those trees that do not contain
$z_i$ in their bootstrap sample. This allows the ``RandomForestClassifier`` to be validated while it is being trained.
You can adjust the ``number of samples``, the ``random seed``, and the ``min estimators`` and ``max estimators`` sliders that bound the number of trees.
The example demonstrates how the OOB error can be measured as each new tree is added during training,
and the resulting plot lets a practitioner approximate a suitable value of ``n_estimators`` at which the error stabilizes.
## Dataset
Simulation data
"""
with gr.Blocks() as demo:
    gr.Markdown('''
            <div>
            <h1 style='text-align: center'>Out-of-Bag (OOB) Errors for Random Forests</h1>
            </div>
        ''')
    gr.Markdown(model_card)
    gr.Markdown("Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the example from <a href=\"https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_oob.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-oob-py\">scikit-learn</a>")
    n_samples = gr.Slider(minimum=500, maximum=5000, step=500, value=500, label="Number of samples")
    random_state = gr.Slider(minimum=0, maximum=2000, step=1, value=0, label="Random seed")
    min_estimators = gr.Slider(minimum=5, maximum=300, step=5, value=15, label="Minimum number of trees")
    max_estimators = gr.Slider(minimum=5, maximum=300, step=5, value=150, label="Maximum number of trees")

    with gr.Row():
        with gr.Column():
            plot = gr.Plot()
        
    # Re-train and re-plot whenever any control changes.
    controls = [n_samples, random_state, min_estimators, max_estimators]
    for control in controls:
        control.change(fn=do_train, inputs=controls, outputs=[plot])

demo.launch()