import gradio as gr
import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

def compare(number1, number2):
    # Keep the "Maximum number of trees" slider at or above the "Minimum number of trees" slider.
    if number1 > number2:
        number2 = number1
    return number2

def do_train(n_samples, random_state, min_estimators, max_estimators):
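    """Fit three RandomForestClassifier variants and plot their OOB error rate vs. number of trees."""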
    RANDOM_STATE = random_state

    # Generate a binary classification dataset.
    X, y = make_classification(
        n_samples=n_samples,
        n_features=25,
        n_clusters_per_class=1,
        n_informative=15,
        random_state=RANDOM_STATE,
    )

    # NOTE: Setting the `warm_start` construction parameter to `True` disables
    # support for parallelized ensembles but is necessary for tracking the OOB
    # error trajectory during training.
    ensemble_clfs = [
        (
            "RandomForestClassifier, max_features='sqrt'",
            RandomForestClassifier(
                warm_start=True,
                oob_score=True,
                max_features="sqrt",
                random_state=RANDOM_STATE,
            ),
        ),
        (
            "RandomForestClassifier, max_features='log2'",
            RandomForestClassifier(
                warm_start=True,
                max_features="log2",
                oob_score=True,
                random_state=RANDOM_STATE,
            ),
        ),
        (
            "RandomForestClassifier, max_features=None",
            RandomForestClassifier(
                warm_start=True,
                max_features=None,
                oob_score=True,
                random_state=RANDOM_STATE,
            ),
        ),
    ]

    # Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
    error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

    # Range of `n_estimators` values to explore, taken from the sliders.
    min_estimators = int(min_estimators)
    max_estimators = int(max_estimators)

    for label, clf in ensemble_clfs:
        for i in range(min_estimators, max_estimators + 1, 5):
            clf.set_params(n_estimators=i)
            clf.fit(X, y)

            # Record the OOB error for each `n_estimators=i` setting.
            oob_error = 1 - clf.oob_score_
            error_rate[label].append((i, oob_error))

    # Generate the "OOB error rate" vs. "n_estimators" plot.
    fig, ax = plt.subplots()
    for label, clf_err in error_rate.items():
        xs, ys = zip(*clf_err)
        ax.plot(xs, ys, label=label)

    ax.set_xlim(min_estimators, max_estimators)
    ax.set_xlabel("n_estimators")
    ax.set_ylabel("OOB error rate")
    ax.legend(loc="upper right")
    return fig

model_card = f"""
## Description
The ``RandomForestClassifier`` is trained using bootstrap aggregation, where each new tree is fit from a bootstrap sample of the training observations $z_i = (x_i, y_i)$.
The out-of-bag (OOB) error is the average error for each $z_i$ calculated using predictions from the trees that do not contain
$z_i$ in their respective bootstrap sample. This allows the ``RandomForestClassifier`` to be fit and validated whilst being trained.
You can play around with the ``number of samples``, the ``random seed``, and the ``min estimators``/``max estimators`` sliders, which control the number of trees.
The example demonstrates how the OOB error can be measured at the addition of each new tree during training.
The resulting plot allows a practitioner to approximate a suitable value of ``n_estimators`` at which the error stabilizes.
## Dataset
Simulation data
"""
with gr.Blocks() as demo:
    gr.Markdown('''
            <div>
            <h1 style='text-align: center'>Out-of-Bag (OOB) Errors for Random Forests</h1>
            </div>
        ''')
    gr.Markdown(model_card)
    gr.Markdown("Author: <a href=\"https://huggingface.co/bharat-raghunathan\">Bharat Raghunathan</a>. Based on the example from <a href=\"https://scikit-learn.org/stable/auto_examples/ensemble/plot_ensemble_oob.html#sphx-glr-auto-examples-ensemble-plot-ensemble-oob-py\">scikit-learn</a>")
    n_samples = gr.Slider(minimum=500, maximum=5000, step=500, value=500, label="Number of samples")
    random_state = gr.Slider(minimum=0, maximum=2000, step=1, value=0, label="Random seed")
    min_estimators = gr.Slider(minimum=5, maximum=300, step=5, value=15, label="Minimum number of trees")
    max_estimators = gr.Slider(minimum=5, maximum=300, step=5, value=150, label="Maximum number of trees")

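    # If the minimum-trees slider is raised above the current maximum, raise the maximum to match.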
    min_estimators.change(compare, [min_estimators, max_estimators], max_estimators)
    with gr.Row():
        with gr.Column():
            plot = gr.Plot()

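    # Re-train and redraw the plot whenever any control changes.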
    n_samples.change(fn=do_train, inputs=[n_samples, random_state, min_estimators, max_estimators], outputs=[plot])
    random_state.change(fn=do_train, inputs=[n_samples, random_state, min_estimators, max_estimators], outputs=[plot])
    min_estimators.change(fn=do_train, inputs=[n_samples, random_state, min_estimators, max_estimators], outputs=[plot])
    max_estimators.change(fn=do_train, inputs=[n_samples, random_state, min_estimators, max_estimators], outputs=[plot])

demo.queue().launch()