File size: 4,038 Bytes
35e3254
 
 
 
590064e
 
 
 
 
 
0c661e5
64721de
35e3254
 
 
 
 
 
 
 
 
 
 
 
 
64721de
 
 
 
 
35e3254
 
 
 
 
 
 
 
 
64721de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35e3254
 
 
 
64721de
35e3254
 
64721de
590064e
64721de
590064e
64721de
35e3254
b23ba47
 
 
 
35e3254
 
64721de
35e3254
 
 
 
 
64721de
 
 
 
 
35e3254
 
 
64721de
35e3254
64721de
35e3254
 
 
 
 
 
 
64721de
35e3254
 
 
 
 
64721de
35e3254
 
590064e
 
 
 
 
64721de
35e3254
b23ba47
 
 
 
 
35e3254
64721de
35e3254
64721de
35e3254
64721de
35e3254
 
 
 
 
64721de
35e3254
 
64721de
35e3254
 
 
64721de
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import json

from datasets import load_dataset

from defaults import (
    ADDRESS_BETTERTRANSFORMER,
    ADDRESS_VANILLA,
    HEADERS,
    SPAM_N_REQUESTS,
)
from utils import ElapsedFuturesSession

# GLUE SST-2 validation split, loaded once when this module is imported.
# send_spam draws its random input sentences from this dataset.
data = load_dataset("glue", "sst2", split="validation")

# Report template for a single request. Placeholders {0}..{4} are filled
# positionally by get_message_single; the padding ratio line is a fixed
# "0.0 %" and takes no placeholder.
RETURN_MESSAGE_SINGLE = "\n".join(
    [
        "",
        "Inference statistics:",
        "",
        "* Response status: {0}",
        "* Prediction: {1}",
        "* Inference latency (preprocessing/forward/postprocessing): {2} ms",
        "* Peak GPU memory usage: {3} MB",
        "* End-to-end latency (communication + pre/forward/post): {4} ms",
        "* Padding ratio: 0.0 %",
        "",
    ]
)

# Report template for a burst of requests. The request count is baked in
# when the module is imported; placeholders {0}..{4} are filled positionally
# by get_message_spam.
RETURN_MESSAGE_SPAM = "\n".join(
    [
        "",
        f"Processing {SPAM_N_REQUESTS} inputs sent asynchronously. Grab a coffee.",
        "",
        "Inference statistics:",
        "",
        "* Promise resolution time: {0} ms",
        "* Mean inference latency (preprocessing/forward/postprocessing): {1} ms",
        "* Mean peak GPU memory: {2} MB",
        "* Mean padding ratio: {3} %",
        "* Mean sequence length: {4} tokens",
        "",
    ]
)


def get_message_single(
    status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
    """Render the single-request report.

    The five positional values fill placeholders ``{0}``..``{4}`` of
    ``RETURN_MESSAGE_SINGLE`` in the order they are declared. Any extra
    keyword arguments are accepted and ignored.
    """
    fields = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
    return RETURN_MESSAGE_SINGLE.format(*fields)


def get_message_spam(
    resolution_time,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    **kwargs,
):
    """Render the burst-request report.

    The five positional values fill placeholders ``{0}``..``{4}`` of
    ``RETURN_MESSAGE_SPAM`` in the order they are declared. Any extra
    keyword arguments are accepted and ignored.
    """
    fields = (
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
    return RETURN_MESSAGE_SPAM.format(*fields)


# Session shared by send_single and send_spam. Its post() hands back a
# promise whose .result() yields the response (or raises on failure).
SESSION = ElapsedFuturesSession()


def send_single(input_model_vanilla, address: str):
    """Send one text input to an inference endpoint and format the reply.

    Args:
        input_model_vanilla: Text to send; it is UTF-8 encoded and used as
            the POST body.
        address: Endpoint to query. Must be ``ADDRESS_VANILLA`` or
            ``ADDRESS_BETTERTRANSFORMER``.

    Returns:
        The filled-in ``RETURN_MESSAGE_SINGLE`` report on success. If the
        request fails (timeout, connection error, ...) or the reply cannot
        be decoded, a string describing the error is returned instead of
        raising.

    Raises:
        AssertionError: If ``address`` is not one of the two known endpoints.
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    # should not take more than 10 s, so timeout if that's the case
    promise = SESSION.post(
        address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"), timeout=10
    )

    try:
        response = promise.result()  # resolve ASAP
    except Exception as e:
        return f"{e}"

    status = response.status_code

    # The reply body is read as a JSON array whose first three items are the
    # prediction, the inference latency and the peak GPU memory. An error
    # page (e.g. on a 5xx) is not valid JSON, so report it rather than crash.
    try:
        payload = json.loads(response.text)
        prediction, inf_latency, peak_gpu_memory = payload[:3]
    except (ValueError, TypeError, KeyError) as e:
        return f"Unexpected response from server (status {status}): {e}"

    # NOTE(review): `elapsed` is whatever ElapsedFuturesSession records; the
    # report labels it as milliseconds -- confirm in utils.
    end_to_end_latency = response.elapsed

    return get_message_single(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )


def send_spam(address: str):
    """Send ``SPAM_N_REQUESTS`` sentences in one burst and summarize the run.

    Sentences are drawn at random from the module-level ``data`` dataset.
    Every request is posted through ``SESSION`` before any response is
    awaited.

    Args:
        address: Endpoint to query. Must be ``ADDRESS_VANILLA`` or
            ``ADDRESS_BETTERTRANSFORMER``.

    Returns:
        The filled-in ``RETURN_MESSAGE_SPAM`` report on success. If any
        request fails or any reply cannot be decoded, a string describing
        the first such error is returned instead of raising.

    Raises:
        AssertionError: If ``address`` is unknown or ``SPAM_N_REQUESTS``
            exceeds the number of available sentences.
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
    assert SPAM_N_REQUESTS <= len(data)

    if SPAM_N_REQUESTS <= 0:
        # Nothing to send; also avoids dividing by zero in the means below.
        return get_message_spam(0, 0, 0, "0.00", 0)

    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    # should not take more than 15 s, so timeout if that's the case
    promises = [
        SESSION.post(
            address,
            headers=HEADERS,
            data=row["sentence"].encode("utf-8"),
            timeout=15,
        )
        for row in inp
    ]

    resolution_time = 0
    total_inference_latency = 0
    total_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    sequence_length = 0

    for promise in promises:
        try:
            response = promise.result()  # resolve ASAP
        except Exception as e:
            return f"{e}"

        # The reply body is read as a JSON array; items 1..5 are the
        # inference latency, peak GPU memory, pad count, element count and
        # sequence length. Item 0 is not used here.
        try:
            stats = json.loads(response.text)
            latency, peak_memory, pads, elems, seq_len = stats[1:6]
        except (ValueError, TypeError, KeyError) as e:
            return (
                "Unexpected response from server "
                f"(status {response.status_code}): {e}"
            )

        # The reported resolution time is that of the slowest request.
        resolution_time = max(resolution_time, response.elapsed)

        total_inference_latency += latency
        total_peak_gpu_memory += peak_memory
        n_pads += pads
        n_elems += elems
        sequence_length += seq_len

    # Guard against a server that reports zero elements for every request.
    padding_ratio = n_pads / n_elems * 100 if n_elems else 0.0
    mean_padding_ratio = f"{padding_ratio:.2f}"

    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(total_inference_latency / SPAM_N_REQUESTS, 2)
    mean_peak_gpu_memory = round(total_peak_gpu_memory / SPAM_N_REQUESTS, 2)
    # Rounded like the other means so the report does not show long floats.
    mean_sequence_length = round(sequence_length / SPAM_N_REQUESTS, 2)

    return get_message_spam(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )