Spaces:
Running
Running
File size: 4,038 Bytes
35e3254 590064e 0c661e5 64721de 35e3254 64721de 35e3254 64721de 35e3254 64721de 35e3254 64721de 590064e 64721de 590064e 64721de 35e3254 b23ba47 35e3254 64721de 35e3254 64721de 35e3254 64721de 35e3254 64721de 35e3254 64721de 35e3254 64721de 35e3254 590064e 64721de 35e3254 b23ba47 35e3254 64721de 35e3254 64721de 35e3254 64721de 35e3254 64721de 35e3254 64721de 35e3254 64721de |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
import json
from datasets import load_dataset
from defaults import (
ADDRESS_BETTERTRANSFORMER,
ADDRESS_VANILLA,
HEADERS,
SPAM_N_REQUESTS,
)
from utils import ElapsedFuturesSession
# SST-2 validation split; send_spam() samples its request payloads from
# these sentences.
data = load_dataset("glue", "sst2", split="validation")
# str.format() template rendered by get_message_single(); positional slots are
# (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency).
# Padding ratio is hard-coded to 0.0 % for a single request.
RETURN_MESSAGE_SINGLE = """
Inference statistics:
* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""
# str.format() template rendered by get_message_spam(); positional slots are
# (resolution_time, mean_inference_latency, mean_peak_gpu_memory,
# mean_padding_ratio, mean_sequence_length). SPAM_N_REQUESTS is baked in at
# import time via the f-string; the {{n}} braces are escaped so they survive
# as {n} placeholders for the later .format() call. This replaces the
# original's three-way string concatenation with one f-string, producing the
# byte-identical template.
RETURN_MESSAGE_SPAM = f"""
Processing {SPAM_N_REQUESTS} inputs sent asynchronously. Grab a coffee.
Inference statistics:
* Promise resolution time: {{0}} ms
* Mean inference latency (preprocessing/forward/postprocessing): {{1}} ms
* Mean peak GPU memory: {{2}} MB
* Mean padding ratio: {{3}} %
* Mean sequence length: {{4}} tokens
"""
def get_message_single(
    status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
    """Render the single-request statistics message.

    Fills RETURN_MESSAGE_SINGLE's positional placeholders; extra keyword
    arguments are accepted and ignored.
    """
    fields = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
    return RETURN_MESSAGE_SINGLE.format(*fields)
def get_message_spam(
    resolution_time,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    **kwargs,
):
    """Render the spam-benchmark statistics message.

    Fills RETURN_MESSAGE_SPAM's positional placeholders; extra keyword
    arguments are accepted and ignored.
    """
    stats = (
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
    return RETURN_MESSAGE_SPAM.format(*stats)
# Single shared async session used by send_single() and send_spam();
# presumably attaches elapsed-time info to responses (see utils) — its
# .elapsed attribute is read below.
SESSION = ElapsedFuturesSession()
def send_single(input_model_vanilla, address: str):
    """POST one UTF-8 encoded input to *address* and return a formatted report.

    The server is expected to reply with a JSON list
    [prediction, inference_latency, peak_gpu_memory]; on any request failure
    the exception text is returned instead of a report.
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    # should not take more than 10 s, so timeout if that's the case
    promise = SESSION.post(
        address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"), timeout=10
    )

    try:
        response = promise.result()  # resolve ASAP
    except Exception as e:
        return f"{e}"

    payload = json.loads(response.text)
    return get_message_single(
        response.status_code,
        payload[0],  # prediction
        payload[1],  # inference latency
        payload[2],  # peak GPU memory
        response.elapsed,  # end-to-end latency measured client-side
    )
def send_spam(address: str):
    """Send SPAM_N_REQUESTS asynchronous requests to *address* and aggregate stats.

    Samples SPAM_N_REQUESTS sentences from the SST-2 validation split, posts
    them all concurrently, then resolves each future and averages the
    server-reported metrics. The server is expected to reply with a JSON list
    [prediction, inference_latency, peak_gpu_memory, n_pads, n_elems,
    sequence_length]. On any request failure the exception text is returned
    instead of a report.
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
    # data = "this is positive lol" #TODO: use dynamic data with padding
    assert SPAM_N_REQUESTS <= len(data)

    # Random subset of validation sentences for this round.
    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    resolution_time = 0
    mean_inference_latency = 0
    mean_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    sequence_length = 0

    # Fire all requests first so they run concurrently.
    promises = []
    for i in range(SPAM_N_REQUESTS):
        input_data = inp[i]["sentence"].encode("utf-8")
        # should not take more than 15 s, so timeout if that's the case
        promises.append(
            SESSION.post(address, headers=HEADERS, data=input_data, timeout=15)
        )

    for promise in promises:
        try:
            response = promise.result()  # resolve ASAP
        except Exception as e:
            return f"{e}"
        # Fix: the original resolved the future a second time here with an
        # unguarded promise.result(); one guarded resolution is sufficient.
        response_text = json.loads(response.text)
        # Promise resolution time is the slowest request of the batch.
        resolution_time = max(resolution_time, response.elapsed)
        mean_inference_latency += response_text[1]
        mean_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]

    mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
    mean_sequence_length = sequence_length / SPAM_N_REQUESTS
    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(mean_inference_latency / SPAM_N_REQUESTS, 2)
    mean_peak_gpu_memory = round(mean_peak_gpu_memory / SPAM_N_REQUESTS, 2)

    return get_message_spam(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
|