import json

import numpy as np
from transformers import AutoTokenizer

from defaults import (
    ADDRESS_BETTERTRANSFORMER,
    ADDRESS_VANILLA,
    HEADERS,
    MODEL_NAME,
)
from utils import ElapsedFuturesSession

RETURN_MESSAGE_SINGLE = """
Inference statistics:
* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""

RETURN_MESSAGE_SPAM = (
    """
Processing """
    # "NUMBER REQ" is a literal placeholder here; the caller is presumably
    # expected to substitute the actual request count before display.
    + "NUMBER REQ"
    + """ inputs sent asynchronously. Grab a coffee.

Inference statistics:
* Throughput: {0} samples/s
* Mean inference latency (preprocessing/forward/postprocessing): {1} ms
* Mean peak GPU memory: {2} MB
* Mean padding ratio: {3} %
* Mean sequence length: {4} tokens
* Effective mean batch size: {5}
"""
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def get_message_single(
    status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
    return RETURN_MESSAGE_SINGLE.format(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )


def get_message_spam(
    throughput,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    effective_batch_size,
    **kwargs,
):
    return RETURN_MESSAGE_SPAM.format(
        throughput,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
        effective_batch_size,
    )


SESSION = ElapsedFuturesSession()
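
# `ElapsedFuturesSession` (from utils) is assumed to follow the usual
# requests-futures timing recipe: a FuturesSession whose response hook
# overwrites `response.elapsed` with the request's wall-clock time in
# milliseconds, which is what the latency/throughput math below relies on.
# A minimal sketch of such a class, as an assumption rather than the actual
# utils implementation:
#
#     import time
#     from requests_futures.sessions import FuturesSession
#
#     class ElapsedFuturesSession(FuturesSession):
#         def request(self, *args, **kwargs):
#             start = time.monotonic()
#
#             def timing(response, *a, **kw):
#                 # Replace requests' timedelta with a float in ms.
#                 response.elapsed = (time.monotonic() - start) * 1000
#
#             kwargs.setdefault("hooks", {})["response"] = timing
#             return super().request(*args, **kwargs)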


def send_single(input_model_vanilla, address: str):
    """Send one synchronous classification request and format its statistics."""
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    # A single request should not take more than 10 s, so time out in that case.
    promise = SESSION.post(
        address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"), timeout=10
    )

    try:
        response = promise.result()  # resolve ASAP
    except Exception as e:
        return f"{e}"

    status = response.status_code
    response_text = json.loads(response.text)

    prediction = response_text[0]
    inf_latency = response_text[1]
    peak_gpu_memory = response_text[2]
    end_to_end_latency = response.elapsed

    return get_message_single(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )
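

# The server is assumed to reply with a JSON-encoded list; based on the
# indexing in send_single above and send_spam below, the expected layout is:
#     [prediction, inference_latency_ms, peak_gpu_memory_mb,
#      n_padding_tokens, n_total_tokens, sequence_length, batch_size]
# (send_single only consumes the first three fields).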


def send_spam(inp, address: str):
    """Send many asynchronous requests at once and format aggregate statistics."""
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    # data = "this is positive lol" #TODO: use dynamic data with padding
    max_resolution_time = 0
    mean_inference_latency = 0
    mean_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    sequence_length = 0
    effective_batch_size = 0

    promises = []
    n_inputs = len(inp)

    # Fire off all requests asynchronously so the server can batch them.
    for i in range(n_inputs):
        input_data = inp[i]["sentence"].encode("utf-8")

        # No request should take more than 15 s, so time out in that case.
        promises.append(
            SESSION.post(address, headers=HEADERS, data=input_data, timeout=15)
        )

    for promise in promises:
        try:
            response = promise.result()  # resolve ASAP
        except Exception as e:
            return f"{e}"

        response_text = json.loads(response.text)

        # The slowest request bounds the end-to-end resolution time.
        max_resolution_time = max(max_resolution_time, response.elapsed)
        mean_inference_latency += response_text[1]
        mean_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]
        effective_batch_size += response_text[6]

    # `response.elapsed` is in ms, hence the 1e-3 factor to get samples/s.
    throughput = n_inputs / (max_resolution_time * 1e-3)

    mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
    mean_sequence_length = sequence_length / n_inputs
    effective_batch_size = effective_batch_size / n_inputs

    throughput = round(throughput, 2)
    mean_inference_latency = round(mean_inference_latency / n_inputs, 2)
    mean_peak_gpu_memory = round(mean_peak_gpu_memory / n_inputs, 2)

    return get_message_spam(
        throughput,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
        effective_batch_size,
    )
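

if __name__ == "__main__":
    # Minimal smoke test, assuming a server from this Space is already
    # listening at ADDRESS_VANILLA; the inputs below are illustrative only.
    print(send_single("This movie was really good!", ADDRESS_VANILLA))
    print(send_spam([{"sentence": "This movie was really good!"}] * 8, ADDRESS_VANILLA))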