import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt
import torch
TRT_LOGGER = trt.Logger()

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
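    """Allocate page-locked host and device buffers for every engine binding.

    Returns (inputs, outputs, bindings, stream, input_shapes, out_shapes,
    out_names, max_batch_size), where max_batch_size is taken from
    optimization profile 0.
    """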
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    out_shapes = []
    input_shapes = []
    out_names = []
    max_batch_size = engine.get_profile_shape(0, 0)[2][0]
    # max_batch_size = 1
    for binding in engine:
        binding_shape = engine.get_binding_shape(binding)
        # Fix the -1 (dynamic) batch dimension so memory is allocated correctly for batch_size > 1
        if binding_shape[0] == -1:
            binding_shape = (1,) + binding_shape[1:]
        size = trt.volume(binding_shape) * max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
            input_shapes.append(engine.get_binding_shape(binding))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
            # Collect original output shapes and names from the engine
            out_shapes.append(engine.get_binding_shape(binding))
            out_names.append(binding)
    return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

class TrtModel(object):
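    """Thin wrapper around a serialized TensorRT engine with lazy engine loading."""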
    def __init__(self, model):
        self.engine_file = model
        self.engine = None
        self.inputs = None
        self.outputs = None
        self.bindings = None
        self.stream = None
        self.context = None
        self.input_shapes = None
        self.out_shapes = None
        self.max_batch_size = 1

    def build(self):
        with open(self.engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.inputs, self.outputs, self.bindings, self.stream, self.input_shapes, self.out_shapes, self.out_names, self.max_batch_size = allocate_buffers(
            self.engine)
        # print(self.inputs, self.outputs, self.bindings, self.stream, self.input_shapes, self.out_shapes, self.out_names, self.max_batch_size)
        self.context = self.engine.create_execution_context()
        self.context.active_optimization_profile = 0

    def run(self, input, deflatten: bool = False, as_dict: bool = False, use_token_type_ids: bool = False):
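        """Execute the engine on [input_ids, attention_mask] (+ optional token_type_ids).

        Inputs are int32 numpy arrays. With deflatten=True the flat outputs are
        reshaped to their original binding shapes; with as_dict=True a
        {binding_name: output} mapping is returned instead of a list.
        """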
        # Lazily deserialize the engine and allocate buffers on first use
        if self.engine is None:
            self.build()
        batch_size = input[0].shape[0]
        # Copy the first input (input_ids) into its pre-allocated host buffer
        input_ids = np.array(input[0], dtype=np.int32)
        allocate_place = np.prod(input_ids.shape)
        self.inputs[0].host[:allocate_place] = input_ids.flatten(order='C')

        # Copy the second input (attention_mask) into its pre-allocated host buffer
        attention_mask = np.array(input[1], dtype=np.int32)
        allocate_place = np.prod(attention_mask.shape)
        self.inputs[1].host[:allocate_place] = attention_mask.flatten(order='C')

        # Set dynamic binding shapes for the current batch
        self.context.set_binding_shape(0, input_ids.shape)
        self.context.set_binding_shape(1, attention_mask.shape)
     
        # Optional third input (token_type_ids) for models that expect it
        if use_token_type_ids:
            token_type_ids = np.array(input[2], dtype=np.int32)
            allocate_place = np.prod(token_type_ids.shape)
            self.inputs[2].host[:allocate_place] = token_type_ids.flatten(order='C')
            self.context.set_binding_shape(2, token_type_ids.shape)
     
        trt_outputs = do_inference(
            self.context, bindings=self.bindings,
            inputs=self.inputs, outputs=self.outputs, stream=self.stream)
       
        # Reshape flat TRT outputs back to their original engine shapes
        if deflatten:
            out_shapes = [(batch_size,) + out_shape[1:] for out_shape in self.out_shapes]
            trt_outputs = [output[:np.prod(shape)].reshape(shape) for output, shape in zip(trt_outputs, out_shapes)]

        # Drop padded batch entries; optionally return a {binding_name: output} dict
        trt_outputs = [trt_output[:batch_size] for trt_output in trt_outputs]
        if as_dict:
            return {name: trt_outputs[i] for i, name in enumerate(self.out_names)}
        return trt_outputs


def mean_pooling(token_embeddings, attention_mask):
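    """Mean-pool token embeddings over non-padding positions (expects torch tensors)."""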
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Clamp the denominator to avoid division by zero for all-padding rows
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def encode(sentences: list, tokenizer, trt_model: TrtModel, use_token_type_ids=False, max_length=512):
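    """Tokenize sentences, run the TensorRT engine, and return mean-pooled sentence embeddings as a numpy array."""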

    sentences = [sentences] if isinstance(sentences, str) else sentences

    # Tokenize with fixed-length padding so input shapes match the engine bindings
    x = tokenizer(sentences, padding='max_length', truncation=True, max_length=max_length)
    input_ids = np.array(x["input_ids"], dtype=np.int32)
    attention_mask = np.array(x["attention_mask"], dtype=np.int32)

    if use_token_type_ids:
        token_type_ids = np.array(x["token_type_ids"], dtype=np.int32)
        hidden_states = trt_model.run([input_ids, attention_mask, token_type_ids], deflatten=True, use_token_type_ids=True)
    else:
        hidden_states = trt_model.run([input_ids, attention_mask], deflatten=True)

    sentence_embeddings = mean_pooling(torch.from_numpy(hidden_states[0]), torch.from_numpy(attention_mask))
    return sentence_embeddings.numpy()
if __name__ == "__main__":
    from transformers import AutoTokenizer

    # tokenizer = AutoTokenizer.from_pretrained("tensorRT/models/paraphrase-mpnet-base-v2")
    # model = TrtModel("tensorRT/models/paraphrase-mpnet-base-v2.engine")
    # tokenizer = AutoTokenizer.from_pretrained("tensorRT/models/distiluse-base-multilingual-cased-v2")
    # model = TrtModel("tensorRT/models/distiluse-base-multilingual-cased-v2.endgine")

    tokenizer = AutoTokenizer.from_pretrained("tensorRT/models/paraphrase-multilingual-MiniLM-L12-v2")
    model = TrtModel("tensorRT/models/paraphrase-multilingual-MiniLM-L12-v2.engine")
    
    lst_input = ["Pham Minh Chinh is Vietnam's Prime Minister"] * 2
    embeddings = encode(lst_input, tokenizer, model, use_token_type_ids=False)
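    # Quick sanity check (added as a sketch): the pooled embeddings should have
    # shape (batch_size, hidden_dim); hidden_dim depends on the exported model.
    print("Embeddings shape:", embeddings.shape)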