Commit 122bb5c
1 Parent(s): 7b8b167
Limit concurrency

app.py CHANGED
@@ -32,15 +32,15 @@ def extract_assistant_response(generated_text):
     else:
         return generated_text[start_idx:end_idx].strip()
 
-def generate_response_2_7B_instruct(chat_history, max_new_tokens):
+def generate_response(chat_history, max_new_tokens, model, tokenizer):
     sample = []
     for turn in chat_history:
         if turn[0]:
             sample.append({'role': 'user', 'content': turn[0]})
         if turn[1]:
             sample.append({'role': 'assistant', 'content': turn[1]})
-    chat_sample = tokenizer_2_7B_instruct.apply_chat_template(sample, tokenize=False)
-    input_ids = tokenizer_2_7B_instruct(chat_sample, return_tensors='pt', add_special_tokens=False).to(model_2_7B_instruct.device)
+    chat_sample = tokenizer.apply_chat_template(sample, tokenize=False)
+    input_ids = tokenizer(chat_sample, return_tensors='pt', add_special_tokens=False).to(model.device)
 
     max_new_tokens = int(max_new_tokens)
     max_input_length = max_context_length - max_new_tokens
@@ -50,9 +50,9 @@ def generate_response_2_7B_instruct(chat_history, max_new_tokens):
             input_ids['attention_mask'] = input_ids['attention_mask'][:, -max_input_length:]
 
     with torch.no_grad():
-        outputs = model_2_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
+        outputs = model.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
         """
-        outputs = model_2_7B_instruct.generate(
+        outputs = model.generate(
             input_ids=input_ids,
             max_new_tokens=int(max_new_tokens),
             do_sample=True,
@@ -66,50 +66,7 @@ def generate_response_2_7B_instruct(chat_history, max_new_tokens):
             num_return_sequences=1
         )
         """
-    generated_text = tokenizer_2_7B_instruct.decode(outputs[0])
-    assistant_response = extract_assistant_response(generated_text)
-
-    del input_ids
-    del outputs
-    torch.cuda.empty_cache()
-
-    return assistant_response
-
-def generate_response_7B_instruct(chat_history, max_new_tokens):
-    sample = []
-    for turn in chat_history:
-        if turn[0]:
-            sample.append({'role': 'user', 'content': turn[0]})
-        if turn[1]:
-            sample.append({'role': 'assistant', 'content': turn[1]})
-    chat_sample = tokenizer_7B_instruct.apply_chat_template(sample, tokenize=False)
-    input_ids = tokenizer_7B_instruct(chat_sample, return_tensors='pt', add_special_tokens=False).to(model_7B_instruct.device)
-
-    max_new_tokens = int(max_new_tokens)
-    max_input_length = max_context_length - max_new_tokens
-    if input_ids['input_ids'].size(1) > max_input_length:
-        input_ids['input_ids'] = input_ids['input_ids'][:, -max_input_length:]
-        if 'attention_mask' in input_ids:
-            input_ids['attention_mask'] = input_ids['attention_mask'][:, -max_input_length:]
-
-    with torch.no_grad():
-        outputs = model_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
-        """
-        outputs = model_7B_instruct.generate(
-            input_ids=input_ids,
-            max_new_tokens=int(max_new_tokens),
-            do_sample=True,
-            use_cache=True,
-            temperature=temperature,
-            top_k=int(top_k),
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            num_beams=int(num_beams),
-            length_penalty=length_penalty,
-            num_return_sequences=1
-        )
-        """
-    generated_text = tokenizer_7B_instruct.decode(outputs[0])
+    generated_text = tokenizer.decode(outputs[0])
     assistant_response = extract_assistant_response(generated_text)
 
     del input_ids
@@ -141,7 +98,7 @@ with gr.Blocks() as demo:
         return gr.update(value=""), chat_history, chat_history
 
     def bot_response_2_7B_instruct(chat_history, max_new_tokens):
-        response = generate_response_2_7B_instruct(chat_history, max_new_tokens)
+        response = generate_response(chat_history, max_new_tokens, model_2_7B_instruct, tokenizer_2_7B_instruct)
         chat_history[-1][1] = response
        return chat_history, chat_history
 
@@ -178,7 +135,7 @@ with gr.Blocks() as demo:
         return gr.update(value=""), chat_history, chat_history
 
     def bot_response_7B_instruct(chat_history, max_new_tokens):
-        response = generate_response_7B_instruct(chat_history, max_new_tokens)
+        response = generate_response(chat_history, max_new_tokens, model_7B_instruct, tokenizer_7B_instruct)
         chat_history[-1][1] = response
         return chat_history, chat_history
 
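Note: the hunks above only show the two model-specific generation helpers being collapsed into one shared generate_response(chat_history, max_new_tokens, model, tokenizer); the concurrency cap named in the commit title is not visible in this excerpt. As a hedged sketch only (the exact lines and values are assumptions, not part of this diff), a Gradio 3.x Blocks app like this one typically limits how many generation calls run at once through its request queue (newer Gradio releases expose the same knob as default_concurrency_limit):

    # Sketch, not taken from this commit: capping concurrent generation requests.
    # `demo` is the gr.Blocks() object already defined in app.py (see the hunk
    # headers above); the numbers below are illustrative assumptions.
    demo.queue(
        concurrency_count=1,   # at most one model.generate() call runs at a time
        max_size=16,           # up to 16 further requests wait in the queue
    )
    demo.launch()

Because both chat tabs are now routed through the same generate_response helper, a single queue setting like this bounds how many generate() calls run at once across the 2.7B and 7B endpoints together.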