gabrielclark3330 committed on
Commit 122bb5c · Parent(s): 7b8b167

Limit concurrency

Files changed (1)
  1. app.py +8 -51
app.py CHANGED
@@ -32,15 +32,15 @@ def extract_assistant_response(generated_text):
     else:
         return generated_text[start_idx:end_idx].strip()
 
-def generate_response_2_7B_instruct(chat_history, max_new_tokens):
+def generate_response(chat_history, max_new_tokens, model, tokenizer):
     sample = []
     for turn in chat_history:
         if turn[0]:
             sample.append({'role': 'user', 'content': turn[0]})
         if turn[1]:
             sample.append({'role': 'assistant', 'content': turn[1]})
-    chat_sample = tokenizer_2_7B_instruct.apply_chat_template(sample, tokenize=False)
-    input_ids = tokenizer_2_7B_instruct(chat_sample, return_tensors='pt', add_special_tokens=False).to(model_2_7B_instruct.device)
+    chat_sample = tokenizer.apply_chat_template(sample, tokenize=False)
+    input_ids = tokenizer(chat_sample, return_tensors='pt', add_special_tokens=False).to(model.device)
 
     max_new_tokens = int(max_new_tokens)
     max_input_length = max_context_length - max_new_tokens
@@ -50,9 +50,9 @@ def generate_response_2_7B_instruct(chat_history, max_new_tokens):
             input_ids['attention_mask'] = input_ids['attention_mask'][:, -max_input_length:]
 
     with torch.no_grad():
-        outputs = model_2_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
+        outputs = model.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
     """
-    outputs = model_2_7B_instruct.generate(
+    outputs = model.generate(
         input_ids=input_ids,
         max_new_tokens=int(max_new_tokens),
         do_sample=True,
@@ -66,50 +66,7 @@ def generate_response_2_7B_instruct(chat_history, max_new_tokens):
         num_return_sequences=1
     )
     """
-    generated_text = tokenizer_2_7B_instruct.decode(outputs[0])
-    assistant_response = extract_assistant_response(generated_text)
-
-    del input_ids
-    del outputs
-    torch.cuda.empty_cache()
-
-    return assistant_response
-
-def generate_response_7B_instruct(chat_history, max_new_tokens):
-    sample = []
-    for turn in chat_history:
-        if turn[0]:
-            sample.append({'role': 'user', 'content': turn[0]})
-        if turn[1]:
-            sample.append({'role': 'assistant', 'content': turn[1]})
-    chat_sample = tokenizer_7B_instruct.apply_chat_template(sample, tokenize=False)
-    input_ids = tokenizer_7B_instruct(chat_sample, return_tensors='pt', add_special_tokens=False).to(model_7B_instruct.device)
-
-    max_new_tokens = int(max_new_tokens)
-    max_input_length = max_context_length - max_new_tokens
-    if input_ids['input_ids'].size(1) > max_input_length:
-        input_ids['input_ids'] = input_ids['input_ids'][:, -max_input_length:]
-        if 'attention_mask' in input_ids:
-            input_ids['attention_mask'] = input_ids['attention_mask'][:, -max_input_length:]
-
-    with torch.no_grad():
-        outputs = model_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
-    """
-    outputs = model_7B_instruct.generate(
-        input_ids=input_ids,
-        max_new_tokens=int(max_new_tokens),
-        do_sample=True,
-        use_cache=True,
-        temperature=temperature,
-        top_k=int(top_k),
-        top_p=top_p,
-        repetition_penalty=repetition_penalty,
-        num_beams=int(num_beams),
-        length_penalty=length_penalty,
-        num_return_sequences=1
-    )
-    """
-    generated_text = tokenizer_7B_instruct.decode(outputs[0])
+    generated_text = tokenizer.decode(outputs[0])
     assistant_response = extract_assistant_response(generated_text)
 
     del input_ids
@@ -141,7 +98,7 @@ with gr.Blocks() as demo:
         return gr.update(value=""), chat_history, chat_history
 
     def bot_response_2_7B_instruct(chat_history, max_new_tokens):
-        response = generate_response_2_7B_instruct(chat_history, max_new_tokens)
+        response = generate_response(chat_history, max_new_tokens, model_2_7B_instruct, tokenizer_2_7B_instruct)
         chat_history[-1][1] = response
         return chat_history, chat_history
 
@@ -178,7 +135,7 @@ with gr.Blocks() as demo:
         return gr.update(value=""), chat_history, chat_history
 
    def bot_response_7B_instruct(chat_history, max_new_tokens):
-        response = generate_response_7B_instruct(chat_history, max_new_tokens)
+        response = generate_response(chat_history, max_new_tokens, model_7B_instruct, tokenizer_7B_instruct)
        chat_history[-1][1] = response
        return chat_history, chat_history
 
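Pieced together from the hunks above, the consolidated helper reads roughly as follows after this commit. This is a sketch, not the verbatim file: context lines the diff does not show are assumed unchanged, and max_context_length, torch, extract_assistant_response, and the model/tokenizer globals are defined elsewhere in app.py.

def generate_response(chat_history, max_new_tokens, model, tokenizer):
    # Rebuild the conversation in the role/content format expected by the chat template.
    sample = []
    for turn in chat_history:
        if turn[0]:
            sample.append({'role': 'user', 'content': turn[0]})
        if turn[1]:
            sample.append({'role': 'assistant', 'content': turn[1]})
    chat_sample = tokenizer.apply_chat_template(sample, tokenize=False)
    input_ids = tokenizer(chat_sample, return_tensors='pt', add_special_tokens=False).to(model.device)

    # Truncate from the left so prompt plus generation fit in the context window.
    max_new_tokens = int(max_new_tokens)
    max_input_length = max_context_length - max_new_tokens
    if input_ids['input_ids'].size(1) > max_input_length:
        input_ids['input_ids'] = input_ids['input_ids'][:, -max_input_length:]
        if 'attention_mask' in input_ids:
            input_ids['attention_mask'] = input_ids['attention_mask'][:, -max_input_length:]

    # Greedy decoding; the sampling variant stays commented out in the source.
    with torch.no_grad():
        outputs = model.generate(**input_ids, max_new_tokens=max_new_tokens,
                                 return_dict_in_generate=False, output_scores=False,
                                 use_cache=True, num_beams=1, do_sample=False)
    generated_text = tokenizer.decode(outputs[0])
    assistant_response = extract_assistant_response(generated_text)

    # Release the prompt and output tensors so the shared GPU does not accumulate memory.
    del input_ids
    del outputs
    torch.cuda.empty_cache()

    return assistant_response

Both Gradio callbacks now route through this one function, each passing its own model and tokenizer, e.g. generate_response(chat_history, max_new_tokens, model_7B_instruct, tokenizer_7B_instruct). The concurrency limit named in the commit message is not visible in these hunks; one plausible shape (hypothetical, not shown in the diff) is a module-level lock so only one request runs model.generate on the GPU at a time:

import threading

generation_lock = threading.Lock()  # hypothetical: serialize access to the shared GPU

def bot_response_7B_instruct(chat_history, max_new_tokens):
    # Only one chat request generates at a time; others wait on the lock.
    with generation_lock:
        response = generate_response(chat_history, max_new_tokens, model_7B_instruct, tokenizer_7B_instruct)
    chat_history[-1][1] = response
    return chat_history, chat_history

Gradio's built-in request queue (demo.queue()) is another common way to cap how many generate calls run at once.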