Commit 122bb5c
1 Parent(s): 7b8b167
Limit concurrency

app.py CHANGED
@@ -32,15 +32,15 @@ def extract_assistant_response(generated_text):
     else:
         return generated_text[start_idx:end_idx].strip()
 
-def generate_response_2_7B_instruct(chat_history, max_new_tokens):
+def generate_response(chat_history, max_new_tokens, model, tokenizer):
     sample = []
     for turn in chat_history:
         if turn[0]:
             sample.append({'role': 'user', 'content': turn[0]})
         if turn[1]:
             sample.append({'role': 'assistant', 'content': turn[1]})
-    chat_sample = tokenizer_2_7B_instruct.apply_chat_template(sample, tokenize=False)
-    input_ids = tokenizer_2_7B_instruct(chat_sample, return_tensors='pt', add_special_tokens=False).to(model_2_7B_instruct.device)
+    chat_sample = tokenizer.apply_chat_template(sample, tokenize=False)
+    input_ids = tokenizer(chat_sample, return_tensors='pt', add_special_tokens=False).to(model.device)
 
     max_new_tokens = int(max_new_tokens)
     max_input_length = max_context_length - max_new_tokens
@@ -50,9 +50,9 @@ def generate_response_2_7B_instruct(chat_history, max_new_tokens):
             input_ids['attention_mask'] = input_ids['attention_mask'][:, -max_input_length:]
 
     with torch.no_grad():
-        outputs = model_2_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
+        outputs = model.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
         """
-        outputs = model_2_7B_instruct.generate(
+        outputs = model.generate(
             input_ids=input_ids,
             max_new_tokens=int(max_new_tokens),
             do_sample=True,
@@ -66,50 +66,7 @@ def generate_response_2_7B_instruct(chat_history, max_new_tokens):
             num_return_sequences=1
         )
         """
-    generated_text = tokenizer_2_7B_instruct.decode(outputs[0])
-    assistant_response = extract_assistant_response(generated_text)
-
-    del input_ids
-    del outputs
-    torch.cuda.empty_cache()
-
-    return assistant_response
-
-def generate_response_7B_instruct(chat_history, max_new_tokens):
-    sample = []
-    for turn in chat_history:
-        if turn[0]:
-            sample.append({'role': 'user', 'content': turn[0]})
-        if turn[1]:
-            sample.append({'role': 'assistant', 'content': turn[1]})
-    chat_sample = tokenizer_7B_instruct.apply_chat_template(sample, tokenize=False)
-    input_ids = tokenizer_7B_instruct(chat_sample, return_tensors='pt', add_special_tokens=False).to(model_7B_instruct.device)
-
-    max_new_tokens = int(max_new_tokens)
-    max_input_length = max_context_length - max_new_tokens
-    if input_ids['input_ids'].size(1) > max_input_length:
-        input_ids['input_ids'] = input_ids['input_ids'][:, -max_input_length:]
-        if 'attention_mask' in input_ids:
-            input_ids['attention_mask'] = input_ids['attention_mask'][:, -max_input_length:]
-
-    with torch.no_grad():
-        outputs = model_7B_instruct.generate(**input_ids, max_new_tokens=int(max_new_tokens), return_dict_in_generate=False, output_scores=False, use_cache=True, num_beams=1, do_sample=False)
-        """
-        outputs = model_7B_instruct.generate(
-            input_ids=input_ids,
-            max_new_tokens=int(max_new_tokens),
-            do_sample=True,
-            use_cache=True,
-            temperature=temperature,
-            top_k=int(top_k),
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            num_beams=int(num_beams),
-            length_penalty=length_penalty,
-            num_return_sequences=1
-        )
-        """
-    generated_text = tokenizer_7B_instruct.decode(outputs[0])
+    generated_text = tokenizer.decode(outputs[0])
     assistant_response = extract_assistant_response(generated_text)
 
     del input_ids
@@ -141,7 +98,7 @@ with gr.Blocks() as demo:
         return gr.update(value=""), chat_history, chat_history
 
     def bot_response_2_7B_instruct(chat_history, max_new_tokens):
-        response = generate_response_2_7B_instruct(chat_history, max_new_tokens)
+        response = generate_response(chat_history, max_new_tokens, model_2_7B_instruct, tokenizer_2_7B_instruct)
         chat_history[-1][1] = response
        return chat_history, chat_history
 
@@ -178,7 +135,7 @@ with gr.Blocks() as demo:
         return gr.update(value=""), chat_history, chat_history
 
     def bot_response_7B_instruct(chat_history, max_new_tokens):
-        response = generate_response_7B_instruct(chat_history, max_new_tokens)
+        response = generate_response(chat_history, max_new_tokens, model_7B_instruct, tokenizer_7B_instruct)
         chat_history[-1][1] = response
         return chat_history, chat_history
 
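Note: the hunks above only show the two model-specific generation helpers being collapsed into one shared generate_response(chat_history, max_new_tokens, model, tokenizer); the concurrency cap named in the commit title is not visible in this excerpt. As a hedged sketch only (the exact lines and values are assumptions, not part of this diff), a Gradio 3.x Blocks app like this one typically limits how many generation calls run at once through its request queue (newer Gradio releases expose the same knob as default_concurrency_limit):

    # Sketch, not taken from this commit: capping concurrent generation requests.
    # `demo` is the gr.Blocks() object already defined in app.py (see the hunk
    # headers above); the numbers below are illustrative assumptions.
    demo.queue(
        concurrency_count=1,   # at most one model.generate() call runs at a time
        max_size=16,           # up to 16 further requests wait in the queue
    )
    demo.launch()

Because both chat tabs are now routed through the same generate_response helper, a single queue setting like this bounds how many generate() calls run at once across the 2.7B and 7B endpoints together.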