sandz7 committed on
Commit 76a6e88 · 1 Parent(s): 10a4f12

added terminators, generation parameters, and a thread with a streamer to finalize generation; also added slider controls to the UI
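The core of this change is running model.generate() on a worker thread with a TextIteratorStreamer, so decoded text can be read incrementally instead of waiting for the full completion. The sketch below is a minimal, self-contained illustration of that pattern, assembled from the pieces this commit adds (the model name, terminators, chat-template call, and generation arguments come from the diff); it is an illustration of the technique under those assumptions, not a copy of app.py, and the example prompt is only adapted from the app's example list.

import torch
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B", torch_dtype=torch.float16
).to("cuda")

# Stop on the regular EOS token or on Llama 3's end-of-turn token
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# A single-turn "conversation" rendered through the tokenizer's chat template
conversation = [{"role": "user", "content": "How was Saturn formed, in 3 sentences?"}]
input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)

# skip_prompt=True keeps the prompt itself out of the streamed text
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so it runs on a background thread
# while the caller drains the streamer
thread = Thread(
    target=model.generate,
    kwargs=dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.95,
        eos_token_id=terminators,
    ),
)
thread.start()

chunks = []
for text in streamer:  # yields decoded text as tokens are produced
    chunks.append(text)
print("".join(chunks))

In a Gradio handler, the same loop could yield the accumulated text on each iteration to stream tokens into the chatbot, rather than joining everything only at the end.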

Files changed (1)
  1. app.py +60 -21
app.py CHANGED
@@ -3,10 +3,11 @@ import pandas as pd
 import numpy as np
 import gradio as gr
 import re
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 import re
 from huggingface_hub import login
 import os
+from threading import Thread
 
 # HF_TOKEN
 TOKEN = os.getenv('HF_AUTH_TOKEN')
@@ -26,33 +27,53 @@ DESCRIPTION = '''
 # Place transformers in hardware to prepare for process and generation
 llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
 llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", token=TOKEN, torch_dtype=torch.float16).to('cuda')
+terminators = [
+    llama_tokenizer.eos_token_id,
+    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
+]
 
 # Place just input pass and return generation output
 def llama_generation(input_text: str,
-                     history):
+                     history: list,
+                     temperature: float,
+                     max_new_tokens: int):
     """
     Pass input texts, tokenize, output and back to text.
     """
 
-    # Header prompt
-    header = '''A conversation between a curious human and an AI assistant called Amphisbeana.
-    The assistant helps the user by giving accurate and complete responses, it is free to make suggestions.\n\n'''
-
-    input_ids = llama_tokenizer.encode(header + input_text,
-                                       return_tensors='pt').to('cuda')
-
-    # llama generation looks for the numeric vectors not the tensors so there is no need for **input_ids rather just input_ids
-    output_ids = llama_model.generate(input_ids=input_ids,
-                                      max_new_tokens=256,
-                                      temperature=0.5,
-                                      top_p=0.8,
-                                      repetition_penalty=2.0)
-
-    # Decode
-    output_text = llama_tokenizer.decode(output_ids[0],
-                                         skip_special_tokens=True)
-
-    return output_text
+    conversation = []
+    for user, assistant in history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": input_text})
+
+    input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
+
+    # skip_prompt=True keeps the prompt out of the text streamed back to the chatbot
+    streamer = TextIteratorStreamer(llama_tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    # Generation arguments to pass to the model's generate() call
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        eos_token_id=terminators
+    )
+
+    # A temperature of 0 means greedy decoding: always select the highest-probability next token
+    if temperature == 0:
+        generate_kwargs["do_sample"] = False
+
+    # Run generate() in a background thread so the UI stays responsive while the model is generating;
+    # the generate function is the thread's target and generate_kwargs are its kwargs
+    thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+    return "".join(outputs)
 
 # Let's just make sure the llama is returning as it should and then place that return output into a function making it fit into a base
 # Prompt for gpt-4o
@@ -65,6 +86,24 @@ with gr.Blocks(fill_height=True) as demo:
         fn=llama_generation,
         chatbot=chatbot,
         fill_height=True,
+        # These additional inputs map to the extra parameters of llama_generation, so the UI can adjust them
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+        additional_inputs=[
+            # Slider that lets users adjust the model's temperature
+            gr.Slider(minimum=0,
+                      maximum=1,
+                      step=0.1,
+                      value=0.95,
+                      label="Temperature",
+                      render=False),
+            # Slider for the maximum number of new tokens to generate
+            gr.Slider(minimum=128,
+                      maximum=1500,
+                      step=1,
+                      value=512,
+                      label="Max new tokens",
+                      render=False),
+        ],
         examples=["Make a poem of batman inside willy wonka",
                   "How can you a burrito with just flour?",
                   "How was saturn formed in 3 sentences",