sandz7 committed
Commit 7061b48 · 1 Parent(s): 3e86feb

returned only the llama_generation with output_text and streamer

Files changed (1)
  1. app.py +96 -93
app.py CHANGED
@@ -74,93 +74,92 @@ def gpt_generation(input: str,
     return stream
 
 # Place just input pass and return generation output
-def loki_generation(input_text: str,
-                    history: list,
-                    temperature: float,
-                    max_new_tokens: int,
-                    mode: str):
+def llama_generation(input_text: str,
+                     history: list,
+                     temperature: float,
+                     max_new_tokens: int,
+                     mode: str):
     """
     Pass input texts, tokenize, output and back to text.
     """
-    if mode == "llama":
-        conversation = []
-        for user, assistant in history:
-            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-        conversation.append({"role": "user", "content": input_text})
-
-        input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
-
-        streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-
-        # generation arguments to pass in llm generate() eventually
-        generate_kwargs = dict(
-            input_ids=input_ids,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            temperature=temperature,
-            eos_token_id=terminators[0]
-        )
-
-        # This makes a greedy generation when temperature is passed to 0 (selects the next token sequence generated by model regardless). Selects each token with the highest probability
-        if temperature == 0:
-            generate_kwargs["do_sample"] = False
-
-        # Place the generation in a thread so we can access it.
-        # place the function as target and place the kwargs next as the kwargs
-        thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
-        thread.start()
-
-        # outputs = []
-        # for text in streamer:
-        #     outputs.append(text)
-        #     yield "".join(outputs)
-
-        text = [text for text in streamer]
-        output_text = output_list(text)
-        print("llama mode was on.")
-        return output_text
-
-    else:
-        conversation = []
-        for user, assistant in history:
-            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-        conversation.append({"role": "user", "content": input_text})
-
-        input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
-
-        streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-
-        # generation arguments to pass in llm generate() eventually
-        generate_kwargs = dict(
-            input_ids=input_ids,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            temperature=temperature,
-            eos_token_id=terminators[0]
-        )
-
-        # This makes a greedy generation when temperature is passed to 0 (selects the next token sequence generated by model regardless). Selects each token with the highest probability
-        if temperature == 0:
-            generate_kwargs["do_sample"] = False
-
-        # Place the generation in a thread so we can access it.
-        # place the function as target and place the kwargs next as the kwargs
-        thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
-        thread.start()
-
-        llama_outputs = [text for text in streamer]
-        output_text = output_list(llama_outputs)
-        stream = gpt_generation(input=input_text, llama_output=output_text)
-        print("loki mode was on.")
-        return stream
-        # outputs = []
-        # for chunk in stream:
-        #     if chunk.choices[0].delta.content is not None:
-        #         text = chunk.choices[0].delta.content
-        #         outputs.append(text)
-        #         yield "".join(outputs)
+    conversation = []
+    for user, assistant in history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": input_text})
+
+    input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
+
+    streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+    # generation arguments to pass in llm generate() eventually
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        eos_token_id=terminators[0]
+    )
+
+    # This makes a greedy generation when temperature is passed to 0 (selects the next token sequence generated by model regardless). Selects each token with the highest probability
+    if temperature == 0:
+        generate_kwargs["do_sample"] = False
+
+    # Place the generation in a thread so we can access it.
+    # place the function as target and place the kwargs next as the kwargs
+    thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
+    thread.start()
+
+    # outputs = []
+    # for text in streamer:
+    #     outputs.append(text)
+    #     yield "".join(outputs)
+
+    text = [text for text in streamer]
+    output_text = output_list(text)
+    print("llama mode was on.")
+    return output_text, streamer
+
+    # conversation = []
+    # for user, assistant in history:
+    #     conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    # conversation.append({"role": "user", "content": input_text})
+
+    # input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
+
+    # streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+    # # generation arguments to pass in llm generate() eventually
+    # generate_kwargs = dict(
+    #     input_ids=input_ids,
+    #     streamer=streamer,
+    #     max_new_tokens=max_new_tokens,
+    #     do_sample=True,
+    #     temperature=temperature,
+    #     eos_token_id=terminators[0]
+    # )
+
+    # # This makes a greedy generation when temperature is passed to 0 (selects the next token sequence generated by model regardless). Selects each token with the highest probability
+    # if temperature == 0:
+    #     generate_kwargs["do_sample"] = False
+
+    # # Place the generation in a thread so we can access it.
+    # # place the function as target and place the kwargs next as the kwargs
+    # thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
+    # thread.start()
+
+    # llama_outputs = [text for text in streamer]
+    # output_text = output_list(llama_outputs)
+    # stream = gpt_generation(input=input_text, llama_output=output_text)
+    # print("loki mode was on.")
+    # return stream
+    # outputs = []
+    # for chunk in stream:
+    #     if chunk.choices[0].delta.content is not None:
+    #         text = chunk.choices[0].delta.content
+    #         outputs.append(text)
+    #         yield "".join(outputs)
 
 
 def check_cuda():
@@ -212,11 +211,11 @@ def bot_comms(input_text: str,
         yield "Done. GPT-3.5-turbo is ready for your questions! πŸƒ"
 
     if llm_mode == "switch to llama":
-        streamer = loki_generation(input_text=input_text,
-                                   history=history,
-                                   temperature=temperature,
-                                   max_new_tokens=max_new_tokens,
-                                   mode="llama")
+        output_text, streamer = llama_generation(input_text=input_text,
+                                                 history=history,
+                                                 temperature=temperature,
+                                                 max_new_tokens=max_new_tokens,
+                                                 mode="llama")
         outputs = []
         for text in streamer:
             outputs.append(text)
@@ -247,11 +246,15 @@ def bot_comms(input_text: str,
         yield "".join(outputs)
 
     if llm_mode is None:
-        stream = loki_generation(input_text=input_text,
-                                 history=history,
-                                 temperature=temperature,
-                                 max_new_tokens=max_new_tokens,
-                                 mode="loki")
+        output_text, streamer = llama_generation(input_text=input_text,
+                                                 history=history,
+                                                 temperature=temperature,
+                                                 max_new_tokens=max_new_tokens,
+                                                 mode="loki")
+        stream = gpt_generation(input=input_text,
+                                llama_output=output_text,
+                                mode="gpt-4o")
+
         outputs = []
         print("Loki is activate to answer")
         for text in stream:
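
For reference, the streaming pattern that llama_generation relies on — starting model.generate() on a worker thread and draining a TextIteratorStreamer on the calling thread — can be sketched in isolation as below. This is a minimal, self-contained illustration rather than the Space's actual code: the "gpt2" checkpoint and the generate_streaming helper are placeholder names, and app.py's output_list / gpt_generation helpers are not reproduced.

# Minimal, self-contained sketch of the Thread + TextIteratorStreamer pattern
# used by llama_generation above. The "gpt2" checkpoint is a stand-in for the
# Space's llama_model / llama_tokenizer.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")


def generate_streaming(prompt: str, max_new_tokens: int = 64, temperature: float = 0.7) -> str:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0,  # temperature == 0 falls back to greedy decoding
    )
    if temperature > 0:
        generate_kwargs["temperature"] = temperature
    # generate() blocks until decoding finishes, so it runs on a worker thread
    # while this thread drains the streamer chunk by chunk.
    Thread(target=model.generate, kwargs=generate_kwargs).start()
    chunks = [text for text in streamer]
    return "".join(chunks)


if __name__ == "__main__":
    print(generate_streaming("Streaming generation works by"))

In app.py itself, llama_generation additionally returns the streamer object alongside the joined text, which is what lets bot_comms iterate a streamer in llama mode and pass output_text into gpt_generation in loki mode.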