Spaces:

sandz7
/

loki

Runtime error

App Files Files Community

sandz7 committed on May 25, 2024

Commit

14cd22f

1 Parent(s): 7061b48

just returned the streamers into bot_comms and just yielded the text

Browse files

Files changed (1) hide show

app.py +16 -96

app.py CHANGED Viewed

@@ -77,8 +77,7 @@ def gpt_generation(input: str,
 def llama_generation(input_text: str,
                      history: list,
                      temperature: float,
-                     max_new_tokens: int,
-                     mode: str):
     """
     Pass input texts, tokenize, output and back to text.
     """
@@ -111,56 +110,7 @@ def llama_generation(input_text: str,
     thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
     thread.start()
-    # outputs = []
-    # for text in streamer:
-    #     outputs.append(text)
-    #     yield "".join(outputs)
-    text = [text for text in streamer]
-    output_text = output_list(text)
-    print("llama mode was on.")
-    return output_text, streamer
-    # conversation = []
-    # for user, assistant in history:
-    #     conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-    # conversation.append({"role": "user", "content": input_text})
-    # input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
-    # streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-    # # generation arguments to pass in llm generate() eventually
-    # generate_kwargs = dict(
-    #     input_ids=input_ids,
-    #     streamer=streamer,
-    #     max_new_tokens=max_new_tokens,
-    #     do_sample=True,
-    #     temperature=temperature,
-    #     eos_token_id=terminators[0]
-    # )
-    # # This makes a greedy generation when temperature is passed to 0 (selects the next token sequence generated by model regardless). Selects each token with the highest probability
-    # if temperature == 0:
-    #     generate_kwargs["do_sample"] = False
-    # # Place the generation in a thread so we can access it.
-    # # place the function as target and place the kwargs next as the kwargs
-    # thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
-    # thread.start()
-    # llama_outputs = [text for text in streamer]
-    # output_text = output_list(llama_outputs)
-    # stream = gpt_generation(input=input_text, llama_output=output_text)
-    # print("loki mode was on.")
-    # return stream
-    # outputs = []
-    # for chunk in stream:
-    #     if chunk.choices[0].delta.content is not None:
-    #         text = chunk.choices[0].delta.content
-    #         outputs.append(text)
-    #         yield "".join(outputs)
 def check_cuda():
     if torch.cuda.is_available():
@@ -171,13 +121,6 @@ def check_cuda():
 first_time = True
 llm_mode = ""
-# # Async generator function
-# async def async_generator():
-#     for i in range(5):
-#         # Simulate an asynchronous operation
-#         await asyncio.sleep(1)
-#         yield i
 def bot_comms(input_text: str,
               history: list,
               temperature: float,
@@ -211,20 +154,12 @@ def bot_comms(input_text: str,
         yield "Done. GPT-3.5-turbo is ready for your questions! 🏃"
     if llm_mode == "switch to llama":
-        output_text, streamer = llama_generation(input_text=input_text,
-                                                 history=history,
-                                                 temperature=temperature,
-                                                 max_new_tokens=max_new_tokens,
-                                                 mode="llama")
-        outputs = []
         for text in streamer:
-            outputs.append(text)
-            yield "".join(outputs)
     if llm_mode == "switch to gpt-4o":
-        stream = gpt_generation(input=input_text,
-                                llama_output="",
-                                mode="gpt-4o")
         outputs = []
         print("gpt-4o only about to answer.")
         for chunk in stream:
@@ -234,9 +169,7 @@ def bot_comms(input_text: str,
                 yield "".join(outputs)
     if llm_mode == "switch to gpt-3.5-turbo":
-        stream = gpt_generation(input=input_text,
-                                llama_output="",
-                                mode="gpt-3.5-turbo")
         outputs = []
         print("gpt-3.5-turbo is about to answer.")
         for chunk in stream:
@@ -245,31 +178,18 @@ def bot_comms(input_text: str,
                 outputs.append(text)
                 yield "".join(outputs)
-    if llm_mode is None:
-        output_text, streamer = llama_generation(input_text=input_text,
-                                                 history=history,
-                                                 temperature=temperature,
-                                                 max_new_tokens=max_new_tokens,
-                                                 mode="loki")
-        stream = gpt_generation(input=input_text,
-                                llama_output=output_text,
-                                mode="gpt-4o")
         outputs = []
-        print("Loki is activate to answer")
-        for text in stream:
-            outputs.append(text)
-            yield "".join(outputs)
-# # Integration in your existing code
-# async def main():
-#     async for value in async_generator():
-#         print(value)
-#         # Add your existing logic here, e.g., call bot_comms with appropriate arguments
-#         await bot_comms(input_text="example", history=[], temperature=0.5, max_new_tokens=128)
-# Rune async function
-# asyncio.run(main())
 chatbot=gr.Chatbot(height=600, label="Loki AI")

 def llama_generation(input_text: str,
                      history: list,
                      temperature: float,
+                     max_new_tokens: int):
     """
     Pass input texts, tokenize, output and back to text.
     """
     thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
     thread.start()
+    return streamer
 def check_cuda():
     if torch.cuda.is_available():
 first_time = True
 llm_mode = ""
 def bot_comms(input_text: str,
               history: list,
               temperature: float,
         yield "Done. GPT-3.5-turbo is ready for your questions! 🏃"
     if llm_mode == "switch to llama":
+        streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
         for text in streamer:
+            yield text
     if llm_mode == "switch to gpt-4o":
+        stream = gpt_generation(input=input_text, llama_output="", mode="gpt-4o")
         outputs = []
         print("gpt-4o only about to answer.")
         for chunk in stream:
                 yield "".join(outputs)
     if llm_mode == "switch to gpt-3.5-turbo":
+        stream = gpt_generation(input=input_text, llama_output="", mode="gpt-3.5-turbo")
         outputs = []
         print("gpt-3.5-turbo is about to answer.")
         for chunk in stream:
                 outputs.append(text)
                 yield "".join(outputs)
+    if llm_mode is None or llm_mode == "":
+        streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
+        output_text = output_list([text for text in streamer])
+        stream = gpt_generation(input=input_text, llama_output=output_text, mode="gpt-4o")
         outputs = []
+        print("Loki is activated to answer")
+        for chunk in stream:
+            if chunk.choices[0].delta.content is not None:
+                text = chunk.choices[0].delta.content
+                outputs.append(text)
+                yield "".join(outputs)
 chatbot=gr.Chatbot(height=600, label="Loki AI")