Spaces:

sandz7
/

loki

Runtime error

App Files Files Community

sandz7 committed on May 25, 2024

Commit

7061b48

1 Parent(s): 3e86feb

returned only the llama_generation with output_text and streamer

Browse files

Files changed (1) hide show

app.py +96 -93

app.py CHANGED Viewed

@@ -74,93 +74,92 @@ def gpt_generation(input: str,
     return stream
 # Pass in the input text and return the generated output
-def loki_generation(input_text: str,
-                    history: list,
-                    temperature: float,
-                    max_new_tokens: int,
-                    mode: str):
     """
     Pass input texts, tokenize, output and back to text.
     """
-    if mode == "llama":
-        conversation = []
-        for user, assistant in history:
-            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-        conversation.append({"role": "user", "content": input_text})
-        input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
-        streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-        # generation arguments to pass in llm generate() eventually
-        generate_kwargs = dict(
-            input_ids=input_ids,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            temperature=temperature,
-            eos_token_id=terminators[0]
-        )
-        # This makes a greedy generation when temperature is passed to 0 (selects the next token sequence generated by model regardless). Selects each token with the highest probability
-        if temperature == 0:
-            generate_kwargs["do_sample"] = False
-        # Place the generation in a thread so we can access it.
-        # place the function as target and place the kwargs next as the kwargs
-        thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
-        thread.start()
-        # outputs = []
-        # for text in streamer:
-        #     outputs.append(text)
-        #     yield "".join(outputs)
-        text = [text for text in streamer]
-        output_text = output_list(text)
-        print("llama mode was on.")
-        return output_text
-    else:
-        conversation = []
-        for user, assistant in history:
-            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
-        conversation.append({"role": "user", "content": input_text})
-        input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
-        streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-        # generation arguments to pass in llm generate() eventually
-        generate_kwargs = dict(
-            input_ids=input_ids,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            temperature=temperature,
-            eos_token_id=terminators[0]
-        )
-        # This makes a greedy generation when temperature is passed to 0 (selects the next token sequence generated by model regardless). Selects each token with the highest probability
-        if temperature == 0:
-            generate_kwargs["do_sample"] = False
-        # Place the generation in a thread so we can access it.
-        # place the function as target and place the kwargs next as the kwargs
-        thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
-        thread.start()
-        llama_outputs = [text for text in streamer]
-        output_text = output_list(llama_outputs)
-        stream = gpt_generation(input=input_text, llama_output=output_text)
-        print("loki mode was on.")
-        return stream
-        # outputs = []
-        # for chunk in stream:
-        #     if chunk.choices[0].delta.content is not None:
-        #         text = chunk.choices[0].delta.content
-        #         outputs.append(text)
-        #         yield "".join(outputs)
 def check_cuda():
@@ -212,11 +211,11 @@ def bot_comms(input_text: str,
         yield "Done. GPT-3.5-turbo is ready for your questions! 🏃"
     if llm_mode == "switch to llama":
-        streamer = loki_generation(input_text=input_text,
-                                   history=history,
-                                   temperature=temperature,
-                                   max_new_tokens=max_new_tokens,
-                                   mode="llama")
         outputs = []
         for text in streamer:
             outputs.append(text)
@@ -247,11 +246,15 @@ def bot_comms(input_text: str,
                 yield "".join(outputs)
     if llm_mode is None:
-        stream = loki_generation(input_text=input_text,
-                                 history=history,
-                                 temperature=temperature,
-                                 max_new_tokens=max_new_tokens,
-                                 mode="loki")
         outputs = []
         print("Loki is activate to answer")
         for text in stream:

     return stream
 # Pass in the input text and return the generated output
+def llama_generation(input_text: str,
+                     history: list,
+                     temperature: float,
+                     max_new_tokens: int,
+                     mode: str):
     """
     Pass input texts, tokenize, output and back to text.
     """
+    conversation = []
+    for user, assistant in history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": input_text})
+    input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
+    streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    # generation arguments to pass in llm generate() eventually
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=temperature,
+        eos_token_id=terminators[0]
+    )
+    # This makes a greedy generation when temperature is passed to 0 (selects the next token sequence generated by model regardless). Selects each token with the highest probability
+    if temperature == 0:
+        generate_kwargs["do_sample"] = False
+    # Place the generation in a thread so we can access it.
+    # place the function as target and place the kwargs next as the kwargs
+    thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
+    thread.start()
+    # outputs = []
+    # for text in streamer:
+    #     outputs.append(text)
+    #     yield "".join(outputs)
+    text = [text for text in streamer]
+    output_text = output_list(text)
+    print("llama mode was on.")
+    return output_text, streamer
+    # conversation = []
+    # for user, assistant in history:
+    #     conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    # conversation.append({"role": "user", "content": input_text})
+    # input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)
+    # streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    # # generation arguments to pass in llm generate() eventually
+    # generate_kwargs = dict(
+    #     input_ids=input_ids,
+    #     streamer=streamer,
+    #     max_new_tokens=max_new_tokens,
+    #     do_sample=True,
+    #     temperature=temperature,
+    #     eos_token_id=terminators[0]
+    # )
+    # # This makes a greedy generation when temperature is passed to 0 (selects the next token sequence generated by model regardless). Selects each token with the highest probability
+    # if temperature == 0:
+    #     generate_kwargs["do_sample"] = False
+    # # Place the generation in a thread so we can access it.
+    # # place the function as target and place the kwargs next as the kwargs
+    # thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
+    # thread.start()
+    # llama_outputs = [text for text in streamer]
+    # output_text = output_list(llama_outputs)
+    # stream = gpt_generation(input=input_text, llama_output=output_text)
+    # print("loki mode was on.")
+    # return stream
+    # outputs = []
+    # for chunk in stream:
+    #     if chunk.choices[0].delta.content is not None:
+    #         text = chunk.choices[0].delta.content
+    #         outputs.append(text)
+    #         yield "".join(outputs)
 def check_cuda():
         yield "Done. GPT-3.5-turbo is ready for your questions! 🏃"
     if llm_mode == "switch to llama":
+        output_text, streamer = llama_generation(input_text=input_text,
+                                                 history=history,
+                                                 temperature=temperature,
+                                                 max_new_tokens=max_new_tokens,
+                                                 mode="llama")
         outputs = []
         for text in streamer:
             outputs.append(text)
                 yield "".join(outputs)
     if llm_mode is None:
+        output_text, streamer = llama_generation(input_text=input_text,
+                                                 history=history,
+                                                 temperature=temperature,
+                                                 max_new_tokens=max_new_tokens,
+                                                 mode="loki")
+        stream = gpt_generation(input=input_text,
+                                llama_output=output_text,
+                                mode="gpt-4o")
         outputs = []
         print("Loki is activate to answer")
         for text in stream: