Spaces:

jchwenger
/

DMLAP.07_language-models_3_chatbots

Sleeping

App Files Files Community

jchwenger commited on Nov 16, 2023

Commit

62100c8

1 Parent(s): 3de15d6

app |

Browse files

Files changed (1) hide show

app.py +102 -0

app.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# Adapted from the Gradio tutorials:
+# https://www.gradio.app/guides/creating-a-chatbot-fast#example-using-a-local-open-source-llm-with-hugging-face
+import gradio as gr
+import torch
+# Get cpu, gpu or mps device for training.
+# See: https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html#creating-models
+device = (
+    "cuda"
+    if torch.cuda.is_available()
+    else "mps"
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+from transformers import AutoTokenizer
+from transformers import AutoModelForCausalLM
+from transformers import StoppingCriteria
+from transformers import StoppingCriteriaList
+from transformers import TextIteratorStreamer
+from threading import Thread
+MODEL_ID = "togethercomputer/RedPajama-INCITE-Chat-3B-v1"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16)
+model = model.to(device) # move model to GPU
+class StopOnTokens(StoppingCriteria):
+    """
+    Class used `stopping_criteria` in `generate_kwargs` that provides an additional
+    way of stopping the generation loop (if this class returns `True` on a token,
+    the generation is stopped)).
+    """
+    # note: Python now supports type hints, see this: https://realpython.com/lessons/type-hinting/
+    #       (for the **kwargs see also: https://realpython.com/python-kwargs-and-args/)
+    # this could also be written: def __call__(self, input_ids, scores, **kwargs):
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        stop_ids = [29, 0] # see the cell below to understand where these come from
+        for stop_id in stop_ids:
+            if input_ids[0][-1] == stop_id:
+                return True
+        return False
+def predict(message, history):
+    history_transformer_format = history + [[message, ""]]
+    stop = StopOnTokens()
+    # useful to debug
+    # msg = "history"
+    # print(msg)
+    # print(*history_transformer_format, sep="\n")
+    # print("***")
+    # at each step, we feed the entire history in string format,
+    # restoring the format used in their dataset with new lines
+    # and <human>: or <bot>: added before the messages
+    messages = "".join(
+        ["".join(
+            ["\n<human>:"+item[0], "\n<bot>:"+item[1]]
+         )
+        for item in history_transformer_format]
+    )
+    # # to see what we feed to our net:
+    # msg = "string prompt"
+    # print(msg)
+    # print("-" * len(msg))
+    # print(messages)
+    # print("-" * 40)
+    # convert the string into tensors & move to GPU
+    model_inputs = tokenizer([messages], return_tensors="pt").to("cuda")
+    streamer = TextIteratorStreamer(tokenizer, timeout=30., skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        model_inputs,
+        streamer=streamer,
+        max_new_tokens=1024,
+        do_sample=True,
+        top_p=0.95,
+        top_k=1000,
+        temperature=1.0,
+        pad_token_id=tokenizer.eos_token_id, # mute annoying warning: https://stackoverflow.com/a/71397707
+        num_beams=1,  # this is for beam search (disabled), see: https://huggingface.co/blog/how-to-generate#beam-search
+        stopping_criteria=StoppingCriteriaList([stop])
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    partial_message  = ""
+    for new_token in streamer:
+        # seen the format <human>: and \n<bot> above (when 'messages' is defined)?
+        # we stream the message *until* we encounter '<', which is by the end
+        if new_token != '<':
+            partial_message += new_token
+            yield partial_message
+gr.ChatInterface(predict).queue().launch(debug=True)