IST199655 committed
Commit 9213095 · Parent: ef4866e
Files changed (1):
  1. app.py (+70 -18)
app.py CHANGED
@@ -5,8 +5,9 @@ from huggingface_hub import InferenceClient
 Copied from inference in colab notebook
 """
 
-from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 import torch
+from threading import Thread
 
 # Load model and tokenizer globally to avoid reloading for every request
 model_path = "Heit39/llama_lora_model_1"
@@ -23,6 +24,58 @@ model = PeftModel.from_pretrained(base_model, model_path)
 
 
 # Define the response function
+# def respond(
+#     message: str,
+#     history: list[tuple[str, str]],
+#     system_message: str,
+#     max_tokens: int,
+#     temperature: float,
+#     top_p: float,
+# ):
+#     # Combine system message and history into a single prompt
+#     messages = [{"role": "system", "content": system_message}]
+#     for val in history:
+#         if val[0]:
+#             messages.append({"role": "user", "content": val[0]})
+#         if val[1]:
+#             messages.append({"role": "assistant", "content": val[1]})
+#     messages.append({"role": "user", "content": message})
+
+#     # Create a single text prompt from the messages
+#     prompt = ""
+#     for msg in messages:
+#         if msg["role"] == "system":
+#             prompt += f"[System]: {msg['content']}\n\n"
+#         elif msg["role"] == "user":
+#             prompt += f"[User]: {msg['content']}\n\n"
+#         elif msg["role"] == "assistant":
+#             prompt += f"[Assistant]: {msg['content']}\n\n"
+
+#     # Tokenize the prompt
+#     inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
+#     input_ids = inputs.input_ids.to("cpu")  # Ensure input is on the CPU
+
+#     # Generate response
+#     output_ids = model.generate(
+#         input_ids,
+#         max_length=input_ids.shape[1] + max_tokens,
+#         temperature=temperature,
+#         top_p=top_p,
+#         do_sample=True,
+#     )
+
+#     # Decode the generated text
+#     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+#     # Extract the assistant's response from the generated text
+#     assistant_response = generated_text[len(prompt):].strip()
+
+#     # Yield responses incrementally (simulate streaming)
+#     response = ""
+#     for token in assistant_response.split():  # Split tokens by whitespace
+#         response += token + " "
+#         yield response.strip()
+
 def respond(
     message: str,
     history: list[tuple[str, str]],
@@ -54,27 +107,26 @@ def respond(
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
     input_ids = inputs.input_ids.to("cpu")  # Ensure input is on the CPU
 
-    # Generate response
-    output_ids = model.generate(
-        input_ids,
-        max_length=input_ids.shape[1] + max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        do_sample=True,
-    )
-
-    # Decode the generated text
-    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    # Generate tokens incrementally
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+    generation_kwargs = {
+        "input_ids": input_ids,
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": True,
+        "streamer": streamer,
+    }
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
 
-    # Extract the assistant's response from the generated text
-    assistant_response = generated_text[len(prompt):].strip()
-
-    # Yield responses incrementally (simulate streaming)
+    # Yield responses as they are generated
     response = ""
-    for token in assistant_response.split():  # Split tokens by whitespace
-        response += token + " "
+    for token in streamer:
+        response += token
         yield response.strip()
 
+
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
 