IST199655 committed
Commit 3d5b038 · 1 Parent(s): b6079ea
Files changed (2):
  1. app.py +82 -71
  2. requirements.txt +3 -1
app.py CHANGED
@@ -4,34 +4,83 @@ from huggingface_hub import InferenceClient
 """
 Copied from inference in colab notebook
 """
-# import torch
-
-# # Monkey-patch to avoid CUDA initialization issues
-# torch.cuda.get_device_capability = lambda *args, **kwargs: (0, 0)
-
-# from unsloth.chat_templates import get_chat_template
-# from unsloth import FastLanguageModel
-
-# # IMPORTING MODEL AND TOKENIZER ————————
-
-# max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
-# dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
-# load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
-
-# model, tokenizer = FastLanguageModel.from_pretrained(
-#     model_name = "llama_lora_model_1",
-#     max_seq_length = max_seq_length,
-#     dtype = dtype,
-#     load_in_4bit = load_in_4bit,
-# )
-
-# tokenizer = get_chat_template(
-#     tokenizer,
-#     chat_template = "llama-3.1",
-# )
-# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
-
-# # RUNNING INFERENCE ————————————————————————
-
+
+from transformers import LlamaForCausalLM, LlamaTokenizer
+import torch
+
+# Load model and tokenizer globally to avoid reloading for every request
+model_path = "llama_lora_model_1"
+
+# Load tokenizer
+tokenizer = LlamaTokenizer.from_pretrained(model_path)
+
+# Load model
+model = LlamaForCausalLM.from_pretrained(
+    model_path,
+    torch_dtype=torch.float32,  # Adjust based on your environment
+    device_map="cpu"  # Use CPU for inference
+)
+
+# Define the response function
+def respond(
+    message: str,
+    history: list[tuple[str, str]],
+    system_message: str,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+):
+    # Combine system message and history into a single prompt
+    messages = [{"role": "system", "content": system_message}]
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+    messages.append({"role": "user", "content": message})
+
+    # Create a single text prompt from the messages
+    prompt = ""
+    for msg in messages:
+        if msg["role"] == "system":
+            prompt += f"[System]: {msg['content']}\n\n"
+        elif msg["role"] == "user":
+            prompt += f"[User]: {msg['content']}\n\n"
+        elif msg["role"] == "assistant":
+            prompt += f"[Assistant]: {msg['content']}\n\n"
+
+    # Tokenize the prompt
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
+    input_ids = inputs.input_ids.to("cpu")  # Ensure input is on the CPU
+
+    # Generate response
+    output_ids = model.generate(
+        input_ids,
+        max_length=input_ids.shape[1] + max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True,
+    )
+
+    # Decode the generated text
+    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+    # Extract the assistant's response from the generated text
+    assistant_response = generated_text[len(prompt):].strip()
+
+    # Yield responses incrementally (simulate streaming)
+    response = ""
+    for token in assistant_response.split():  # Split tokens by whitespace
+        response += token + " "
+        yield response.strip()
+
+
+"""
+For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
+"""
+# client = InferenceClient(model="https://huggingface.co/Heit39/llama_lora_model_1")
+
+
+
 # def respond(
 #     message,
@@ -51,57 +100,19 @@ Copied from inference in colab notebook
 
 #     messages.append({"role": "user", "content": message})
 
-#     inputs = tokenizer.apply_chat_template(
-#         messages,
-#         tokenize = True,
-#         add_generation_prompt = True, # Must add for generation
-#         return_tensors = "pt",
-#     )
-
-#     outputs = model.generate(input_ids = inputs, max_new_tokens = max_tokens, use_cache = True,
-#                              temperature = 1.5, min_p = 0.1)
-#     response = tokenizer.batch_decode(outputs)
-
-#     yield response
+#     response = ""
 
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient(model="https://huggingface.co/Heit39/llama_lora_model_1")
-
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
+#     for message in client.chat_completion(
+#         messages,
+#         max_tokens=max_tokens,
+#         stream=True,
+#         temperature=temperature,
+#         top_p=top_p,
+#     ):
+#         token = message.choices[0].delta.content
+
+#         response += token
+#         yield response
 
 
 """
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 huggingface_hub==0.25.2
 
-unsloth
+unsloth
+transformers
+accelerate
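requirements.txt does not list gradio; on Hugging Face Spaces the Gradio runtime is typically supplied by the Space's sdk setting rather than pip. Assuming the rest of app.py follows the stock Gradio ChatInterface chatbot template (that part is outside this diff), the bottom of the file would wire the respond() generator up roughly as below; the widget defaults and labels are illustrative, not taken from the repo.

```python
import gradio as gr

# respond is the generator defined earlier in app.py; its extra parameters are
# filled from the additional_inputs widgets, in the order listed here.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```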