PayPeer committed on
Commit
ff224fc
·
verified ·
1 Parent(s): 1c0ae7f

🫦 BitNet on CPU (Native 1-bit LLM)

Browse files
Files changed (1) hide show
  1. app.py +30 -22
app.py CHANGED
@@ -1,29 +1,29 @@
 
 
 
1
  import gradio as gr
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
4
 
5
- # use the official bitnet package to supply the missing code
6
- from bitnet.configuration_bitnet import BitNetConfig
7
- from bitnet.modeling_bitnet import BitNetForCausalLM
8
- from bitnet.tokenization_bitnet import BitNetTokenizer
9
 
10
- # Singleton for model and tokenizer
11
- _model = None
12
- _tokenizer = None
13
 
14
  def load_model():
15
- global _model, _tokenizer
16
- if _model is None or _tokenizer is None:
17
  model_id = "microsoft/bitnet-b1.58-2B-4T"
18
- # load tokenizer, config, and model from the bitnet pip package
19
- _tokenizer = BitNetTokenizer.from_pretrained(model_id)
20
- config = BitNetConfig.from_pretrained(model_id)
21
- _model = BitNetForCausalLM.from_pretrained(
22
  model_id,
23
  config=config,
24
  torch_dtype=torch.bfloat16
25
  )
26
- return _model, _tokenizer
 
 
27
 
28
  def manage_history(history):
29
  # Limit to 3 turns (each turn is user + assistant = 2 messages)
@@ -39,6 +39,8 @@ def manage_history(history):
39
 
40
  return history
41
 
 
 
42
  def generate_response(user_input, system_prompt, max_new_tokens, temperature, top_p, top_k, history):
43
  model, tokenizer = load_model()
44
 
@@ -50,7 +52,7 @@ def generate_response(user_input, system_prompt, max_new_tokens, temperature, to
50
  prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
51
  chat_input = tokenizer(prompt, return_tensors="pt").to(model.device)
52
 
53
- # Generate response
54
  chat_outputs = model.generate(
55
  **chat_input,
56
  max_new_tokens=max_new_tokens,
@@ -60,19 +62,20 @@ def generate_response(user_input, system_prompt, max_new_tokens, temperature, to
60
  do_sample=True
61
  )
62
 
63
- # Decode response
64
  response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True)
65
 
66
- # Update history
67
  history.append({"role": "user", "content": user_input})
68
  history.append({"role": "assistant", "content": response})
69
 
70
- # Manage history limits
71
  history = manage_history(history)
72
 
73
  return history, history
74
 
75
- # Gradio interface
 
76
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
77
  gr.Markdown("# BitNet b1.58 2B4T Demo")
78
 
@@ -80,7 +83,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
80
  with gr.Column():
81
  gr.Markdown("""
82
  ## About BitNet b1.58 2B4T
83
- BitNet b1.58 2B4T is the first open-source, native 1-bit Large Language Model with 2 billion parameters, developed by Microsoft Research. Trained on 4 trillion tokens, it matches the performance of full-precision models while offering significant efficiency gains in memory, energy, and latency. Features include:
 
 
84
  - Transformer-based architecture with BitLinear layers
85
  - Native 1.58-bit weights and 8-bit activations
86
  - Maximum context length of 4096 tokens
@@ -90,7 +95,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
90
  with gr.Column():
91
  gr.Markdown("""
92
  ## About Tonic AI
93
- Tonic AI is a vibrant community of AI enthusiasts and developers always building cool demos and pushing the boundaries of what's possible with AI. We're passionate about creating innovative, accessible, and engaging AI experiences for everyone. Join us in exploring the future of AI!
 
 
94
  """)
95
 
96
  with gr.Row():
@@ -152,8 +159,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
152
  ],
153
  outputs=[chatbot, chat_history]
154
  )
 
 
155
 
156
  if __name__ == "__main__":
157
- # Preload model to avoid threading issues
158
  load_model()
159
  demo.launch(ssr_mode=False, share=True)
 
1
+ # 🤖⚡ ▄▀ [ I M P O R T S ]
2
+
3
+ import accelerate
4
  import gradio as gr
5
  import torch
6
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
7
 
8
+ # 🧠🔧 ▄▀ [ M O D E L ]
 
 
 
9
 
10
+ microsoft_model = None
11
+ microsoft_tokenizer = None
 
12
 
13
def load_model():
    """Return the shared (model, tokenizer) pair, loading them on first use.

    The BitNet checkpoint and tokenizer are cached in the module-level
    singletons ``microsoft_model`` / ``microsoft_tokenizer`` so repeated
    calls skip the expensive hub download and weight load.
    """
    global microsoft_model, microsoft_tokenizer
    # Fast path: both singletons already populated — nothing to load.
    if microsoft_model is not None and microsoft_tokenizer is not None:
        return microsoft_model, microsoft_tokenizer

    model_id = "microsoft/bitnet-b1.58-2B-4T"
    microsoft_tokenizer = AutoTokenizer.from_pretrained(model_id)
    config = AutoConfig.from_pretrained(model_id)
    # bfloat16 keeps memory low while matching the checkpoint's dtype.
    microsoft_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        config=config,
        torch_dtype=torch.bfloat16,
    )
    return microsoft_model, microsoft_tokenizer
25
+
26
+ # πŸ—‚οΈπŸ•°οΈ β–„β–€ [ C O N V E R S A T I O N - H I S T O R Y ]
27
 
28
  def manage_history(history):
29
  # Limit to 3 turns (each turn is user + assistant = 2 messages)
 
39
 
40
  return history
41
 
42
+ # 💬✨ ▄▀ [ G E N E R A T E - R E S P O N S E ]
43
+
44
  def generate_response(user_input, system_prompt, max_new_tokens, temperature, top_p, top_k, history):
45
  model, tokenizer = load_model()
46
 
 
52
  prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
53
  chat_input = tokenizer(prompt, return_tensors="pt").to(model.device)
54
 
55
+ # Generate Response
56
  chat_outputs = model.generate(
57
  **chat_input,
58
  max_new_tokens=max_new_tokens,
 
62
  do_sample=True
63
  )
64
 
65
+ # Decode Response
66
  response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True)
67
 
68
+ # Update History
69
  history.append({"role": "user", "content": user_input})
70
  history.append({"role": "assistant", "content": response})
71
 
72
+ # Manage History Limits
73
  history = manage_history(history)
74
 
75
  return history, history
76
 
77
+ # πŸŽ›οΈπŸ–₯️ β–„β–€ [ G R A D I O - I N T E R F A C E ]
78
+
79
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
80
  gr.Markdown("# BitNet b1.58 2B4T Demo")
81
 
 
83
  with gr.Column():
84
  gr.Markdown("""
85
  ## About BitNet b1.58 2B4T
86
+ BitNet b1.58 2B4T is the first open-source, native 1-bit Large Language Model with 2 billion parameters,
87
+ developed by Microsoft Research. Trained on 4 trillion tokens, it matches the performance of full-precision
88
+ models while offering significant efficiency gains in memory, energy, and latency. Features include:
89
  - Transformer-based architecture with BitLinear layers
90
  - Native 1.58-bit weights and 8-bit activations
91
  - Maximum context length of 4096 tokens
 
95
  with gr.Column():
96
  gr.Markdown("""
97
  ## About Tonic AI
98
+ Tonic AI is a vibrant community of AI enthusiasts and developers always building cool demos and pushing
99
+ the boundaries of what's possible with AI. We're passionate about creating innovative, accessible, and
100
+ engaging AI experiences for everyone. Join us in exploring the future of AI!
101
  """)
102
 
103
  with gr.Row():
 
159
  ],
160
  outputs=[chatbot, chat_history]
161
  )
162
+
163
+ # 🚀 ▄▀ [ M A I N ]
164
 
165
  if __name__ == "__main__":
 
166
  load_model()
167
  demo.launch(ssr_mode=False, share=True)