PayPeer committed on
Commit
ff224fc
·
verified ·
1 Parent(s): 1c0ae7f

🫦 BitNet on CPU (Native 1-bit LLM)

Browse files
Files changed (1) hide show
  1. app.py +30 -22
app.py CHANGED
@@ -1,29 +1,29 @@
 
 
 
1
  import gradio as gr
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
4
 
5
- # use the official bitnet package to supply the missing code
6
- from bitnet.configuration_bitnet import BitNetConfig
7
- from bitnet.modeling_bitnet import BitNetForCausalLM
8
- from bitnet.tokenization_bitnet import BitNetTokenizer
9
 
10
- # Singleton for model and tokenizer
11
- _model = None
12
- _tokenizer = None
13
 
14
  def load_model():
15
- global _model, _tokenizer
16
- if _model is None or _tokenizer is None:
17
  model_id = "microsoft/bitnet-b1.58-2B-4T"
18
- # load tokenizer, config, and model from the bitnet pip package
19
- _tokenizer = BitNetTokenizer.from_pretrained(model_id)
20
- config = BitNetConfig.from_pretrained(model_id)
21
- _model = BitNetForCausalLM.from_pretrained(
22
  model_id,
23
  config=config,
24
  torch_dtype=torch.bfloat16
25
  )
26
- return _model, _tokenizer
 
 
27
 
28
  def manage_history(history):
29
  # Limit to 3 turns (each turn is user + assistant = 2 messages)
@@ -39,6 +39,8 @@ def manage_history(history):
39
 
40
  return history
41
 
 
 
42
  def generate_response(user_input, system_prompt, max_new_tokens, temperature, top_p, top_k, history):
43
  model, tokenizer = load_model()
44
 
@@ -50,7 +52,7 @@ def generate_response(user_input, system_prompt, max_new_tokens, temperature, to
50
  prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
51
  chat_input = tokenizer(prompt, return_tensors="pt").to(model.device)
52
 
53
- # Generate response
54
  chat_outputs = model.generate(
55
  **chat_input,
56
  max_new_tokens=max_new_tokens,
@@ -60,19 +62,20 @@ def generate_response(user_input, system_prompt, max_new_tokens, temperature, to
60
  do_sample=True
61
  )
62
 
63
- # Decode response
64
  response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True)
65
 
66
- # Update history
67
  history.append({"role": "user", "content": user_input})
68
  history.append({"role": "assistant", "content": response})
69
 
70
- # Manage history limits
71
  history = manage_history(history)
72
 
73
  return history, history
74
 
75
- # Gradio interface
 
76
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
77
  gr.Markdown("# BitNet b1.58 2B4T Demo")
78
 
@@ -80,7 +83,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
80
  with gr.Column():
81
  gr.Markdown("""
82
  ## About BitNet b1.58 2B4T
83
- BitNet b1.58 2B4T is the first open-source, native 1-bit Large Language Model with 2 billion parameters, developed by Microsoft Research. Trained on 4 trillion tokens, it matches the performance of full-precision models while offering significant efficiency gains in memory, energy, and latency. Features include:
 
 
84
  - Transformer-based architecture with BitLinear layers
85
  - Native 1.58-bit weights and 8-bit activations
86
  - Maximum context length of 4096 tokens
@@ -90,7 +95,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
90
  with gr.Column():
91
  gr.Markdown("""
92
  ## About Tonic AI
93
- Tonic AI is a vibrant community of AI enthusiasts and developers always building cool demos and pushing the boundaries of what's possible with AI. We're passionate about creating innovative, accessible, and engaging AI experiences for everyone. Join us in exploring the future of AI!
 
 
94
  """)
95
 
96
  with gr.Row():
@@ -152,8 +159,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
152
  ],
153
  outputs=[chatbot, chat_history]
154
  )
 
 
155
 
156
  if __name__ == "__main__":
157
- # Preload model to avoid threading issues
158
  load_model()
159
  demo.launch(ssr_mode=False, share=True)
 
1
+ # 🤖⚡ ▄▀ [ I M P O R T S ]
2
+
3
+ import accelerate
4
  import gradio as gr
5
  import torch
6
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
7
 
8
+ # 🧠🔧 ▄▀ [ M O D E L ]
 
 
 
9
 
10
+ microsoft_model = None
11
+ microsoft_tokenizer = None
 
12
 
13
def load_model():
    """Return the shared (model, tokenizer) pair, loading them on first use.

    The BitNet checkpoint and tokenizer are cached in the module-level
    singletons ``microsoft_model`` / ``microsoft_tokenizer`` so repeated
    calls skip the expensive hub download and weight load.
    """
    global microsoft_model, microsoft_tokenizer
    # Fast path: both singletons already populated — nothing to load.
    if microsoft_model is not None and microsoft_tokenizer is not None:
        return microsoft_model, microsoft_tokenizer

    model_id = "microsoft/bitnet-b1.58-2B-4T"
    microsoft_tokenizer = AutoTokenizer.from_pretrained(model_id)
    config = AutoConfig.from_pretrained(model_id)
    # bfloat16 keeps memory low while matching the checkpoint's dtype.
    microsoft_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        config=config,
        torch_dtype=torch.bfloat16,
    )
    return microsoft_model, microsoft_tokenizer
25
+
26
+ # πŸ—‚οΈπŸ•°οΈ β–„β–€ [ C O N V E R S A T I O N - H I S T O R Y ]
27
 
28
  def manage_history(history):
29
  # Limit to 3 turns (each turn is user + assistant = 2 messages)
 
39
 
40
  return history
41
 
42
+ # 💬✨ ▄▀ [ G E N E R A T E - R E S P O N S E ]
43
+
44
  def generate_response(user_input, system_prompt, max_new_tokens, temperature, top_p, top_k, history):
45
  model, tokenizer = load_model()
46
 
 
52
  prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
53
  chat_input = tokenizer(prompt, return_tensors="pt").to(model.device)
54
 
55
+ # Generate Response
56
  chat_outputs = model.generate(
57
  **chat_input,
58
  max_new_tokens=max_new_tokens,
 
62
  do_sample=True
63
  )
64
 
65
+ # Decode Response
66
  response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True)
67
 
68
+ # Update History
69
  history.append({"role": "user", "content": user_input})
70
  history.append({"role": "assistant", "content": response})
71
 
72
+ # Manage History Limits
73
  history = manage_history(history)
74
 
75
  return history, history
76
 
77
+ # πŸŽ›οΈπŸ–₯️ β–„β–€ [ G R A D I O - I N T E R F A C E ]
78
+
79
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
80
  gr.Markdown("# BitNet b1.58 2B4T Demo")
81
 
 
83
  with gr.Column():
84
  gr.Markdown("""
85
  ## About BitNet b1.58 2B4T
86
+ BitNet b1.58 2B4T is the first open-source, native 1-bit Large Language Model with 2 billion parameters,
87
+ developed by Microsoft Research. Trained on 4 trillion tokens, it matches the performance of full-precision
88
+ models while offering significant efficiency gains in memory, energy, and latency. Features include:
89
  - Transformer-based architecture with BitLinear layers
90
  - Native 1.58-bit weights and 8-bit activations
91
  - Maximum context length of 4096 tokens
 
95
  with gr.Column():
96
  gr.Markdown("""
97
  ## About Tonic AI
98
+ Tonic AI is a vibrant community of AI enthusiasts and developers always building cool demos and pushing
99
+ the boundaries of what's possible with AI. We're passionate about creating innovative, accessible, and
100
+ engaging AI experiences for everyone. Join us in exploring the future of AI!
101
  """)
102
 
103
  with gr.Row():
 
159
  ],
160
  outputs=[chatbot, chat_history]
161
  )
162
+
163
+ # 🚀 ▄▀ [ M A I N ]
164
 
165
  if __name__ == "__main__":
 
166
  load_model()
167
  demo.launch(ssr_mode=False, share=True)