Spaces:
Running
Running
Add a singleton to avoid threading issues
Browse files
app.py
CHANGED
@@ -1,16 +1,30 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
|
|
|
|
|
4 |
|
5 |
def load_model():
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
def manage_history(history):
|
16 |
# Limit to 3 turns (each turn is user + assistant = 2 messages)
|
@@ -141,4 +155,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
141 |
)
|
142 |
|
143 |
if __name__ == "__main__":
|
144 |
-
|
|
|
|
|
|
1 |
import threading

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
|
4 |
+
|
5 |
+
# Singleton holders for the model and tokenizer so the expensive load from
# the Hugging Face hub happens at most once per process.
_model = None
_tokenizer = None
# Guards lazy initialization when load_model() is reached from concurrent
# Gradio worker threads.
_load_lock = threading.Lock()


def load_model():
    """Return the shared ``(model, tokenizer)`` pair, loading on first call.

    Uses double-checked locking: without the lock, two threads that both
    observe ``_model is None`` would each download/load the multi-GB model,
    which is exactly the threading issue the singleton is meant to prevent.

    Returns:
        tuple: ``(_model, _tokenizer)`` — the cached
        ``AutoModelForCausalLM`` and ``AutoTokenizer`` instances.
    """
    global _model, _tokenizer
    if _model is None or _tokenizer is None:
        with _load_lock:
            # Re-check inside the lock: another thread may have finished
            # loading while this one was waiting to acquire it.
            if _model is None or _tokenizer is None:
                model_id = "microsoft/bitnet-b1.58-2B-4T"
                # trust_remote_code is required because this repo ships its
                # own modeling code. NOTE(review): this executes code from
                # the hub — acceptable only for this pinned, trusted repo.
                _tokenizer = AutoTokenizer.from_pretrained(
                    model_id,
                    trust_remote_code=True,
                )
                config = AutoConfig.from_pretrained(
                    model_id,
                    trust_remote_code=True,
                )
                _model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    config=config,
                    torch_dtype=torch.bfloat16,
                    trust_remote_code=True,
                )
    return _model, _tokenizer
|
28 |
|
29 |
def manage_history(history):
|
30 |
# Limit to 3 turns (each turn is user + assistant = 2 messages)
|
|
|
155 |
)
|
156 |
|
157 |
if __name__ == "__main__":
    # Preload model to avoid threading issues: forcing the singleton to
    # initialize here, before demo.launch() spawns request handlers, means
    # no worker thread ever triggers (or races on) the expensive load.
    load_model()
    # NOTE(review): ssr_mode=False presumably works around Gradio SSR
    # problems on Spaces, and share=True requests a public link — confirm
    # both are intended for this deployment.
    demo.launch(ssr_mode=False, share=True)
|