Spaces:

nikravan
/

GLM4_0414

Running on Zero

App Files Files Community

nikravan committed 20 days ago

Commit

4cc3701

verified ·

1 Parent(s): 1b002a7

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -30

app.py CHANGED Viewed

@@ -1,10 +1,17 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 import spaces
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("THUDM/GLM-Z1-32B-0414")
 @spaces.GPU
 def respond(
@@ -27,17 +34,18 @@ def respond(
     response = ""
-    from transformers import AutoModelForCausalLM, AutoTokenizer
     MODEL_PATH = "THUDM/GLM-4-Z1-32B-0414"
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto")
-    #message = [{"role": "user", "content": "Let a, b be positive real numbers such that ab = a + b + 3. Determine the range of possible values for a + b."}]
     inputs = tokenizer.apply_chat_template(
-        message,
         return_tensors="pt",
         add_generation_prompt=True,
         return_dict=True,
@@ -46,29 +54,17 @@ def respond(
     generate_kwargs = {
         "input_ids": inputs["input_ids"],
         "attention_mask": inputs["attention_mask"],
-        "max_new_tokens": 4096,
-        "do_sample": False,
     }
     out = model.generate(**generate_kwargs)
-    response=(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
     yield response
-    # for message in client.chat_completion(
-    #     messages,
-    #     max_tokens=max_tokens,
-    #     stream=True,
-    #     temperature=temperature,
-    #     top_p=top_p,
-    # ):
-    #     token = message.choices[0].delta.content
-    #     response += token
-    #     yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -85,6 +81,5 @@ demo = gr.ChatInterface(
     ],
 )
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 from huggingface_hub import InferenceClient
 import spaces
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import BitsAndBytesConfig
+import torch
+# Configure 4-bit quantization
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+)
 @spaces.GPU
 def respond(
     response = ""
     MODEL_PATH = "THUDM/GLM-4-Z1-32B-0414"
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        device_map="auto",
+        quantization_config=quantization_config,
+        torch_dtype=torch.float16
+    )
     inputs = tokenizer.apply_chat_template(
+        messages,  # تغییر از message به messages
         return_tensors="pt",
         add_generation_prompt=True,
         return_dict=True,
     generate_kwargs = {
         "input_ids": inputs["input_ids"],
         "attention_mask": inputs["attention_mask"],
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": True if temperature > 0 else False,
     }
     out = model.generate(**generate_kwargs)
+    response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
     yield response
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
     ],
 )
 if __name__ == "__main__":
+    demo.launch()