nikravan committed on
Commit 4cc3701 · verified · 1 Parent(s): 1b002a7

Update app.py

Files changed (1)
  1. app.py +25 -30
app.py CHANGED
@@ -1,10 +1,17 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
 import spaces
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("THUDM/GLM-Z1-32B-0414")
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import BitsAndBytesConfig
+import torch
+
+# 4-bit quantization configuration
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_use_double_quant=True,
+)
 
 @spaces.GPU
 def respond(
@@ -27,17 +34,18 @@ def respond(
 
     response = ""
 
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-
     MODEL_PATH = "THUDM/GLM-4-Z1-32B-0414"
 
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto")
-
-    #message = [{"role": "user", "content": "Let a, b be positive real numbers such that ab = a + b + 3. Determine the range of possible values for a + b."}]
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        device_map="auto",
+        quantization_config=quantization_config,
+        torch_dtype=torch.float16
+    )
 
     inputs = tokenizer.apply_chat_template(
-        message,
+        messages,  # changed from message to messages
        return_tensors="pt",
        add_generation_prompt=True,
        return_dict=True,
@@ -46,29 +54,17 @@
     generate_kwargs = {
         "input_ids": inputs["input_ids"],
         "attention_mask": inputs["attention_mask"],
-        "max_new_tokens": 4096,
-        "do_sample": False,
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": True if temperature > 0 else False,
     }
+
     out = model.generate(**generate_kwargs)
-    response=(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
+    response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
 
     yield response
-    # for message in client.chat_completion(
-    #     messages,
-    #     max_tokens=max_tokens,
-    #     stream=True,
-    #     temperature=temperature,
-    #     top_p=top_p,
-    # ):
-    #     token = message.choices[0].delta.content
-
-    #     response += token
-    #     yield response
 
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -85,6 +81,5 @@ demo = gr.ChatInterface(
     ],
 )
 
-
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
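
For reference, the update drops the hosted InferenceClient call and instead loads THUDM/GLM-4-Z1-32B-0414 locally with 4-bit NF4 quantization, wiring the respond() parameters (max_tokens, temperature, top_p) into model.generate. Below is a minimal standalone sketch of that generation path, assuming a CUDA GPU with transformers and bitsandbytes installed; the small model ID and the example message are illustrative placeholders, not values from the commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Placeholder model ID for illustration; the Space itself uses "THUDM/GLM-4-Z1-32B-0414".
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# 4-bit NF4 quantization: weights are stored in 4 bits, matmuls run in float16,
# and double quantization also compresses the quantization constants themselves.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,
)

# Example chat-format input; in the Space this list comes from gr.ChatInterface.
messages = [{"role": "user", "content": "Explain 4-bit quantization in one sentence."}]

max_tokens, temperature, top_p = 128, 0.7, 0.9

inputs = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,
    return_dict=True,
).to(model.device)

out = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    do_sample=temperature > 0,  # greedy decoding when temperature is 0
)

# Decode only the newly generated tokens, i.e. everything after the prompt.
print(tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))

Slicing out[0][inputs["input_ids"].shape[1]:] before decoding returns only the newly generated text, which matches what the updated respond() yields back to the ChatInterface.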