ford442 committed on
Commit 143174c · verified · 1 Parent(s): 30e82da

Update app.py

Files changed (1)
  1. app.py +67 -68
app.py CHANGED
@@ -4,8 +4,7 @@ import torch
 import gradio as gr
 import os
 
-# --- Environment and PyTorch Configurations (Kept from your original code) ---
-# ... (rest of your os.putenv and torch.backends settings) ...
+# --- Environment and PyTorch Configurations ---
 os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
 os.putenv('PYTORCH_CUDA_ALLOC_CONF','max_split_size_mb:128')
 os.environ["SAFETENSORS_FAST_GPU"] = "1"
@@ -22,59 +21,68 @@ torch.set_float32_matmul_precision("highest")
 # --- Model and Tokenizer Configuration ---
 model_name = "FelixChao/vicuna-33b-coder"
 
-# ** DOCUMENTATION: Quantization Configuration **
-# To load the model in 4-bit (or 8-bit), you now use BitsAndBytesConfig.
-# This is useful if you're VRAM-constrained.
-
-# Example for 4-bit quantization:
+# --- Quantization Configuration (Example: 4-bit) ---
+# This section is included based on our previous discussion.
+# Remove or comment out if you are not using quantization.
 print("Setting up 4-bit quantization config...")
 quantization_config_4bit = BitsAndBytesConfig(
     load_in_4bit=True,
-    bnb_4bit_use_double_quant=True, # Optional: Improves precision slightly but uses a bit more memory
-    bnb_4bit_quant_type="nf4", # Recommended: "nf4" (NormalFloat4) or "fp4" for floating point 4-bit
-    bnb_4bit_compute_dtype=torch.bfloat16 # Or torch.float16. Computation will happen in this dtype.
-                                          # bfloat16 is good if your GPU supports it (Ampere series onwards)
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
 )
 
-# Example for 8-bit quantization (if you prefer that over 4-bit):
-# print("Setting up 8-bit quantization config...")
-# quantization_config_8bit = BitsAndBytesConfig(
-#     load_in_8bit=True
-# )
-
-# ** DOCUMENTATION: Model Loading with Quantization **
 print(f"Loading model: {model_name} with quantization")
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    quantization_config=quantization_config_4bit, # Pass the config here
-    device_map="auto", # CRITICAL: Use device_map="auto" for quantized models.
-    # It automatically distributes the model across available GPUs/CPU memory as needed.
-    # Do NOT use .to('cuda') after this when using device_map="auto" with quantization.
-    # torch_dtype="auto", # With device_map="auto" and quantization, dtype is often handled,
-    # but bnb_4bit_compute_dtype in BitsAndBytesConfig specifies compute precision.
-    # trust_remote_code=True # As discussed, generally not needed for Vicuna
+    quantization_config=quantization_config_4bit, # Comment out if not using quantization
+    device_map="auto",
 )
 
 print(f"Loading tokenizer: {model_name}")
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
-    # trust_remote_code=True,
     use_fast=True
 )
 
+# ** MODIFICATION: Define and set the Vicuna chat template **
+# ** DOCUMENTATION: Chat Template **
+# Vicuna models expect a specific chat format. If the tokenizer doesn't have one
+# built-in, we need to set it manually.
+# This template handles a system prompt, user messages, and assistant responses.
+# It will also add the "ASSISTANT:" prompt for generation if needed.
+VICUNA_CHAT_TEMPLATE = (
+    "{% if messages[0]['role'] == 'system' %}"  # Check if the first message is a system prompt
+    "{{ messages[0]['content'] + '\\n\\n' }}"  # Add system prompt with two newlines
+    "{% set loop_messages = messages[1:] %}"  # Slice to loop over remaining messages
+    "{% else %}"
+    "{% set loop_messages = messages %}"  # No system prompt, loop over all messages
+    "{% endif %}"
+    "{% for message in loop_messages %}"  # Loop through user and assistant messages
+    "{% if message['role'] == 'user' %}"
+    "{{ 'USER: ' + message['content'].strip() + '\\n' }}"
+    "{% elif message['role'] == 'assistant' %}"
+    "{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\\n' }}"
+    "{% endif %}"
+    "{% endfor %}"
+    "{% if add_generation_prompt %}"  # If we need to prompt the model for a response
+    "{% if messages[-1]['role'] != 'assistant' %}"  # And the last message wasn't from the assistant
+    "{{ 'ASSISTANT:' }}"  # Add the assistant prompt
+    "{% endif %}"
+    "{% endif %}"
+)
+tokenizer.chat_template = VICUNA_CHAT_TEMPLATE
+print("Manually set Vicuna chat template on the tokenizer.")
+
+
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
+    # Also update the model config's pad_token_id if you are setting tokenizer.pad_token
+    # This is crucial if the model's config doesn't get updated automatically.
+    if model.config.pad_token_id is None:
+        model.config.pad_token_id = tokenizer.pad_token_id
     print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
 
-# Note: model.config.pad_token_id is usually set by the tokenizer or handled by generate.
-# If using device_map, the model might not have a single `model.device` attribute in the traditional sense
-# if it's spread across devices. model_inputs should still be moved to the device of the first layer,
-# which `generate` often handles, or you can query input_device = model.hf_device_map[""] (for the first block)
-# and .to(input_device)
-
-# ... (rest of your generate_code function and Gradio app code) ...
-# Make sure to adjust the device placement for model_inputs if needed,
-# though often `model.generate` handles this correctly when `device_map` is used.
 
 @spaces.GPU(required=True)
 def generate_code(prompt: str) -> str:
@@ -83,76 +91,67 @@ def generate_code(prompt: str) -> str:
         {"role": "user", "content": prompt}
     ]
     try:
+        # ** DOCUMENTATION: Applying Chat Template **
+        # Now that tokenizer.chat_template is set, this should work.
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
-            add_generation_prompt=True
+            add_generation_prompt=True # Important to append "ASSISTANT:"
         )
+        print(f"Formatted prompt using chat template:\n{text}") # For debugging
     except Exception as e:
         print(f"Error applying chat template: {e}")
-        return f"Error: Could not apply chat template. ({e})"
-
-    # Determine the device for inputs. If model is on multiple devices,
-    # inputs typically go to the device of the first part of the model.
-    # With device_map="auto", model.device might not be straightforward.
-    # model.generate usually handles input placement correctly.
-    # If you face issues, you might need to explicitly find the input device:
-    # input_device = model.hf_device_map.get("", "cuda:0") # Get device of first module or default
-    # model_inputs = tokenizer([text], return_tensors="pt").to(input_device)
-    # For now, let's assume .to(model.device) works or generate handles it.
-    # If model.device is not available due to device_map, remove .to(model.device)
-    # and let `generate` handle it, or use the hf_device_map.
-
-    # Since device_map="auto" is used, the model might be on multiple devices.
-    # We don't need to explicitly move model_inputs to model.device here,
-    # as the `generate` function should handle it correctly with `device_map`.
-    model_inputs = tokenizer([text], return_tensors="pt")
+        # Provide a more informative error or fallback if needed
+        return f"Error: Could not apply chat template. Details: {e}. Ensure the tokenizer has a valid `chat_template` attribute."
+
+    # Determine device for inputs if model is on multiple devices
+    # For device_map="auto", input tensors should go to the device of the first model block.
+    input_device = model.hf_device_map.get("", next(iter(model.hf_device_map.values()))) if hasattr(model, "hf_device_map") else model.device
 
+    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)
 
     with torch.no_grad():
         generated_ids = model.generate(
-            input_ids=model_inputs.input_ids.to(model.device if hasattr(model, "device") else model.hf_device_map[""]), # Ensure input_ids are on the correct device
-            attention_mask=model_inputs.attention_mask.to(model.device if hasattr(model, "device") else model.hf_device_map[""]), # Ensure attention_mask is on the correct device
+            **model_inputs, # Pass tokenized inputs
             max_new_tokens=1024,
             min_new_tokens=256,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
-            pad_token_id=tokenizer.eos_token_id
+            pad_token_id=tokenizer.eos_token_id # Use EOS token for padding
         )
 
-    # The rest of your generate_code function for decoding should be fine
     response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
     response = tokenizer.decode(response_ids, skip_special_tokens=True)
     return response.strip()
 
-# --- Gradio Interface (Kept mostly from your original code) ---
-with gr.Blocks(title="Vicuna 33B Coder") as demo: # Updated title
+# --- Gradio Interface ---
+with gr.Blocks(title="Vicuna 33B Coder") as demo:
     with gr.Tab("Code Chat"):
         gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
-            prompt = gr.Textbox(
+            prompt_input = gr.Textbox( # Renamed to avoid conflict with 'prompt' variable in function scope
                 label="Prompt",
-                show_label=True,
-                lines=3,
+                show_label=True,
+                lines=3,
                 placeholder="Enter your coding prompt here...",
             )
            run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
-            result = gr.Code(
+            result_output = gr.Code( # Renamed
                label="Generated Code",
-                show_label=True,
-                language="python",
+                show_label=True,
+                language="python",
                lines=20,
            )
        gr.on(
            triggers=[
                run_button.click,
-                prompt.submit
+                prompt_input.submit
            ],
            fn=generate_code,
-            inputs=[prompt],
-            outputs=[result],
+            inputs=[prompt_input],
+            outputs=[result_output],
        )
 
 if __name__ == "__main__":
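
One caveat about the environment setup at the top of app.py: the script mixes os.putenv with direct os.environ assignment. os.putenv writes to the process environment at the C level but does not update the os.environ mapping, so later os.environ lookups will not see those values; assigning through os.environ (as done for SAFETENSORS_FAST_GPU) updates both. A minimal sketch of the difference, with made-up variable names:

import os

os.putenv("DEMO_PUTENV", "1")             # written to the C-level environment only
print(os.environ.get("DEMO_PUTENV"))      # -> None: os.environ is not updated by putenv

os.environ["DEMO_ENVIRON"] = "1"          # updates os.environ and the real environment
print(os.environ.get("DEMO_ENVIRON"))     # -> '1'

If the cuSOLVER and CUDA-allocator settings need to be visible to any code that reads os.environ, setting them the same way as SAFETENSORS_FAST_GPU is the safer pattern.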
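The deleted comments mentioned an 8-bit alternative to the 4-bit NF4 configuration. For reference, a hedged sketch of what that would look like with the same BitsAndBytesConfig API; this is not part of the commit:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 8-bit weight quantization: less aggressive than the 4-bit NF4 config in app.py,
# so it needs more VRAM but typically preserves a bit more quality.
quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

model_8bit = AutoModelForCausalLM.from_pretrained(
    "FelixChao/vicuna-33b-coder",
    quantization_config=quantization_config_8bit,
    device_map="auto",  # let accelerate place the shards; do not call .to('cuda') afterwards
)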
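The Jinja template assigned to tokenizer.chat_template produces the classic Vicuna "USER:" / "ASSISTANT:" layout. Assuming the template is attached exactly as in the diff, a one-turn conversation renders like this (a sketch; the eos_token only appears after completed assistant turns):

# tokenizer: the AutoTokenizer configured above, with VICUNA_CHAT_TEMPLATE attached
messages = [
    {"role": "system", "content": "You are a helpful coding assistant."},
    {"role": "user", "content": "Write a Python function that reverses a string."},
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Rendered prompt:
#   You are a helpful coding assistant.
#
#   USER: Write a Python function that reverses a string.
#   ASSISTANT: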
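generate_code guards on hasattr(model, "hf_device_map") because device_map="auto" may shard the model across several devices, in which case there is no single model.device. A quick way to inspect the placement accelerate chose (a sketch, assuming the model was loaded as above):

# hf_device_map maps module names to devices, e.g. {'model.embed_tokens': 0, 'model.layers.0': 0, ...}
if hasattr(model, "hf_device_map"):
    for module_name, device in model.hf_device_map.items():
        print(f"{module_name} -> {device}")

The input_device expression in generate_code resolves to the whole-model device when the map has a single "" entry, and otherwise to the device of the first listed module (typically the embedding block), which is where the input tensors need to live before calling model.generate.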
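The sampling settings (temperature=0.7, top_p=0.9, at least 256 and at most 1024 new tokens) are passed inline to model.generate. If they ever need to be reused outside generate_code, they could equally be bundled into a transformers.GenerationConfig; a sketch, not part of the commit:

from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_new_tokens=1024,
    min_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)

# inside generate_code, equivalent to the inline keyword arguments:
# generated_ids = model.generate(**model_inputs, generation_config=generation_config)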
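The last hunk ends at the if __name__ == "__main__": guard, so the launch call itself is unchanged and not shown in this diff. On a Space it would typically look roughly like the following (a sketch; the actual arguments used in app.py are not visible here):

if __name__ == "__main__":
    demo.queue()   # optional: queue incoming requests before they reach the @spaces.GPU function
    demo.launch()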