ford442 committed on
Commit 30e82da · verified · 1 Parent(s): 1342ff2

Update app.py

Files changed (1)
  1. app.py +79 -106
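
The substance of this commit is a change in how the model is loaded: instead of passing `load_in_8bit=True` straight into `from_pretrained`, the new code builds an explicit `BitsAndBytesConfig` (4-bit NF4, double quantization, bfloat16 compute) and loads with `device_map="auto"`. Condensed into a standalone sketch, using the values that appear in the diff below (variable names shortened here; an illustration, not a drop-in replacement for app.py):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "FelixChao/vicuna-33b-coder"

# 4-bit NF4 quantization with double quantization, computing in bfloat16
# (the same values the commit passes to BitsAndBytesConfig).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map="auto" lets accelerate place the quantized layers; no .to('cuda') afterwards.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
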
app.py CHANGED
@@ -1,12 +1,13 @@
  import spaces # If using Hugging Face Spaces
- from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
  import gradio as gr
  import os

  # --- Environment and PyTorch Configurations (Kept from your original code) ---
  os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
- os.putenv('PYTORCH_CUDA_ALLOC_CONF','max_split_size_mb:128') # Be mindful of this with a larger model
  os.environ["SAFETENSORS_FAST_GPU"] = "1"
  os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')

@@ -14,81 +15,73 @@ torch.backends.cuda.matmul.allow_tf32 = False
  torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
  torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
  torch.backends.cudnn.allow_tf32 = False
- torch.backends.cudnn.deterministic = False # Setting to False can sometimes improve performance if exact reproducibility isn't critical
- torch.backends.cudnn.benchmark = True # Setting to True can help if input sizes are consistent
- # torch.backends.cuda.preferred_blas_library="cublas" # These are generally defaults or fine
- # torch.backends.cuda.preferred_linalg_library="cusolver" # These are generally defaults or fine
- torch.set_float32_matmul_precision("highest") # Or "high" if "highest" causes issues

  # --- Model and Tokenizer Configuration ---
- # ** MODIFICATION 1: Update the model_name **
  model_name = "FelixChao/vicuna-33b-coder"

- # ** DOCUMENTATION: Model and Tokenizer Loading **
- # Load model and tokenizer.
- # `torch_dtype="auto"` will attempt to use the optimal dtype (e.g., bfloat16 if available).
- # `.to('cuda', torch.bfloat16)` explicitly moves the model to CUDA and casts to bfloat16.
- # - Note: `FelixChao/vicuna-33b-coder` is a large model.
- # Loading in bfloat16 requires substantial VRAM (~66GB+).
- # If you encounter OOM errors, consider quantization:
- # e.g., `load_in_8bit=True` or `load_in_4bit=True` (requires `bitsandbytes` library).
- # ** MODIFICATION 2: Removed `trust_remote_code=True` (typically not needed for Vicuna) **
- print(f"Loading model: {model_name}")
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
-     torch_dtype="auto",
-     load_in_8bit=True,
-     device_map="auto", # device_map="auto" can be helpful for very large models to distribute layers if you have multiple GPUs or for offloading.
-     # For single GPU, explicit .to('cuda') is fine.
-     # trust_remote_code=True # Removed: Generally not needed for Vicuna/Llama models
- ).to('cuda') #, torch.bfloat16) # Explicitly using bfloat16 as in original code
-
- # ** MODIFICATION 3: Removed `trust_remote_code=True` for tokenizer **
  print(f"Loading tokenizer: {model_name}")
  tokenizer = AutoTokenizer.from_pretrained(
      model_name,
-     # trust_remote_code=True, # Removed: Generally not needed
      use_fast=True
  )

- # ** DOCUMENTATION: Pad Token **
- # Vicuna/Llama models usually use EOS token as PAD token if PAD is not explicitly set.
- # This is often handled internally by the tokenizer or generation functions.
- # If you encounter issues related to padding, you might need to set it explicitly:
  if tokenizer.pad_token is None:
      tokenizer.pad_token = tokenizer.eos_token
      print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")

- # Ensure the model's pad_token_id is also configured if necessary,
- # though `generate` usually handles this.
- model.config.pad_token_id = tokenizer.pad_token_id

- @spaces.GPU(required=True) # For Hugging Face Spaces GPU allocation
  def generate_code(prompt: str) -> str:
-     """
-     Generates code based on the given prompt using the loaded Vicuna model.
-
-     Args:
-         prompt: The user's input prompt for code generation.
-
-     Returns:
-         The generated code as a string.
-     """
-     # ** MODIFICATION 4: Update the system prompt for Vicuna **
-     # ** DOCUMENTATION: Chat Template Messages **
-     # The `messages` format is used by `tokenizer.apply_chat_template`.
-     # The system prompt provides context/instructions to the model.
-     # For Vicuna, a generic helpful assistant prompt is suitable.
      messages = [
          {"role": "system", "content": "You are a helpful and proficient coding assistant."},
          {"role": "user", "content": prompt}
      ]
-
-     # ** DOCUMENTATION: Applying Chat Template **
-     # `apply_chat_template` formats the input messages according to the model's specific template.
-     # `tokenize=False` returns a formatted string.
-     # `add_generation_prompt=True` ensures the template is set up for the model to start generating a response.
      try:
          text = tokenizer.apply_chat_template(
              messages,
@@ -97,51 +90,40 @@ def generate_code(prompt: str) -> str:
          )
      except Exception as e:
          print(f"Error applying chat template: {e}")
-         # Fallback or simpler prompt structure if template is missing/problematic
-         # This is a basic fallback, actual Vicuna instruction format might be more specific
-         # e.g., "USER: {prompt}\nASSISTANT:"
-         # However, `apply_chat_template` is the preferred method.
-         # If this fails, the tokenizer for `FelixChao/vicuna-33b-coder` might be missing its template.
-         return f"Error: Could not apply chat template. The model's tokenizer might be misconfigured. ({e})"
-
-
-     # ** DOCUMENTATION: Tokenization **
-     # Tokenize the formatted text and move inputs to the model's device (GPU).
-     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-
-     # ** DOCUMENTATION: Code Generation **
-     # Generate code using the model.
-     # `torch.no_grad()` disables gradient calculations, saving memory and computation during inference.
-     # `max_new_tokens`: Maximum number of new tokens to generate.
-     # `min_new_tokens`: Minimum number of new tokens to generate.
-     # `do_sample=True`: Enables sampling for more diverse outputs. If False, uses greedy decoding.
-     # `low_memory` is not a standard Hugging Face `generate` parameter.
-     # It might have been intended for a specific version or a custom argument.
-     # I've removed it as it might cause an error with standard generate.
-     # If you intended to use a specific memory optimization, that needs to be handled
-     # via other means (like quantization or model offloading).
      with torch.no_grad():
          generated_ids = model.generate(
-             **model_inputs,
              max_new_tokens=1024,
              min_new_tokens=256,
              do_sample=True,
-             temperature=0.7, # Common sampling parameter, you can tune this
-             top_p=0.9, # Common sampling parameter, you can tune this
-             pad_token_id=tokenizer.eos_token_id # Important for open-ended generation
-             # guidance_scale = 3.8, # Typically for Classifier Free Guidance, might not apply here or require specific setup
          )

-     # ** DOCUMENTATION: Decoding Response **
-     # Remove the input tokens from the generated output to get only the response.
-     # `generated_ids[0]` because we process one prompt at a time.
-     # `model_inputs.input_ids[0]` is the input part.
      response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
-
-     # Decode the generated token IDs back into a string.
-     # `skip_special_tokens=True` removes tokens like <s>, </s>, <unk>, etc.
      response = tokenizer.decode(response_ids, skip_special_tokens=True)
-
      return response.strip()

  # --- Gradio Interface (Kept mostly from your original code) ---
@@ -149,38 +131,29 @@ with gr.Blocks(title="Vicuna 33B Coder") as demo: # Updated title
      with gr.Tab("Code Chat"):
          gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
          with gr.Row():
-             prompt = gr.Textbox( # Changed to Textbox for potentially longer prompts
                  label="Prompt",
-                 show_label=True, # Changed to True for clarity
-                 lines=3, # Allow a few lines for the prompt
                  placeholder="Enter your coding prompt here...",
-                 # container=False, # container is not a standard param for Textbox in this context
              )
          run_button = gr.Button("Generate Code", variant="primary")
          with gr.Row():
-             result = gr.Code( # Using gr.Code for better code display
                  label="Generated Code",
-                 show_label=True, # Changed to True for clarity
-                 language="python", # Default language, can be auto-detected or set
                  lines=20,
-                 # container=False,
              )
-
-         # Gradio event listener
          gr.on(
              triggers=[
                  run_button.click,
-                 prompt.submit # Allow submitting with Enter key in Textbox
              ],
              fn=generate_code,
              inputs=[prompt],
              outputs=[result],
-             # api_name="generate_code" # Uncomment if you want to expose this as an API endpoint
          )

  if __name__ == "__main__":
-     # ** DOCUMENTATION: Launching Gradio **
-     # `share=False` by default, set to True if you want a public link (requires internet).
-     # `debug=True` can be helpful for development to see more detailed errors.
-     # `server_name="0.0.0.0"` to make it accessible on your local network.
      demo.launch(share=False, debug=True)
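
One of the comment blocks removed above mentions a manual fallback prompt format ("USER: {prompt}\nASSISTANT:") for the case where the tokenizer ships no chat template. A minimal sketch of that fallback, assuming the Vicuna v1.1 USER/ASSISTANT convention; build_vicuna_prompt is a hypothetical helper and is not part of this commit:

def build_vicuna_prompt(system_message: str, user_prompt: str) -> str:
    # Hypothetical fallback (not in the commit): hand-assemble a Vicuna v1.1-style prompt
    # when tokenizer.apply_chat_template raises because the tokenizer defines no template.
    return f"{system_message}\n\nUSER: {user_prompt}\nASSISTANT:"

# Roughly how it would slot into the except-branch instead of returning an error string:
# text = build_vicuna_prompt("You are a helpful and proficient coding assistant.", prompt)
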
 
  import spaces # If using Hugging Face Spaces
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Import BitsAndBytesConfig
  import torch
  import gradio as gr
  import os

  # --- Environment and PyTorch Configurations (Kept from your original code) ---
+ # ... (rest of your os.putenv and torch.backends settings) ...
  os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
+ os.putenv('PYTORCH_CUDA_ALLOC_CONF','max_split_size_mb:128')
  os.environ["SAFETENSORS_FAST_GPU"] = "1"
  os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')

  torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
  torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
  torch.backends.cudnn.allow_tf32 = False
+ torch.backends.cudnn.deterministic = False
+ torch.backends.cudnn.benchmark = True
+ torch.set_float32_matmul_precision("highest")

  # --- Model and Tokenizer Configuration ---
  model_name = "FelixChao/vicuna-33b-coder"

+ # ** DOCUMENTATION: Quantization Configuration **
+ # To load the model in 4-bit (or 8-bit), you now use BitsAndBytesConfig.
+ # This is useful if you're VRAM-constrained.
+
+ # Example for 4-bit quantization:
+ print("Setting up 4-bit quantization config...")
+ quantization_config_4bit = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True, # Optional: Improves precision slightly but uses a bit more memory
+     bnb_4bit_quant_type="nf4", # Recommended: "nf4" (NormalFloat4) or "fp4" for floating point 4-bit
+     bnb_4bit_compute_dtype=torch.bfloat16 # Or torch.float16. Computation will happen in this dtype.
+     # bfloat16 is good if your GPU supports it (Ampere series onwards)
+ )
+
+ # Example for 8-bit quantization (if you prefer that over 4-bit):
+ # print("Setting up 8-bit quantization config...")
+ # quantization_config_8bit = BitsAndBytesConfig(
+ #     load_in_8bit=True
+ # )
+
+ # ** DOCUMENTATION: Model Loading with Quantization **
+ print(f"Loading model: {model_name} with quantization")
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
+     quantization_config=quantization_config_4bit, # Pass the config here
+     device_map="auto", # CRITICAL: Use device_map="auto" for quantized models.
+     # It automatically distributes the model across available GPUs/CPU memory as needed.
+     # Do NOT use .to('cuda') after this when using device_map="auto" with quantization.
+     # torch_dtype="auto", # With device_map="auto" and quantization, dtype is often handled,
+     # but bnb_4bit_compute_dtype in BitsAndBytesConfig specifies compute precision.
+     # trust_remote_code=True # As discussed, generally not needed for Vicuna
+ )
+
  print(f"Loading tokenizer: {model_name}")
  tokenizer = AutoTokenizer.from_pretrained(
      model_name,
+     # trust_remote_code=True,
      use_fast=True
  )

  if tokenizer.pad_token is None:
      tokenizer.pad_token = tokenizer.eos_token
      print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")

+ # Note: model.config.pad_token_id is usually set by the tokenizer or handled by generate.
+ # If using device_map, the model might not have a single `model.device` attribute in the traditional sense
+ # if it's spread across devices. model_inputs should still be moved to the device of the first layer,
+ # which `generate` often handles, or you can query input_device = model.hf_device_map[""] (for the first block)
+ # and .to(input_device)

+ # ... (rest of your generate_code function and Gradio app code) ...
+ # Make sure to adjust the device placement for model_inputs if needed,
+ # though often `model.generate` handles this correctly when `device_map` is used.

+ @spaces.GPU(required=True)
  def generate_code(prompt: str) -> str:
      messages = [
          {"role": "system", "content": "You are a helpful and proficient coding assistant."},
          {"role": "user", "content": prompt}
      ]
      try:
          text = tokenizer.apply_chat_template(
              messages,

          )
      except Exception as e:
          print(f"Error applying chat template: {e}")
+         return f"Error: Could not apply chat template. ({e})"
+
+     # Determine the device for inputs. If model is on multiple devices,
+     # inputs typically go to the device of the first part of the model.
+     # With device_map="auto", model.device might not be straightforward.
+     # model.generate usually handles input placement correctly.
+     # If you face issues, you might need to explicitly find the input device:
+     # input_device = model.hf_device_map.get("", "cuda:0") # Get device of first module or default
+     # model_inputs = tokenizer([text], return_tensors="pt").to(input_device)
+     # For now, let's assume .to(model.device) works or generate handles it.
+     # If model.device is not available due to device_map, remove .to(model.device)
+     # and let `generate` handle it, or use the hf_device_map.
+
+     # Since device_map="auto" is used, the model might be on multiple devices.
+     # We don't need to explicitly move model_inputs to model.device here,
+     # as the `generate` function should handle it correctly with `device_map`.
+     model_inputs = tokenizer([text], return_tensors="pt")
+
+
      with torch.no_grad():
          generated_ids = model.generate(
+             input_ids=model_inputs.input_ids.to(model.device if hasattr(model, "device") else model.hf_device_map[""]), # Ensure input_ids are on the correct device
+             attention_mask=model_inputs.attention_mask.to(model.device if hasattr(model, "device") else model.hf_device_map[""]), # Ensure attention_mask is on the correct device
              max_new_tokens=1024,
              min_new_tokens=256,
              do_sample=True,
+             temperature=0.7,
+             top_p=0.9,
+             pad_token_id=tokenizer.eos_token_id
          )

+     # The rest of your generate_code function for decoding should be fine
      response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
      response = tokenizer.decode(response_ids, skip_special_tokens=True)
      return response.strip()

  # --- Gradio Interface (Kept mostly from your original code) ---

      with gr.Tab("Code Chat"):
          gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
          with gr.Row():
+             prompt = gr.Textbox(
                  label="Prompt",
+                 show_label=True,
+                 lines=3,
                  placeholder="Enter your coding prompt here...",
              )
          run_button = gr.Button("Generate Code", variant="primary")
          with gr.Row():
+             result = gr.Code(
                  label="Generated Code",
+                 show_label=True,
+                 language="python",
                  lines=20,
              )
          gr.on(
              triggers=[
                  run_button.click,
+                 prompt.submit
              ],
              fn=generate_code,
              inputs=[prompt],
              outputs=[result],
          )

  if __name__ == "__main__":
      demo.launch(share=False, debug=True)
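
The added comments around model_inputs and model.generate deal with where the prompt tensors should live once device_map="auto" has spread the model over several devices. A minimal sketch of that lookup, assuming the hf_device_map attribute that accelerate attaches when a device map is used; resolve_input_device is a hypothetical helper, not part of the commit:

import torch

def resolve_input_device(model) -> torch.device:
    # Hypothetical helper (not in the commit): pick a device for input_ids / attention_mask.
    # With device_map="auto", accelerate records the placement in model.hf_device_map;
    # the first entry is normally the embedding layer, which is what consumes the input ids.
    device_map = getattr(model, "hf_device_map", None)
    if device_map:
        first = next(iter(device_map.values()))  # e.g. 0, "cuda:0", or "cpu"
        return torch.device(f"cuda:{first}") if isinstance(first, int) else torch.device(first)
    return model.device  # single-device model: a plain .device attribute exists

# Usage sketch:
# model_inputs = tokenizer([text], return_tensors="pt").to(resolve_input_device(model))
# generated_ids = model.generate(**model_inputs, max_new_tokens=1024)

This mirrors the `model.device if hasattr(model, "device") else model.hf_device_map[""]` expression used in the new generate() call, but keeps the lookup in one place.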