Spaces:

ford442
/

vicuna-coder-33b

Running on Zero

App Files Files Community

ford442 committed about 23 hours ago

Commit

143174c

verified ·

1 Parent(s): 30e82da

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -68

app.py CHANGED Viewed

@@ -4,8 +4,7 @@ import torch
 import gradio as gr
 import os
-# --- Environment and PyTorch Configurations (Kept from your original code) ---
-# ... (rest of your os.putenv and torch.backends settings) ...
 os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
 os.putenv('PYTORCH_CUDA_ALLOC_CONF','max_split_size_mb:128')
 os.environ["SAFETENSORS_FAST_GPU"] = "1"
@@ -22,59 +21,68 @@ torch.set_float32_matmul_precision("highest")
 # --- Model and Tokenizer Configuration ---
 model_name = "FelixChao/vicuna-33b-coder"
-# ** DOCUMENTATION: Quantization Configuration **
-# To load the model in 4-bit (or 8-bit), you now use BitsAndBytesConfig.
-# This is useful if you're VRAM-constrained.
-# Example for 4-bit quantization:
 print("Setting up 4-bit quantization config...")
 quantization_config_4bit = BitsAndBytesConfig(
     load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,    # Optional: Improves precision slightly but uses a bit more memory
-    bnb_4bit_quant_type="nf4",         # Recommended: "nf4" (NormalFloat4) or "fp4" for floating point 4-bit
-    bnb_4bit_compute_dtype=torch.bfloat16 # Or torch.float16. Computation will happen in this dtype.
-                                        # bfloat16 is good if your GPU supports it (Ampere series onwards)
 )
-# Example for 8-bit quantization (if you prefer that over 4-bit):
-# print("Setting up 8-bit quantization config...")
-# quantization_config_8bit = BitsAndBytesConfig(
-#     load_in_8bit=True
-# )
-# ** DOCUMENTATION: Model Loading with Quantization **
 print(f"Loading model: {model_name} with quantization")
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    quantization_config=quantization_config_4bit, # Pass the config here
-    device_map="auto", # CRITICAL: Use device_map="auto" for quantized models.
-                       # It automatically distributes the model across available GPUs/CPU memory as needed.
-                       # Do NOT use .to('cuda') after this when using device_map="auto" with quantization.
-    # torch_dtype="auto", # With device_map="auto" and quantization, dtype is often handled,
-                          # but bnb_4bit_compute_dtype in BitsAndBytesConfig specifies compute precision.
-    # trust_remote_code=True # As discussed, generally not needed for Vicuna
 )
 print(f"Loading tokenizer: {model_name}")
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
-    # trust_remote_code=True,
     use_fast=True
 )
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
     print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
-# Note: model.config.pad_token_id is usually set by the tokenizer or handled by generate.
-# If using device_map, the model might not have a single `model.device` attribute in the traditional sense
-# if it's spread across devices. model_inputs should still be moved to the device of the first layer,
-# which `generate` often handles, or you can query input_device = model.hf_device_map[""] (for the first block)
-# and .to(input_device)
-# ... (rest of your generate_code function and Gradio app code) ...
-# Make sure to adjust the device placement for model_inputs if needed,
-# though often `model.generate` handles this correctly when `device_map` is used.
 @spaces.GPU(required=True)
 def generate_code(prompt: str) -> str:
@@ -83,76 +91,67 @@ def generate_code(prompt: str) -> str:
         {"role": "user", "content": prompt}
     ]
     try:
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
-            add_generation_prompt=True
         )
     except Exception as e:
         print(f"Error applying chat template: {e}")
-        return f"Error: Could not apply chat template. ({e})"
-    # Determine the device for inputs. If model is on multiple devices,
-    # inputs typically go to the device of the first part of the model.
-    # With device_map="auto", model.device might not be straightforward.
-    # model.generate usually handles input placement correctly.
-    # If you face issues, you might need to explicitly find the input device:
-    #   input_device = model.hf_device_map.get("", "cuda:0") # Get device of first module or default
-    #   model_inputs = tokenizer([text], return_tensors="pt").to(input_device)
-    # For now, let's assume .to(model.device) works or generate handles it.
-    # If model.device is not available due to device_map, remove .to(model.device)
-    # and let `generate` handle it, or use the hf_device_map.
-    # Since device_map="auto" is used, the model might be on multiple devices.
-    # We don't need to explicitly move model_inputs to model.device here,
-    # as the `generate` function should handle it correctly with `device_map`.
-    model_inputs = tokenizer([text], return_tensors="pt")
     with torch.no_grad():
         generated_ids = model.generate(
-            input_ids=model_inputs.input_ids.to(model.device if hasattr(model, "device") else model.hf_device_map[""]), # Ensure input_ids are on the correct device
-            attention_mask=model_inputs.attention_mask.to(model.device if hasattr(model, "device") else model.hf_device_map[""]), # Ensure attention_mask is on the correct device
             max_new_tokens=1024,
             min_new_tokens=256,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
-            pad_token_id=tokenizer.eos_token_id
         )
-    # The rest of your generate_code function for decoding should be fine
     response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
     response = tokenizer.decode(response_ids, skip_special_tokens=True)
     return response.strip()
-# --- Gradio Interface (Kept mostly from your original code) ---
-with gr.Blocks(title="Vicuna 33B Coder") as demo: # Updated title
     with gr.Tab("Code Chat"):
         gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
         with gr.Row():
-            prompt = gr.Textbox(
                 label="Prompt",
-                show_label=True,
-                lines=3,
                 placeholder="Enter your coding prompt here...",
             )
         run_button = gr.Button("Generate Code", variant="primary")
         with gr.Row():
-            result = gr.Code(
                 label="Generated Code",
-                show_label=True,
-                language="python",
                 lines=20,
             )
         gr.on(
             triggers=[
                 run_button.click,
-                prompt.submit
             ],
             fn=generate_code,
-            inputs=[prompt],
-            outputs=[result],
         )
 if __name__ == "__main__":

 import gradio as gr
 import os
+# --- Environment and PyTorch Configurations ---
 os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
 os.putenv('PYTORCH_CUDA_ALLOC_CONF','max_split_size_mb:128')
 os.environ["SAFETENSORS_FAST_GPU"] = "1"
 # --- Model and Tokenizer Configuration ---
 model_name = "FelixChao/vicuna-33b-coder"
+# --- Quantization Configuration (Example: 4-bit) ---
+# Loads the model in 4-bit NF4 precision (bitsandbytes) with bfloat16 compute,
+# which is useful when VRAM is constrained.
+# Remove or comment out if you are not using quantization.
 print("Setting up 4-bit quantization config...")
 quantization_config_4bit = BitsAndBytesConfig(
     load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
 )
 print(f"Loading model: {model_name} with quantization")
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
+    quantization_config=quantization_config_4bit, # Comment out if not using quantization
+    device_map="auto",
 )
 print(f"Loading tokenizer: {model_name}")
 tokenizer = AutoTokenizer.from_pretrained(
     model_name,
     use_fast=True
 )
+# --- Chat Template ---
+# Vicuna models expect a specific chat format. The template defined below is
+# assigned to the tokenizer unconditionally, replacing any template it already has.
+# This template handles a system prompt, user messages, and assistant responses.
+# It will also add the "ASSISTANT:" prompt for generation if needed.
+VICUNA_CHAT_TEMPLATE = (
+    "{% if messages[0]['role'] == 'system' %}"  # Check if the first message is a system prompt
+        "{{ messages[0]['content'] + '\\n\\n' }}"  # Add system prompt with two newlines
+        "{% set loop_messages = messages[1:] %}"  # Slice to loop over remaining messages
+    "{% else %}"
+        "{% set loop_messages = messages %}"  # No system prompt, loop over all messages
+    "{% endif %}"
+    "{% for message in loop_messages %}"  # Loop through user and assistant messages
+        "{% if message['role'] == 'user' %}"
+            "{{ 'USER: ' + message['content'].strip() + '\\n' }}"
+        "{% elif message['role'] == 'assistant' %}"
+            "{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\\n' }}"
+        "{% endif %}"
+    "{% endfor %}"
+    "{% if add_generation_prompt %}"  # If we need to prompt the model for a response
+        "{% if messages[-1]['role'] != 'assistant' %}" # And the last message wasn't from the assistant
+            "{{ 'ASSISTANT:' }}"  # Add the assistant prompt
+        "{% endif %}"
+    "{% endif %}"
+)
+tokenizer.chat_template = VICUNA_CHAT_TEMPLATE
+print("Manually set Vicuna chat template on the tokenizer.")
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
+    # Also update the model config's pad_token_id if you are setting tokenizer.pad_token
+    # This is crucial if the model's config doesn't get updated automatically.
+    if model.config.pad_token_id is None:
+         model.config.pad_token_id = tokenizer.pad_token_id
     print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
 @spaces.GPU(required=True)
 def generate_code(prompt: str) -> str:
         {"role": "user", "content": prompt}
     ]
     try:
+        # ** DOCUMENTATION: Applying Chat Template **
+        # Now that tokenizer.chat_template is set, this should work.
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
+            add_generation_prompt=True # Important to append "ASSISTANT:"
         )
+        print(f"Formatted prompt using chat template:\n{text}") # For debugging
     except Exception as e:
         print(f"Error applying chat template: {e}")
+        # Provide a more informative error or fallback if needed
+        return f"Error: Could not apply chat template. Details: {e}. Ensure the tokenizer has a valid `chat_template` attribute."
+    # Determine device for inputs if model is on multiple devices
+    # For device_map="auto", input tensors should go to the device of the first model block.
+    input_device = model.hf_device_map.get("", next(iter(model.hf_device_map.values()))) if hasattr(model, "hf_device_map") else model.device
+    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)
     with torch.no_grad():
         generated_ids = model.generate(
+            **model_inputs, # Pass tokenized inputs
             max_new_tokens=1024,
             min_new_tokens=256,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
+            pad_token_id=tokenizer.eos_token_id # Use EOS token for padding
         )
     response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
     response = tokenizer.decode(response_ids, skip_special_tokens=True)
     return response.strip()
+# --- Gradio Interface ---
+with gr.Blocks(title="Vicuna 33B Coder") as demo:
     with gr.Tab("Code Chat"):
         gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
         with gr.Row():
+            prompt_input = gr.Textbox( # Renamed to avoid conflict with 'prompt' variable in function scope
                 label="Prompt",
+                show_label=True,
+                lines=3,
                 placeholder="Enter your coding prompt here...",
             )
         run_button = gr.Button("Generate Code", variant="primary")
         with gr.Row():
+            result_output = gr.Code( # Renamed
                 label="Generated Code",
+                show_label=True,
+                language="python",
                 lines=20,
             )
         gr.on(
             triggers=[
                 run_button.click,
+                prompt_input.submit
             ],
             fn=generate_code,
+            inputs=[prompt_input],
+            outputs=[result_output],
         )
 if __name__ == "__main__":