import spaces  # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  # Import BitsAndBytesConfig
import torch
import gradio as gr
import os

# --- Environment and PyTorch Configurations ---
# Use os.environ (rather than os.putenv) so the values are also visible to Python
# libraries that read os.environ; flags read at import time (e.g. HF_HUB_ENABLE_HF_TRANSFER)
# ideally belong in the environment before the imports above.
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")

# --- Model and Tokenizer Configuration ---
model_name = "FelixChao/vicuna-33b-coder"

# --- Quantization Configuration (Example: 4-bit) ---
# This section is included based on our previous discussion.
# Remove or comment out if you are not using quantization.
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print(f"Loading model: {model_name} with quantization")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit,  # Comment out if not using quantization
    device_map="auto",
)

print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# ** MODIFICATION: Define and set the Vicuna chat template **
# ** DOCUMENTATION: Chat Template **
# Vicuna models expect a specific chat format. If the tokenizer doesn't have one
# built in, we need to set it manually.
# This template handles a system prompt, user messages, and assistant responses,
# and appends the "ASSISTANT:" prompt for generation when needed.
VICUNA_CHAT_TEMPLATE = (
    "{% if messages[0]['role'] == 'system' %}"      # Check if the first message is a system prompt
    "{{ messages[0]['content'] + '\\n\\n' }}"        # Add system prompt followed by two newlines
    "{% set loop_messages = messages[1:] %}"         # Loop over the remaining messages
    "{% else %}"
    "{% set loop_messages = messages %}"             # No system prompt, loop over all messages
    "{% endif %}"
    "{% for message in loop_messages %}"             # Loop through user and assistant messages
    "{% if message['role'] == 'user' %}"
    "{{ 'USER: ' + message['content'].strip() + '\\n' }}"
    "{% elif message['role'] == 'assistant' %}"
    "{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\\n' }}"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"                 # If we need to prompt the model for a response
    "{% if messages[-1]['role'] != 'assistant' %}"   # ...and the last message wasn't from the assistant
    "{{ 'ASSISTANT:' }}"                             # Add the assistant prompt
    "{% endif %}"
    "{% endif %}"
)
tokenizer.chat_template = VICUNA_CHAT_TEMPLATE
print("Manually set Vicuna chat template on the tokenizer.")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Also update the model config's pad_token_id when setting tokenizer.pad_token.
    # This is crucial if the model's config does not get updated automatically.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id
    print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
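# Optional sanity check (a sketch, left commented out; the example messages are
# illustrative only): rendering the template once shows the Vicuna prompt format
# that generate_code() below will send to the model.
#
# _example_messages = [
#     {"role": "system", "content": "You are a helpful and proficient coding assistant."},
#     {"role": "user", "content": "Write a function that reverses a string."},
# ]
# print(tokenizer.apply_chat_template(_example_messages, tokenize=False, add_generation_prompt=True))
# # Expected rendering:
# #   You are a helpful and proficient coding assistant.
# #
# #   USER: Write a function that reverses a string.
# #   ASSISTANT: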
@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        # ** DOCUMENTATION: Applying Chat Template **
        # Now that tokenizer.chat_template is set, this should work.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,  # Important to append "ASSISTANT:"
        )
        print(f"Formatted prompt using chat template:\n{text}")  # For debugging
    except Exception as e:
        print(f"Error applying chat template: {e}")
        # Provide a more informative error or fallback if needed.
        return f"Error: Could not apply chat template. Details: {e}. Ensure the tokenizer has a valid `chat_template` attribute."

    # Determine the device for inputs when the model spans multiple devices.
    # With device_map="auto", input tensors should go to the device of the first model block.
    input_device = (
        model.hf_device_map.get("", next(iter(model.hf_device_map.values())))
        if hasattr(model, "hf_device_map")
        else model.device
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)

    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,  # Pass tokenized inputs
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,  # Use EOS token for padding
        )

    # Decode only the newly generated tokens (strip the prompt portion).
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()

# --- Gradio Interface ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt_input = gr.Textbox(  # Renamed to avoid conflict with 'prompt' variable in function scope
                label="Prompt",
                show_label=True,
                lines=3,
                placeholder="Enter your coding prompt here...",
            )
        run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result_output = gr.Code(  # Renamed
                label="Generated Code",
                show_label=True,
                language="python",
                lines=20,
            )

    gr.on(
        triggers=[
            run_button.click,
            prompt_input.submit,
        ],
        fn=generate_code,
        inputs=[prompt_input],
        outputs=[result_output],
    )

if __name__ == "__main__":
    demo.launch(share=False, debug=True)
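# Deployment note (an assumption about hosting, not part of the original logic): for a
# GPU Space serving several users, Gradio's request queue is commonly enabled before
# launching, e.g.
#
#   demo.queue().launch(share=False, debug=True)
#
# so concurrent requests are queued instead of hitting the single model instance at once.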