import spaces  # Hugging Face Spaces GPU support; keep this import at the top of the file
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gradio as gr
import os

# --- Environment and PyTorch configuration ---
os.environ["TORCH_LINALG_PREFER_CUSOLVER"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")

# --- Model and tokenizer configuration ---
model_name = "FelixChao/vicuna-33b-coder"

# Quantization: to load the model in 4-bit (or 8-bit), pass a BitsAndBytesConfig to
# from_pretrained(). This is the main lever when you are VRAM-constrained.
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,         # optional: slightly better accuracy for a small memory cost
    bnb_4bit_quant_type="nf4",              # "nf4" (NormalFloat4) is recommended; "fp4" is the alternative
    bnb_4bit_compute_dtype=torch.bfloat16,  # computation dtype; use torch.float16 on pre-Ampere GPUs
)

# 8-bit alternative, if preferred over 4-bit:
# quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)

# Model loading with quantization. device_map="auto" is required so that accelerate
# places the quantized weights across the available GPU/CPU memory; do NOT call
# .to("cuda") on the model afterwards. Compute precision is governed by
# bnb_4bit_compute_dtype above, so torch_dtype does not need to be set here.
print(f"Loading model: {model_name} with 4-bit quantization")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit,
    device_map="auto",
    # trust_remote_code=True  # not needed for standard Llama/Vicuna architectures
)

print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer `pad_token` was None; set to `eos_token`: {tokenizer.eos_token}")

# Note: with device_map="auto" the model may be sharded across devices, so there is no
# single canonical device for it. Inputs should go to the device of the first shard;
# `model.device` resolves to it, and `model.hf_device_map` records the full placement.
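
# Optional diagnostics (a sketch, not part of the original app; assumes the model loaded
# on a CUDA machine). `hf_device_map` is populated by accelerate when device_map="auto"
# is used, and get_memory_footprint() reports the size of the quantized weights.
if torch.cuda.is_available():
    print(f"bf16 supported: {torch.cuda.is_bf16_supported()}")  # informs the bnb_4bit_compute_dtype choice
if hasattr(model, "hf_device_map"):
    print(f"Device map: {model.hf_device_map}")
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")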
@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        return f"Error: Could not apply chat template. ({e})"

    # With device_map="auto" the model may span several devices. Inputs belong on the
    # device of the first shard; `model.device` resolves to it, with `hf_device_map`
    # as a fallback if that attribute is unavailable.
    input_device = model.device if hasattr(model, "device") else model.hf_device_map.get("", "cuda:0")
    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the prompt tokens and decode only the newly generated portion.
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()

# --- Gradio interface ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox(
                label="Prompt",
                show_label=True,
                lines=3,
                placeholder="Enter your coding prompt here...",
            )
        run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code(
                label="Generated Code",
                show_label=True,
                language="python",
                lines=20,
            )
        gr.on(
            triggers=[run_button.click, prompt.submit],
            fn=generate_code,
            inputs=[prompt],
            outputs=[result],
        )

if __name__ == "__main__":
    demo.launch(share=False, debug=True)
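
# Optional CLI smoke test (commented out; a sketch that assumes enough free VRAM for the
# 4-bit model). Swap it in for demo.launch(...) above to exercise generate_code()
# directly without the Gradio UI:
#
# if __name__ == "__main__":
#     print(generate_code("Write a Python function that checks whether a string is a palindrome."))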