# Hugging Face Space app ("Running on Zero" / ZeroGPU)
import spaces  # If using Hugging Face Spaces (ZeroGPU)
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig  # Import BitsAndBytesConfig
import torch
import gradio as gr
import os
# --- Environment and PyTorch configuration (kept from the original code) ---
# Use os.environ assignments (not os.putenv) so the values are visible to libraries
# reading the environment in this same process (e.g. huggingface_hub).
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
# --- Model and Tokenizer Configuration ---
model_name = "FelixChao/vicuna-33b-coder"

# ** DOCUMENTATION: Quantization Configuration **
# To load the model in 4-bit (or 8-bit), you now use BitsAndBytesConfig rather than
# passing load_in_4bit/load_in_8bit directly to from_pretrained.
# This is useful if you're VRAM-constrained.

# Example for 4-bit quantization:
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,  # Optional: quantizes the quantization constants as well, saving roughly 0.4 bits per parameter
    bnb_4bit_quant_type="nf4",       # Recommended: "nf4" (NormalFloat4); "fp4" is the floating-point 4-bit alternative
    bnb_4bit_compute_dtype=torch.bfloat16  # Or torch.float16. Computation happens in this dtype;
                                           # bfloat16 is good if your GPU supports it (Ampere onwards).
)
# Example for 8-bit quantization (if you prefer that over 4-bit):
# print("Setting up 8-bit quantization config...")
# quantization_config_8bit = BitsAndBytesConfig(
#     load_in_8bit=True
# )
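
# A minimal sketch (not in the original code): if you are unsure whether the GPU supports
# bfloat16, you could pick the compute dtype at runtime and pass it as bnb_4bit_compute_dtype
# above. `compute_dtype` is an illustrative name introduced here, not part of the original.
compute_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)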
# ** DOCUMENTATION: Model Loading with Quantization **
print(f"Loading model: {model_name} with quantization")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit,  # Pass the config here
    device_map="auto",  # CRITICAL: use device_map="auto" for quantized models.
                        # It automatically distributes the model across available GPU/CPU memory as needed.
                        # Do NOT call .to('cuda') afterwards when using device_map="auto" with quantization.
    # torch_dtype="auto",  # With device_map="auto" and quantization the dtype is usually handled for you;
                           # bnb_4bit_compute_dtype in BitsAndBytesConfig sets the compute precision.
    # trust_remote_code=True,  # As discussed, generally not needed for Vicuna
)
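
# Optional sanity check (illustrative, not in the original code): report the memory footprint
# of the quantized weights and how device_map="auto" placed the modules.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"Device map: {getattr(model, 'hf_device_map', 'not available')}")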
print(f"Loading tokenizer: {model_name}") | |
tokenizer = AutoTokenizer.from_pretrained( | |
model_name, | |
# trust_remote_code=True, | |
use_fast=True | |
) | |
if tokenizer.pad_token is None: | |
tokenizer.pad_token = tokenizer.eos_token | |
print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}") | |
# Note: model.config.pad_token_id is usually set by the tokenizer or handled by `generate`.
# With device_map="auto" the model may be spread across several devices, so there is no single
# "model device" in the traditional sense. Inputs should go to the device of the first model
# block; `generate` usually handles this, or you can look it up via model.hf_device_map.
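
# Illustrative helper (an assumption, not part of the original code): resolve the device that
# inputs should be moved to when device_map="auto" may have split the model. The "" key covers
# the whole model when it fits on one device; otherwise fall back to the device of the first
# mapped module (normally the embedding layer).
def get_input_device(m):
    device_map = getattr(m, "hf_device_map", None)
    if device_map:
        return device_map.get("", next(iter(device_map.values())))
    return m.device

# Assumption: since this Space imports `spaces` and runs on ZeroGPU, the GPU-bound function is
# decorated with @spaces.GPU so a GPU is attached for the duration of each call.
@spaces.GPU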
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt}
    ]
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        return f"Error: Could not apply chat template. ({e})"
    # With device_map="auto" the model may span several devices. In practice, moving the inputs
    # to model.device (the device of the first parameters) is sufficient, and `generate` handles
    # placement from there; alternatively, look up the first entry of model.hf_device_map.
    model_inputs = tokenizer([text], return_tensors="pt")
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=model_inputs.input_ids.to(model.device),           # move inputs to the device of the first model block
            attention_mask=model_inputs.attention_mask.to(model.device),
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (everything after the prompt).
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()
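
# Quick local smoke test (illustrative; left commented out so it does not trigger GPU work at
# import time on Spaces):
# print(generate_code("Write a Python function that checks whether a string is a palindrome."))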
# --- Gradio Interface (kept mostly from the original code) ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:  # Updated title
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox(
                label="Prompt",
                show_label=True,
                lines=3,
                placeholder="Enter your coding prompt here...",
            )
            run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code(
                label="Generated Code",
                show_label=True,
                language="python",
                lines=20,
            )
    gr.on(
        triggers=[
            run_button.click,
            prompt.submit
        ],
        fn=generate_code,
        inputs=[prompt],
        outputs=[result],
    )
if __name__ == "__main__":
    demo.launch(share=False, debug=True)
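
# Assumed dependencies (not listed in the original): a requirements.txt roughly like the
# following is needed for 4-bit loading and the UI; exact versions are an assumption.
#   torch
#   transformers
#   accelerate      # required for device_map="auto"
#   bitsandbytes    # required for BitsAndBytesConfig 4-/8-bit loading
#   gradio
#   hf_transfer     # only because HF_HUB_ENABLE_HF_TRANSFER=1 is set above
#   spaces          # usually pre-installed on ZeroGPU Spaces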