import spaces # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Import BitsAndBytesConfig
import torch
import gradio as gr
import os
# --- Environment and PyTorch Configurations ---
# Use os.environ assignments (rather than os.putenv) so the values are visible to
# Python libraries that read os.environ; ideally these are set before importing
# transformers/huggingface_hub, since some of them are read at import time.
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
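# The flags below favor full-precision matmul/convolution results over speed: TF32 and
# reduced-precision fp16/bf16 accumulations are disabled, while cudnn.benchmark=True
# still lets cuDNN autotune kernels for fixed input shapes.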
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
# --- Model and Tokenizer Configuration ---
model_name = "FelixChao/vicuna-33b-coder"
# ** DOCUMENTATION: Quantization Configuration **
# To load the model in 4-bit (or 8-bit), pass a BitsAndBytesConfig to from_pretrained().
# This is useful when VRAM is constrained.
# 4-bit quantization config:
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
load_in_4bit=True,
    bnb_4bit_use_double_quant=True, # Optional: also quantizes the quantization constants, saving a little extra memory
bnb_4bit_quant_type="nf4", # Recommended: "nf4" (NormalFloat4) or "fp4" for floating point 4-bit
bnb_4bit_compute_dtype=torch.bfloat16 # Or torch.float16. Computation will happen in this dtype.
# bfloat16 is good if your GPU supports it (Ampere series onwards)
)
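# A minimal sketch of picking the compute dtype at runtime (assumes a CUDA device is
# visible when this module is imported); bfloat16 is used only if the GPU supports it:
# compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
# quantization_config_4bit = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=compute_dtype,
# )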
# Example for 8-bit quantization (if you prefer that over 4-bit):
# print("Setting up 8-bit quantization config...")
# quantization_config_8bit = BitsAndBytesConfig(
# load_in_8bit=True
# )
# ** DOCUMENTATION: Model Loading with Quantization **
print(f"Loading model: {model_name} with quantization")
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=quantization_config_4bit, # Pass the config here
device_map="auto", # CRITICAL: Use device_map="auto" for quantized models.
# It automatically distributes the model across available GPUs/CPU memory as needed.
# Do NOT use .to('cuda') after this when using device_map="auto" with quantization.
# torch_dtype="auto", # With device_map="auto" and quantization, dtype is often handled,
# but bnb_4bit_compute_dtype in BitsAndBytesConfig specifies compute precision.
    # trust_remote_code=True, # Generally not needed for Vicuna-family models
)
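# Optional: report the approximate memory footprint of the quantized weights
# (PreTrainedModel.get_memory_footprint is provided by transformers):
# print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")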
print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
model_name,
# trust_remote_code=True,
use_fast=True
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
# Note: model.config.pad_token_id is usually set by the tokenizer or handled by generate().
# With device_map="auto" the model may be sharded across several devices, so there is no
# single "model device" in the traditional sense. Inputs need to end up on the device that
# holds the embedding layer; `model.device` (the device of the model's first parameter)
# is the usual way to obtain it, and `model.hf_device_map` records where each sub-module
# was placed.
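# A minimal sketch for inspecting the placement chosen by device_map="auto"
# (hf_device_map is the module-name -> device mapping populated by accelerate):
# for module_name, device in model.hf_device_map.items():
#     print(f"{module_name or '<root>'} -> {device}")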
@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
messages = [
{"role": "system", "content": "You are a helpful and proficient coding assistant."},
{"role": "user", "content": prompt}
]
try:
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
except Exception as e:
print(f"Error applying chat template: {e}")
return f"Error: Could not apply chat template. ({e})"
    # With device_map="auto" the model may span several devices. Inputs must live on the
    # device of the embedding layer; `model.device` (the device of the first parameter)
    # is the standard way to get it, and `model.generate` keeps intermediate tensors on
    # the right devices from there.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
max_new_tokens=1024,
min_new_tokens=256,
do_sample=True,
temperature=0.7,
top_p=0.9,
pad_token_id=tokenizer.eos_token_id
)
    # Strip the prompt tokens and decode only the newly generated portion.
response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
response = tokenizer.decode(response_ids, skip_special_tokens=True)
return response.strip()
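# Hypothetical quick check of generate_code (e.g. from a REPL once the model has loaded):
# print(generate_code("Write a Python function that reverses a string."))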
# --- Gradio Interface ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:
with gr.Tab("Code Chat"):
gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
with gr.Row():
prompt = gr.Textbox(
label="Prompt",
show_label=True,
lines=3,
placeholder="Enter your coding prompt here...",
)
run_button = gr.Button("Generate Code", variant="primary")
with gr.Row():
result = gr.Code(
label="Generated Code",
show_label=True,
language="python",
lines=20,
)
gr.on(
triggers=[
run_button.click,
prompt.submit
],
fn=generate_code,
inputs=[prompt],
outputs=[result],
)
if __name__ == "__main__":
demo.launch(share=False, debug=True)