import spaces # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Import BitsAndBytesConfig
import torch
import gradio as gr
import os
# --- Environment and PyTorch Configurations ---
# Use os.environ (not os.putenv) so the variables are also visible to Python libraries
# that read os.environ (e.g. huggingface_hub for HF_HUB_ENABLE_HF_TRANSFER).
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
# --- Model and Tokenizer Configuration ---
model_name = "FelixChao/vicuna-33b-coder"
# ** DOCUMENTATION: Quantization Configuration **
# To load the model in 4-bit (or 8-bit), you now use BitsAndBytesConfig.
# This is useful if you're VRAM-constrained.
# Example for 4-bit quantization:
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,        # Optional: also quantizes the quantization constants, saving a little extra memory
    bnb_4bit_quant_type="nf4",             # Recommended: "nf4" (NormalFloat4); "fp4" is the floating-point 4-bit alternative
    bnb_4bit_compute_dtype=torch.bfloat16  # Or torch.float16. Computation happens in this dtype;
                                           # bfloat16 is a good choice if your GPU supports it (Ampere onwards).
)
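# Optional sketch (not wired in above): choose the compute dtype based on hardware support,
# falling back to float16 on GPUs without bfloat16 support. Assumes CUDA is available at startup.
# compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
# quantization_config_4bit = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=compute_dtype)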
# Example for 8-bit quantization (if you prefer that over 4-bit):
# print("Setting up 8-bit quantization config...")
# quantization_config_8bit = BitsAndBytesConfig(
# load_in_8bit=True
# )
# ** DOCUMENTATION: Model Loading with Quantization **
print(f"Loading model: {model_name} with quantization")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit,  # Pass the quantization config here
    device_map="auto",  # CRITICAL: use device_map="auto" for quantized models. It automatically
                        # distributes the model across available GPU/CPU memory as needed.
                        # Do NOT call .to('cuda') afterwards when combining device_map="auto" with quantization.
    # torch_dtype="auto",     # With device_map="auto" and quantization the dtype is usually handled for you;
                              # bnb_4bit_compute_dtype in BitsAndBytesConfig sets the compute precision.
    # trust_remote_code=True  # Generally not needed for Vicuna
)
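# Optional diagnostic: report roughly how much memory the quantized model occupies.
# get_memory_footprint() is a standard transformers PreTrainedModel helper in recent versions.
print(f"Model loaded. Approximate memory footprint: {model.get_memory_footprint() / (1024 ** 3):.2f} GiB")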
print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # trust_remote_code=True,
    use_fast=True
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
# Notes:
# - model.config.pad_token_id is usually set by the tokenizer or handled by `generate`.
# - With device_map="auto" the model may be spread across devices, so there is not always a single
#   `model.device` in the traditional sense. Inputs should go to the device of the first module;
#   `generate` usually handles this, or the device can be looked up via model.hf_device_map.
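# Optional diagnostic: show how accelerate placed the model when device_map="auto" was used.
# hf_device_map is populated by accelerate; getattr guards against setups where it is absent.
print(f"Device map: {getattr(model, 'hf_device_map', None)}")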
@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt}
    ]
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        return f"Error: Could not apply chat template. ({e})"

    # With device_map="auto" the model may span several devices. `generate` usually places inputs
    # correctly on its own, but to be explicit we move them to the model's first device
    # (model.device when available, otherwise the device of the first entry in hf_device_map).
    input_device = model.device if hasattr(model, "device") else next(iter(model.hf_device_map.values()))
    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (everything after the prompt).
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()
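# Optional sketch (not wired into the app): stream tokens as they are generated instead of returning
# the full completion at once. It uses transformers' TextIteratorStreamer with `generate` running in a
# background thread; the settings mirror generate_code above, but the function itself is illustrative.
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def generate_code_streaming(prompt: str):
#     text = tokenizer.apply_chat_template(
#         [{"role": "user", "content": prompt}], tokenize=False, add_generation_prompt=True
#     )
#     inputs = tokenizer([text], return_tensors="pt").to(model.device)
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024)).start()
#     partial = ""
#     for new_text in streamer:
#         partial += new_text
#         yield partial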
# --- Gradio Interface ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox(
                label="Prompt",
                show_label=True,
                lines=3,
                placeholder="Enter your coding prompt here...",
            )
            run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code(
                label="Generated Code",
                show_label=True,
                language="python",
                lines=20,
            )
        gr.on(
            triggers=[
                run_button.click,
                prompt.submit
            ],
            fn=generate_code,
            inputs=[prompt],
            outputs=[result],
        )
if __name__ == "__main__":
    demo.launch(share=False, debug=True)