import spaces # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import os
# --- Environment and PyTorch Configurations (Kept from your original code) ---
# Use `os.environ` rather than `os.putenv`: `putenv` does not update `os.environ`,
# so Python libraries that read `os.environ` (e.g. huggingface_hub) may never see those values.
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'  # Be mindful of this with a larger model
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'  # Only takes effect if the `hf_transfer` package is installed
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False # Setting to False can sometimes improve performance if exact reproducibility isn't critical
torch.backends.cudnn.benchmark = True # Setting to True can help if input sizes are consistent
# torch.backends.cuda.preferred_blas_library="cublas" # These are generally defaults or fine
# torch.backends.cuda.preferred_linalg_library="cusolver" # These are generally defaults or fine
torch.set_float32_matmul_precision("highest") # Or "high" if "highest" causes issues
# --- Model and Tokenizer Configuration ---
# ** MODIFICATION 1: Update the model_name **
model_name = "FelixChao/vicuna-33b-coder"
# ** DOCUMENTATION: Model and Tokenizer Loading **
# Load model and tokenizer.
# `torch_dtype="auto"` lets transformers pick a sensible dtype for the non-quantized modules.
# With `load_in_8bit=True` and `device_map="auto"`, the model is quantized and placed on the
# available device(s) automatically, so no explicit `.to('cuda', ...)` call is needed (or allowed).
# - Note: `FelixChao/vicuna-33b-coder` is a large model.
#   Loading it in full bfloat16 would require substantial VRAM (~66GB+), so this app loads it
#   in 8-bit via `load_in_8bit=True` (requires the `bitsandbytes` library).
#   If that still exceeds your VRAM, consider 4-bit loading (`load_in_4bit=True`).
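# A hedged sketch of the 4-bit option mentioned above (an alternative, not what this app uses;
# it assumes `bitsandbytes` is installed and a transformers version that provides `BitsAndBytesConfig`):
#
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_quant_type="nf4",
# )
# model = AutoModelForCausalLM.from_pretrained(
#     model_name, quantization_config=bnb_config, device_map="auto"
# )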
# ** MODIFICATION 2: Removed `trust_remote_code=True` (typically not needed for Vicuna) **
print(f"Loading model: {model_name}")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",      # Sensible dtype for the non-quantized modules
    load_in_8bit=True,       # 8-bit quantization via `bitsandbytes`; roughly halves VRAM vs. fp16/bf16
    device_map="auto",       # Distributes layers across available GPUs, or offloads to CPU if needed
    # trust_remote_code=True # Removed: Generally not needed for Vicuna/Llama models
)
# Note: the original `.to('cuda', torch.bfloat16)` call was removed. An 8-bit quantized model is
# already placed by `device_map="auto"`, and calling `.to()` with a dtype on it raises an error.
# ** MODIFICATION 3: Removed `trust_remote_code=True` for tokenizer **
print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # trust_remote_code=True,  # Removed: Generally not needed
    use_fast=True
)
# ** DOCUMENTATION: Pad Token **
# Vicuna/Llama models usually use EOS token as PAD token if PAD is not explicitly set.
# This is often handled internally by the tokenizer or generation functions.
# If you encounter issues related to padding, you might need to set it explicitly:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
# Ensure the model's pad_token_id is also configured if necessary,
# though `generate` usually handles this.
model.config.pad_token_id = tokenizer.pad_token_id
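# Optional, hedged sanity check (assumes `get_memory_footprint()` is available on this
# transformers version): report how much memory the quantized model actually occupies.
# print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")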
@spaces.GPU(required=True) # For Hugging Face Spaces GPU allocation
def generate_code(prompt: str) -> str:
"""
Generates code based on the given prompt using the loaded Vicuna model.
Args:
prompt: The user's input prompt for code generation.
Returns:
The generated code as a string.
"""
# ** MODIFICATION 4: Update the system prompt for Vicuna **
# ** DOCUMENTATION: Chat Template Messages **
# The `messages` format is used by `tokenizer.apply_chat_template`.
# The system prompt provides context/instructions to the model.
# For Vicuna, a generic helpful assistant prompt is suitable.
messages = [
{"role": "system", "content": "You are a helpful and proficient coding assistant."},
{"role": "user", "content": prompt}
]
# ** DOCUMENTATION: Applying Chat Template **
# `apply_chat_template` formats the input messages according to the model's specific template.
# `tokenize=False` returns a formatted string.
# `add_generation_prompt=True` ensures the template is set up for the model to start generating a response.
try:
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
except Exception as e:
print(f"Error applying chat template: {e}")
# Fallback or simpler prompt structure if template is missing/problematic
# This is a basic fallback, actual Vicuna instruction format might be more specific
# e.g., "USER: {prompt}\nASSISTANT:"
# However, `apply_chat_template` is the preferred method.
# If this fails, the tokenizer for `FelixChao/vicuna-33b-coder` might be missing its template.
return f"Error: Could not apply chat template. The model's tokenizer might be misconfigured. ({e})"
    # ** DOCUMENTATION: Tokenization **
    # Tokenize the formatted text and move inputs to the model's device (GPU).
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # ** DOCUMENTATION: Code Generation **
    # Generate code using the model.
    # `torch.no_grad()` disables gradient calculations, saving memory and computation during inference.
    # `max_new_tokens`: Maximum number of new tokens to generate.
    # `min_new_tokens`: Minimum number of new tokens to generate.
    # `do_sample=True`: Enables sampling for more diverse outputs. If False, uses greedy decoding.
    # Note: `low_memory` is not a standard Hugging Face `generate` parameter and has been removed;
    # memory savings should instead come from quantization or offloading (see the loading section above).
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,                     # Common sampling parameter, tune as needed
            top_p=0.9,                           # Common sampling parameter, tune as needed
            pad_token_id=tokenizer.eos_token_id  # Important for open-ended generation
            # guidance_scale=3.8,                # Classifier-Free Guidance; needs a specific setup, omitted here
        )
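    # Hedged alternative (an assumption about desired UX, not part of the original app): for long
    # generations, tokens can be streamed back to the UI with `transformers.TextIteratorStreamer`
    # by running `model.generate` in a background thread and yielding text chunks from the streamer:
    #
    # from transformers import TextIteratorStreamer
    # from threading import Thread
    # streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Thread(target=model.generate,
    #        kwargs=dict(**model_inputs, max_new_tokens=1024, do_sample=True, streamer=streamer)).start()
    # for chunk in streamer:
    #     ...  # yield partial text to Gradio for incremental display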
    # ** DOCUMENTATION: Decoding Response **
    # Remove the input tokens from the generated output to get only the response.
    # `generated_ids[0]` because we process one prompt at a time.
    # `model_inputs.input_ids[0]` is the input part.
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]

    # Decode the generated token IDs back into a string.
    # `skip_special_tokens=True` removes tokens like <s>, </s>, <unk>, etc.
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()
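# Example usage (a hypothetical local smoke test, not executed on import; on Spaces this
# function is normally only called through the Gradio UI defined below):
#
#   print(generate_code("Write a Python function that reverses a string."))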
# --- Gradio Interface (Kept mostly from your original code) ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:  # Updated title
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox(  # Changed to Textbox for potentially longer prompts
                label="Prompt",
                show_label=True,
                lines=3,  # Allow a few lines for the prompt
                placeholder="Enter your coding prompt here...",
            )
            run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code(  # Using gr.Code for better code display
                label="Generated Code",
                show_label=True,
                language="python",  # Default display language; adjust if you expect other languages
                lines=20,
            )

    # Gradio event listener: run generation on button click or when the prompt Textbox is submitted.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=generate_code,
        inputs=[prompt],
        outputs=[result],
        # api_name="generate_code"  # Uncomment to expose this as a named API endpoint
    )
if __name__ == "__main__":
    # ** DOCUMENTATION: Launching Gradio **
    # `share=False` by default; set to True if you want a public link (requires internet access).
    # `debug=True` is helpful during development for more detailed error output.
    # Pass `server_name="0.0.0.0"` to make the app accessible on your local network.
    demo.launch(share=False, debug=True)