Spaces: Running on Zero
File size: 6,519 Bytes
import spaces # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Import BitsAndBytesConfig
import torch
import gradio as gr
import os
# --- Environment and PyTorch Configurations ---
# Use os.environ (not os.putenv) so the settings are visible to libraries that read
# os.environ inside this process.
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
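# Note: the matmul/cuDNN flags above disable TF32 and reduced-precision reductions,
# favoring full fp32 accuracy over speed; cudnn.benchmark=True lets cuDNN autotune
# kernels for fixed input shapes.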
# --- Model and Tokenizer Configuration ---
model_name = "FelixChao/vicuna-33b-coder"
# --- Quantization Configuration (4-bit) ---
# Remove or comment out this block if you are not using quantization.
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16
)
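# Alternative (sketch, not used here): bitsandbytes also supports 8-bit loading,
# which roughly halves memory versus fp16 at a different quality/speed trade-off.
# quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)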
print(f"Loading model: {model_name} with quantization")
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=quantization_config_4bit, # Comment out if not using quantization
device_map="auto",
)
print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
model_name,
use_fast=True
)
# --- Vicuna Chat Template ---
# Vicuna models expect a specific chat format. If the tokenizer doesn't ship a
# chat template, set one manually. This template handles a system prompt, user
# messages, and assistant responses, and appends the "ASSISTANT:" prompt for
# generation when needed.
VICUNA_CHAT_TEMPLATE = (
"{% if messages[0]['role'] == 'system' %}" # Check if the first message is a system prompt
"{{ messages[0]['content'] + '\\n\\n' }}" # Add system prompt with two newlines
"{% set loop_messages = messages[1:] %}" # Slice to loop over remaining messages
"{% else %}"
"{% set loop_messages = messages %}" # No system prompt, loop over all messages
"{% endif %}"
"{% for message in loop_messages %}" # Loop through user and assistant messages
"{% if message['role'] == 'user' %}"
"{{ 'USER: ' + message['content'].strip() + '\\n' }}"
"{% elif message['role'] == 'assistant' %}"
"{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\\n' }}"
"{% endif %}"
"{% endfor %}"
"{% if add_generation_prompt %}" # If we need to prompt the model for a response
"{% if messages[-1]['role'] != 'assistant' %}" # And the last message wasn't from the assistant
"{{ 'ASSISTANT:' }}" # Add the assistant prompt
"{% endif %}"
"{% endif %}"
)
tokenizer.chat_template = VICUNA_CHAT_TEMPLATE
print("Manually set Vicuna chat template on the tokenizer.")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Also update the model config's pad_token_id if you are setting tokenizer.pad_token.
    # This is crucial if the model's config doesn't get updated automatically.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id
    print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt}
    ]
    try:
        # Apply the Vicuna chat template that was set on the tokenizer above.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True  # Important: appends "ASSISTANT:" for generation
        )
        print(f"Formatted prompt using chat template:\n{text}")  # For debugging
    except Exception as e:
        print(f"Error applying chat template: {e}")
        # Provide a more informative error or fallback if needed
        return f"Error: Could not apply chat template. Details: {e}. Ensure the tokenizer has a valid `chat_template` attribute."
    # Determine the device for the inputs. With device_map="auto", input tensors
    # should go to the device that holds the first model block.
    if hasattr(model, "hf_device_map"):
        input_device = model.hf_device_map.get("", next(iter(model.hf_device_map.values())))
    else:
        input_device = model.device
    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,  # Pass tokenized inputs
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id  # Use EOS token for padding
        )
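    # Note (assumption about desired behavior): for more reproducible output you could
    # pass do_sample=False (greedy decoding) instead of the temperature/top_p sampling above.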
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()
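# Quick local sanity check (sketch; the prompt below is only an illustrative example).
# On a ZeroGPU Space the @spaces.GPU decorator allocates a GPU when the function is
# invoked, so this is normally exercised through the Gradio UI instead.
# print(generate_code("Write a Python function that reverses a string."))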
# --- Gradio Interface ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt_input = gr.Textbox(
                label="Prompt",
                show_label=True,
                lines=3,
                placeholder="Enter your coding prompt here...",
            )
            run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result_output = gr.Code(
                label="Generated Code",
                show_label=True,
                language="python",
                lines=20,
            )
    gr.on(
        triggers=[
            run_button.click,
            prompt_input.submit
        ],
        fn=generate_code,
        inputs=[prompt_input],
        outputs=[result_output],
    )
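# Optional (assumption about deployment needs): call demo.queue() before launch to
# queue incoming requests, which can help when GPU generations are slow.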
if __name__ == "__main__":
    demo.launch(share=False, debug=True)