import spaces # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Import BitsAndBytesConfig
import torch
import gradio as gr
import os
# --- Environment and PyTorch Configurations ---
# Use os.environ (not os.putenv) so the variables are also visible to Python libraries
# that read os.environ (e.g. huggingface_hub for HF_HUB_ENABLE_HF_TRANSFER).
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")
# --- Model and Tokenizer Configuration ---
model_name = "FelixChao/vicuna-33b-coder"
# ** DOCUMENTATION: Quantization Configuration **
# To load the model in 4-bit (or 8-bit), you now use BitsAndBytesConfig.
# This is useful if you're VRAM-constrained.
# Example for 4-bit quantization:
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,        # Optional: also quantizes the quantization constants, saving a little extra memory
    bnb_4bit_quant_type="nf4",             # Recommended: "nf4" (NormalFloat4); "fp4" is the floating-point 4-bit alternative
    bnb_4bit_compute_dtype=torch.bfloat16  # Or torch.float16. Computation happens in this dtype;
                                           # bfloat16 is a good choice if your GPU supports it (Ampere onwards).
)
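# Optional sketch (not wired in above): choose the compute dtype based on hardware support,
# falling back to float16 on GPUs without bfloat16 support. Assumes CUDA is available at startup.
# compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
# quantization_config_4bit = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=compute_dtype)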
# Example for 8-bit quantization (if you prefer that over 4-bit):
# print("Setting up 8-bit quantization config...")
# quantization_config_8bit = BitsAndBytesConfig(
# load_in_8bit=True
# )
# ** DOCUMENTATION: Model Loading with Quantization **
print(f"Loading model: {model_name} with quantization")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit,  # Pass the quantization config here
    device_map="auto",  # CRITICAL: use device_map="auto" for quantized models. It automatically
                        # distributes the model across available GPU/CPU memory as needed.
                        # Do NOT call .to('cuda') afterwards when combining device_map="auto" with quantization.
    # torch_dtype="auto",     # With device_map="auto" and quantization the dtype is usually handled for you;
                              # bnb_4bit_compute_dtype in BitsAndBytesConfig sets the compute precision.
    # trust_remote_code=True  # Generally not needed for Vicuna
)
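# Optional diagnostic: report roughly how much memory the quantized model occupies.
# get_memory_footprint() is a standard transformers PreTrainedModel helper in recent versions.
print(f"Model loaded. Approximate memory footprint: {model.get_memory_footprint() / (1024 ** 3):.2f} GiB")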
print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # trust_remote_code=True,
    use_fast=True
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")
# Notes:
# - model.config.pad_token_id is usually set by the tokenizer or handled by `generate`.
# - With device_map="auto" the model may be spread across devices, so there is not always a single
#   `model.device` in the traditional sense. Inputs should go to the device of the first module;
#   `generate` usually handles this, or the device can be looked up via model.hf_device_map.
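# Optional diagnostic: show how accelerate placed the model when device_map="auto" was used.
# hf_device_map is populated by accelerate; getattr guards against setups where it is absent.
print(f"Device map: {getattr(model, 'hf_device_map', None)}")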
@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt}
    ]
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        return f"Error: Could not apply chat template. ({e})"

    # With device_map="auto" the model may span several devices. `generate` usually places inputs
    # correctly on its own, but to be explicit we move them to the model's first device
    # (model.device when available, otherwise the device of the first entry in hf_device_map).
    input_device = model.device if hasattr(model, "device") else next(iter(model.hf_device_map.values()))
    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens (everything after the prompt).
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()
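# Optional sketch (not wired into the app): stream tokens as they are generated instead of returning
# the full completion at once. It uses transformers' TextIteratorStreamer with `generate` running in a
# background thread; the settings mirror generate_code above, but the function itself is illustrative.
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def generate_code_streaming(prompt: str):
#     text = tokenizer.apply_chat_template(
#         [{"role": "user", "content": prompt}], tokenize=False, add_generation_prompt=True
#     )
#     inputs = tokenizer([text], return_tensors="pt").to(model.device)
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024)).start()
#     partial = ""
#     for new_text in streamer:
#         partial += new_text
#         yield partial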
# --- Gradio Interface ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox(
                label="Prompt",
                show_label=True,
                lines=3,
                placeholder="Enter your coding prompt here...",
            )
            run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code(
                label="Generated Code",
                show_label=True,
                language="python",
                lines=20,
            )
        gr.on(
            triggers=[
                run_button.click,
                prompt.submit
            ],
            fn=generate_code,
            inputs=[prompt],
            outputs=[result],
        )
if __name__ == "__main__":
    demo.launch(share=False, debug=True)