import os

# --- Environment Configuration ---
# Use os.environ (not os.putenv, which does not update os.environ) and set these before
# importing torch/transformers so that libraries reading the environment at import time
# actually pick them up.
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

import spaces  # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import gradio as gr

# --- PyTorch Backend Configuration ---

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")

# --- Model and Tokenizer Configuration ---
model_name = "FelixChao/vicuna-33b-coder"

# ** DOCUMENTATION: Quantization Configuration **
# 4-bit (or 8-bit) loading is configured through BitsAndBytesConfig and passed to
# from_pretrained() via `quantization_config`. This is useful when you are VRAM-constrained.

# Example for 4-bit quantization:
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,    # Optional: nested quantization; saves additional memory by quantizing the quantization constants
    bnb_4bit_quant_type="nf4",         # Recommended: "nf4" (NormalFloat4) or "fp4" for floating point 4-bit
    bnb_4bit_compute_dtype=torch.bfloat16 # Or torch.float16. Computation will happen in this dtype.
                                        # bfloat16 is good if your GPU supports it (Ampere series onwards)
)

# Example for 8-bit quantization (if you prefer that over 4-bit):
# print("Setting up 8-bit quantization config...")
# quantization_config_8bit = BitsAndBytesConfig(
#     load_in_8bit=True
# )

# ** DOCUMENTATION: Model Loading with Quantization **
print(f"Loading model: {model_name} with quantization")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit, # Pass the config here
    device_map="auto", # CRITICAL: Use device_map="auto" for quantized models.
                       # It automatically distributes the model across available GPUs/CPU memory as needed.
                       # Do NOT use .to('cuda') after this when using device_map="auto" with quantization.
    # torch_dtype="auto", # With device_map="auto" and quantization, dtype is often handled,
                          # but bnb_4bit_compute_dtype in BitsAndBytesConfig specifies compute precision.
    # trust_remote_code=True # Generally not needed for Vicuna (standard LLaMA architecture)
)
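
# Optional sanity check (a sketch, not required by the app): with device_map="auto" the
# model exposes hf_device_map, and get_memory_footprint() reports the quantized weight
# memory, so you can confirm the 4-bit load actually fits on the available GPU(s).
if hasattr(model, "hf_device_map"):
    print(f"Device map: {model.hf_device_map}")
print(f"Approx. model memory footprint: {model.get_memory_footprint() / 1024**3:.2f} GiB")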

print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # trust_remote_code=True,
    use_fast=True
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")

# Notes:
# - model.config.pad_token_id is usually resolved by the tokenizer or by generate().
# - With device_map="auto" the model may be sharded across devices, so there is no single
#   "model device" in the traditional sense. transformers still exposes model.device (the
#   device of the first parameters) and records per-module placement in model.hf_device_map;
#   generate() normally handles input placement correctly in this setup.
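
# Hypothetical helper (illustrative sketch, not used below): if inputs ever need to be
# placed manually on a sharded model, the first entry of hf_device_map is the device that
# receives the embeddings. The name `resolve_input_device` is ours, not a transformers API.
def resolve_input_device(m) -> torch.device:
    device_map = getattr(m, "hf_device_map", None)
    if device_map:
        first = next(iter(device_map.values()))  # placement of the first dispatched module
        return torch.device(f"cuda:{first}" if isinstance(first, int) else first)
    return m.device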

@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt}
    ]
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        return f"Error: Could not apply chat template. ({e})"

    # With device_map="auto" the model may be sharded across devices; moving the inputs to
    # model.device (the device of the first parameters) is sufficient, and generate() takes
    # care of any further cross-device movement.
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Strip the prompt tokens and decode only the newly generated text.
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()
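
# Example usage for quick local testing outside the Gradio UI (assumes a GPU is available
# and the quantized model fits in memory):
#   print(generate_code("Write a Python function that reverses a linked list."))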

# --- Gradio Interface ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox( 
                label="Prompt",
                show_label=True, 
                lines=3,        
                placeholder="Enter your coding prompt here...",
            )
        run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code( 
                label="Generated Code",
                show_label=True, 
                language="python", 
                lines=20,
            )
        gr.on(
            triggers=[
                run_button.click,
                prompt.submit 
            ],
            fn=generate_code,
            inputs=[prompt],
            outputs=[result],
        )

if __name__ == "__main__":
    demo.launch(share=False, debug=True)