import spaces # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Import BitsAndBytesConfig
import torch
import gradio as gr
import os

# --- Environment and PyTorch Configurations ---
# Set via os.environ (not os.putenv) so the values are visible to this process and to
# libraries that read os.environ. Note that some of these are read at import time, so
# setting them before the imports above may be necessary for them to take effect.
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("highest")

# --- Model and Tokenizer Configuration ---
model_name = "FelixChao/vicuna-33b-coder"

# --- Quantization Configuration (Example: 4-bit) ---
# 4-bit NF4 quantization (via bitsandbytes) substantially reduces the GPU memory
# needed for the 33B model. Remove or comment out this block if you are not using it.
print("Setting up 4-bit quantization config...")
quantization_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

print(f"Loading model: {model_name} with quantization")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_4bit, # Comment out if not using quantization
    device_map="auto",
)
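
# The quantized load above is optional. Two alternative load paths (sketches against the
# same transformers API; names here are illustrative, adjust dtypes/VRAM to your hardware):
#
# 8-bit quantization instead of 4-bit:
#     quantization_config_8bit = BitsAndBytesConfig(load_in_8bit=True)
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name, quantization_config=quantization_config_8bit, device_map="auto"
#     )
#
# No quantization (a 33B model in bfloat16 needs roughly 66 GB of GPU memory):
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name, torch_dtype=torch.bfloat16, device_map="auto"
#     )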

print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)

# --- Vicuna Chat Template ---
# Vicuna models expect a specific chat format. If the tokenizer does not ship with a
# built-in chat template, it must be set manually.
# This template handles a system prompt, user messages, and assistant responses, and
# appends the "ASSISTANT:" prompt for generation when requested.
VICUNA_CHAT_TEMPLATE = (
    "{% if messages[0]['role'] == 'system' %}"  # Check if the first message is a system prompt
        "{{ messages[0]['content'] + '\\n\\n' }}"  # Add system prompt with two newlines
        "{% set loop_messages = messages[1:] %}"  # Slice to loop over remaining messages
    "{% else %}"
        "{% set loop_messages = messages %}"  # No system prompt, loop over all messages
    "{% endif %}"
    "{% for message in loop_messages %}"  # Loop through user and assistant messages
        "{% if message['role'] == 'user' %}"
            "{{ 'USER: ' + message['content'].strip() + '\\n' }}"
        "{% elif message['role'] == 'assistant' %}"
            "{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\\n' }}"
        "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"  # If we need to prompt the model for a response
        "{% if messages[-1]['role'] != 'assistant' %}" # And the last message wasn't from the assistant
            "{{ 'ASSISTANT:' }}"  # Add the assistant prompt
        "{% endif %}"
    "{% endif %}"
)
tokenizer.chat_template = VICUNA_CHAT_TEMPLATE
print("Manually set Vicuna chat template on the tokenizer.")


if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Also update the model config's pad_token_id if you are setting tokenizer.pad_token
    # This is crucial if the model's config doesn't get updated automatically.
    if model.config.pad_token_id is None:
         model.config.pad_token_id = tokenizer.pad_token_id
    print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")


@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt}
    ]
    try:
        # Apply the manually set chat template to build the Vicuna-formatted prompt.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True # Important to append "ASSISTANT:"
        )
        print(f"Formatted prompt using chat template:\n{text}") # For debugging
    except Exception as e:
        print(f"Error applying chat template: {e}")
        # Provide a more informative error or fallback if needed
        return f"Error: Could not apply chat template. Details: {e}. Ensure the tokenizer has a valid `chat_template` attribute."

    # Determine the device for the input tensors. With device_map="auto" the model may be
    # sharded; inputs should go to the device that holds the first model block.
    if hasattr(model, "hf_device_map"):
        input_device = model.hf_device_map.get("", next(iter(model.hf_device_map.values())))
    else:
        input_device = model.device

    model_inputs = tokenizer([text], return_tensors="pt").to(input_device)

    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs, # Pass tokenized inputs
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id # Use EOS token for padding
        )

    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response.strip()
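
# Quick manual check (hypothetical usage, not part of the app): uncomment to call the
# generator directly and inspect a raw completion before wiring up the Gradio UI.
# print(generate_code("Write a Python function that reverses a string."))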

# --- Gradio Interface ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt_input = gr.Textbox(
                label="Prompt",
                show_label=True,
                lines=3,
                placeholder="Enter your coding prompt here...",
            )
        run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result_output = gr.Code(
                label="Generated Code",
                show_label=True,
                language="python",
                lines=20,
            )
        gr.on(
            triggers=[
                run_button.click,
                prompt_input.submit
            ],
            fn=generate_code,
            inputs=[prompt_input],
            outputs=[result_output],
        )

if __name__ == "__main__":
    demo.launch(share=False, debug=True)