import spaces # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import os

# --- Environment and PyTorch Configurations (Kept from your original code) ---
# Use os.environ assignments rather than os.putenv: putenv does not update os.environ, so
# libraries that read os.environ in-process may never see these values. Ideally, set such
# variables before importing torch/transformers, since some are read at import time.
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128' # Be mindful of this with a larger model
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False # Setting to False can sometimes improve performance if exact reproducibility isn't critical
torch.backends.cudnn.benchmark = True     # Setting to True can help if input sizes are consistent
# torch.backends.cuda.preferred_blas_library="cublas" # These are generally defaults or fine
# torch.backends.cuda.preferred_linalg_library="cusolver" # These are generally defaults or fine
torch.set_float32_matmul_precision("highest") # Or "high" if "highest" causes issues

# --- Model and Tokenizer Configuration ---
# ** MODIFICATION 1: Update the model_name **
model_name = "FelixChao/vicuna-33b-coder"

# ** DOCUMENTATION: Model and Tokenizer Loading **
# Load model and tokenizer.
# `torch_dtype="auto"` lets transformers pick an appropriate dtype (e.g., bfloat16 if available).
#   - Note: `FelixChao/vicuna-33b-coder` is a large model.
#     Loading the full bfloat16 weights requires substantial VRAM (~66GB+), so the model is
#     loaded with 8-bit quantization (`load_in_8bit=True`, requires the `bitsandbytes` library).
#     `device_map="auto"` then places the quantized layers on the available GPU(s).
#     If you still encounter OOM errors, consider 4-bit quantization (see the sketch below).
# ** MODIFICATION 2: Removed `trust_remote_code=True` (typically not needed for Vicuna) **
print(f"Loading model: {model_name}")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    load_in_8bit=True,  # 8-bit quantization (bitsandbytes) keeps the 33B model within a single-GPU VRAM budget
    device_map="auto",  # distributes layers across the available GPU(s) and handles offloading for very large models
    # trust_remote_code=True # Removed: Generally not needed for Vicuna/Llama models
)
# Note: do not chain `.to('cuda', torch.bfloat16)` here. A model loaded in 8-bit cannot be cast
# with `.to(dtype)`, and `device_map="auto"` has already placed the weights on the GPU.
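
# If 8-bit loading is still too large for the available GPU, 4-bit quantization is an
# alternative. The commented sketch below is illustrative only (not part of the running app)
# and assumes recent `transformers` and `bitsandbytes` releases:
#
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
#     bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype for dequantized matmuls
# )
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
# )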

# ** MODIFICATION 3: Removed `trust_remote_code=True` for tokenizer **
print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # trust_remote_code=True, # Removed: Generally not needed
    use_fast=True
)

# ** DOCUMENTATION: Pad Token **
# Vicuna/Llama models usually use EOS token as PAD token if PAD is not explicitly set.
# This is often handled internally by the tokenizer or generation functions.
# If you encounter issues related to padding, you might need to set it explicitly:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")

# Ensure the model's pad_token_id is also configured if necessary,
# though `generate` usually handles this.
model.config.pad_token_id = tokenizer.pad_token_id


@spaces.GPU(required=True) # For Hugging Face Spaces GPU allocation
def generate_code(prompt: str) -> str:
    """
    Generates code based on the given prompt using the loaded Vicuna model.

    Args:
        prompt: The user's input prompt for code generation.

    Returns:
        The generated code as a string.
    """
    # ** MODIFICATION 4: Update the system prompt for Vicuna **
    # ** DOCUMENTATION: Chat Template Messages **
    # The `messages` format is used by `tokenizer.apply_chat_template`.
    # The system prompt provides context/instructions to the model.
    # For Vicuna, a generic helpful assistant prompt is suitable.
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt}
    ]

    # ** DOCUMENTATION: Applying Chat Template **
    # `apply_chat_template` formats the input messages according to the model's specific template.
    # `tokenize=False` returns a formatted string.
    # `add_generation_prompt=True` ensures the template is set up for the model to start generating a response.
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        # `apply_chat_template` is the preferred method; if it fails, the tokenizer for
        # `FelixChao/vicuna-33b-coder` is probably missing its chat template. A manually
        # formatted Vicuna-style prompt (e.g., "USER: {prompt}\nASSISTANT:") could serve as a
        # fallback; see the commented sketch after this try/except block.
        return f"Error: Could not apply chat template. The model's tokenizer might be misconfigured. ({e})"


    # ** DOCUMENTATION: Tokenization **
    # Tokenize the formatted text and move inputs to the model's device (GPU).
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # ** DOCUMENTATION: Code Generation **
    # Generate code using the model.
    # `torch.no_grad()` disables gradient calculations, saving memory and computation during inference.
    # `max_new_tokens`: Maximum number of new tokens to generate.
    # `min_new_tokens`: Minimum number of new tokens to generate.
    # `do_sample=True`: Enables sampling for more diverse outputs. If False, uses greedy decoding.
    # Note: `low_memory` is not a standard Hugging Face `generate` argument, so it has been
    #       removed; passing unknown kwargs to a standard `generate` call can raise an error.
    #       If a specific memory optimization was intended, handle it via other means
    #       (e.g., quantization or model offloading).
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7, # Common sampling parameter, you can tune this
            top_p=0.9,       # Common sampling parameter, you can tune this
            pad_token_id=tokenizer.eos_token_id # Important for open-ended generation
            # guidance_scale = 3.8, # Typically for Classifier Free Guidance, might not apply here or require specific setup
        )
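
    # For more deterministic output, sampling can be disabled in favor of greedy or beam search.
    # The commented settings below are illustrative only; tune them as needed:
    # generated_ids = model.generate(
    #     **model_inputs,
    #     max_new_tokens=1024,
    #     do_sample=False,
    #     num_beams=4,
    #     pad_token_id=tokenizer.eos_token_id,
    # )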

    # ** DOCUMENTATION: Decoding Response **
    # Remove the input tokens from the generated output to get only the response.
    # `generated_ids[0]` because we process one prompt at a time.
    # `model_inputs.input_ids[0]` is the input part.
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]

    # Decode the generated token IDs back into a string.
    # `skip_special_tokens=True` removes tokens like <s>, </s>, <unk>, etc.
    response = tokenizer.decode(response_ids, skip_special_tokens=True)

    return response.strip()

# --- Gradio Interface (Kept mostly from your original code) ---
with gr.Blocks(title="Vicuna 33B Coder") as demo: # Updated title
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox( # Changed to Textbox for potentially longer prompts
                label="Prompt",
                show_label=True, # Changed to True for clarity
                lines=3,         # Allow a few lines for the prompt
                placeholder="Enter your coding prompt here...",
                # container=False, # container is not a standard param for Textbox in this context
            )
        run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code( # Using gr.Code for better code display
                label="Generated Code",
                show_label=True, # Changed to True for clarity
                language="python", # Default language, can be auto-detected or set
                lines=20,
                # container=False,
            )

        # Gradio event listener
        gr.on(
            triggers=[
                run_button.click,
                prompt.submit # Allow submitting with Enter key in Textbox
            ],
            fn=generate_code,
            inputs=[prompt],
            outputs=[result],
            # api_name="generate_code" # Uncomment if you want to expose this as an API endpoint
        )

if __name__ == "__main__":
    # ** DOCUMENTATION: Launching Gradio **
    # `share=False` by default, set to True if you want a public link (requires internet).
    # `debug=True` can be helpful for development to see more detailed errors.
    # `server_name="0.0.0.0"` to make it accessible on your local network.
    demo.launch(share=False, debug=True)
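
    # To expose the app on the local network or pin a port, standard Gradio launch options
    # such as these can be used instead:
    # demo.launch(share=False, debug=True, server_name="0.0.0.0", server_port=7860)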