ford442 committed on
Commit 9505cff · verified · 1 Parent(s): 01483e4

Create app.py

Files changed (1): app.py (+185, -0)
app.py ADDED
@@ -0,0 +1,185 @@
import spaces  # If using Hugging Face Spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import os

# --- Environment and PyTorch Configurations (Kept from your original code) ---
# Set via os.environ so the values are visible to libraries that read the environment from Python
# (os.putenv alone does not update os.environ).
os.environ['TORCH_LINALG_PREFER_CUSOLVER'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'  # Be mindful of this with a larger model
os.environ["SAFETENSORS_FAST_GPU"] = "1"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False  # False can improve performance if exact reproducibility isn't critical
torch.backends.cudnn.benchmark = True  # True can help if input sizes are consistent
# torch.backends.cuda.preferred_blas_library = "cublas"     # Generally the default; fine to leave unset
# torch.backends.cuda.preferred_linalg_library = "cusolver"  # Generally the default; fine to leave unset
torch.set_float32_matmul_precision("highest")  # Or "high" if "highest" causes issues

# --- Model and Tokenizer Configuration ---
# ** MODIFICATION 1: Update the model_name **
model_name = "FelixChao/vicuna-33b-coder"

# ** DOCUMENTATION: Model and Tokenizer Loading **
# `torch_dtype="auto"` will attempt to use the optimal dtype (e.g., bfloat16 if available).
# `.to('cuda', torch.bfloat16)` explicitly moves the model to CUDA and casts to bfloat16.
# Note: `FelixChao/vicuna-33b-coder` is a large model; loading it in bfloat16 requires
# substantial VRAM (~66GB+). If you encounter OOM errors, consider quantization, e.g.
# `load_in_8bit=True` or `load_in_4bit=True` (requires the `bitsandbytes` library) --
# see the sketch after the load call below.
# ** MODIFICATION 2: Removed `trust_remote_code=True` (typically not needed for Vicuna) **
print(f"Loading model: {model_name}")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    # device_map="auto",  # Helpful for very large models, to distribute layers across GPUs or offload.
    #                     # For a single GPU, an explicit .to('cuda') is fine.
    # trust_remote_code=True,  # Removed: generally not needed for Vicuna/Llama models
).to('cuda', torch.bfloat16)  # Explicitly using bfloat16 as in the original code
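# Quantized-load sketch (not enabled): a minimal illustration of loading the same model in 4-bit
# with bitsandbytes if bfloat16 does not fit in VRAM. The specific settings are illustrative
# assumptions, not tuned values; a quantized model should be placed via `device_map`, not `.to()`.
#
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
# )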

# ** MODIFICATION 3: Removed `trust_remote_code=True` for tokenizer **
print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # trust_remote_code=True,  # Removed: generally not needed
    use_fast=True
)

# ** DOCUMENTATION: Pad Token **
# Vicuna/Llama models usually use the EOS token as the PAD token if PAD is not explicitly set.
# This is often handled internally by the tokenizer or generation functions.
# If you encounter issues related to padding, you might need to set it explicitly:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer `pad_token` was None, set to `eos_token`: {tokenizer.eos_token}")

# Ensure the model's pad_token_id is also configured if necessary,
# though `generate` usually handles this.
model.config.pad_token_id = tokenizer.pad_token_id


@spaces.GPU(required=True)  # For Hugging Face Spaces GPU allocation
def generate_code(prompt: str) -> str:
    """
    Generates code based on the given prompt using the loaded Vicuna model.

    Args:
        prompt: The user's input prompt for code generation.

    Returns:
        The generated code as a string.
    """
    # ** MODIFICATION 4: Update the system prompt for Vicuna **
    # ** DOCUMENTATION: Chat Template Messages **
    # The `messages` format is used by `tokenizer.apply_chat_template`.
    # The system prompt provides context/instructions to the model.
    # For Vicuna, a generic helpful-assistant prompt is suitable.
    messages = [
        {"role": "system", "content": "You are a helpful and proficient coding assistant."},
        {"role": "user", "content": prompt}
    ]

    # ** DOCUMENTATION: Applying Chat Template **
    # `apply_chat_template` formats the input messages according to the model's specific template.
    # `tokenize=False` returns a formatted string.
    # `add_generation_prompt=True` ensures the template is set up for the model to start generating a response.
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        print(f"Error applying chat template: {e}")
        # If this fails, the tokenizer for `FelixChao/vicuna-33b-coder` may be missing its chat template.
        # `apply_chat_template` is the preferred method; a manual Vicuna-style prompt
        # (e.g. "USER: {prompt}\nASSISTANT:") could serve as a fallback -- see the sketch below.
        return f"Error: Could not apply chat template. The model's tokenizer might be misconfigured. ({e})"

    # ** DOCUMENTATION: Tokenization **
    # Tokenize the formatted text and move inputs to the model's device (GPU).
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # ** DOCUMENTATION: Code Generation **
    # `torch.no_grad()` disables gradient calculations, saving memory and computation during inference.
    # `max_new_tokens`: maximum number of new tokens to generate.
    # `min_new_tokens`: minimum number of new tokens to generate.
    # `do_sample=True`: enables sampling for more diverse outputs; if False, greedy decoding is used.
    # `low_memory` is not a standard Hugging Face `generate` parameter, so it has been removed to
    # avoid errors; memory optimization should instead be handled via quantization or offloading.
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            min_new_tokens=256,
            do_sample=True,
            temperature=0.7,  # Common sampling parameter, you can tune this
            top_p=0.9,        # Common sampling parameter, you can tune this
            pad_token_id=tokenizer.eos_token_id  # Important for open-ended generation
            # guidance_scale=3.8,  # Typically for classifier-free guidance; may not apply here or may require specific setup
        )
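    # Streaming sketch (not enabled): the same generation could stream tokens to the UI using
    # transformers' TextIteratorStreamer, running `generate` in a background thread and consuming
    # partial text. Parameter values mirror the call above; wiring this into Gradio would also
    # require turning this function into a generator that yields intermediate results.
    #
    # from threading import Thread
    # from transformers import TextIteratorStreamer
    # streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Thread(target=model.generate,
    #        kwargs=dict(**model_inputs, streamer=streamer, max_new_tokens=1024,
    #                    do_sample=True, temperature=0.7, top_p=0.9)).start()
    # partial = ""
    # for new_text in streamer:
    #     partial += new_text  # yield `partial` here for incremental UI updates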

    # ** DOCUMENTATION: Decoding Response **
    # Remove the input tokens from the generated output to get only the response.
    # `generated_ids[0]` because we process one prompt at a time;
    # `model_inputs.input_ids[0]` is the input part.
    response_ids = generated_ids[0][len(model_inputs.input_ids[0]):]

    # Decode the generated token IDs back into a string.
    # `skip_special_tokens=True` removes tokens like <s>, </s>, <unk>, etc.
    response = tokenizer.decode(response_ids, skip_special_tokens=True)

    return response.strip()


# --- Gradio Interface (Kept mostly from your original code) ---
with gr.Blocks(title="Vicuna 33B Coder") as demo:  # Updated title
    with gr.Tab("Code Chat"):
        gr.Markdown("# Vicuna 33B Coder\nProvide a prompt to generate code.")
        with gr.Row():
            prompt = gr.Textbox(  # Changed to Textbox for potentially longer prompts
                label="Prompt",
                show_label=True,  # Changed to True for clarity
                lines=3,  # Allow a few lines for the prompt
                placeholder="Enter your coding prompt here...",
                # container=False,  # Not a standard param for Textbox in this context
            )
            run_button = gr.Button("Generate Code", variant="primary")
        with gr.Row():
            result = gr.Code(  # Using gr.Code for better code display
                label="Generated Code",
                show_label=True,  # Changed to True for clarity
                language="python",  # Default language; adjust as needed
                lines=20,
                # container=False,
            )

    # Gradio event listener
    gr.on(
        triggers=[
            run_button.click,
            prompt.submit  # Allow submitting with the Enter key in the Textbox
        ],
        fn=generate_code,
        inputs=[prompt],
        outputs=[result],
        # api_name="generate_code"  # Uncomment to expose this as an API endpoint
    )

if __name__ == "__main__":
    # ** DOCUMENTATION: Launching Gradio **
    # `share=False` by default; set to True for a public link (requires internet).
    # `debug=True` can be helpful during development to see more detailed errors.
    # `server_name="0.0.0.0"` makes the app accessible on your local network -- see the sketch below.
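    # LAN-accessible launch sketch (not enabled); the port value is an illustrative assumption:
    # demo.launch(share=False, debug=True, server_name="0.0.0.0", server_port=7860)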
    demo.launch(share=False, debug=True)