looker01202 committed
Commit: 083650c · Parent(s): 7226a27
setup local venv to use gguf2
app.py CHANGED

@@ -14,12 +14,13 @@ except ImportError:
     AutoModelForCausalLM_GGUF = None # Define as None if import fails
     CTRANSFORMERS_AVAILABLE = False

-# --- Configuration
-#
-
-
-
-
+# --- Configuration ---
+# HF Repo ID and Filename for the GGUF model to be used locally
+GGUF_REPO_ID = "ibm-granite/granite-3.3-2b-instruct-gguf"
+GGUF_FILENAME = "granite-3.3-2b-instruct-Q2_K.gguf" # Smallest footprint version
+GGUF_FILENAME = "granite-3.3-2b-instruct-Q4_K_M.gguf" # Try this more standard quantization
+
+CORRECTED_TEMPLATE_FILENAME = "granite3.3_2b_chat_template.jinja" # Name of your corrected template file
 # --- End Configuration ---

 # Detect Space environment by SPACE_ID env var
@@ -33,21 +34,20 @@ print(f"Using device: {device}")

 # Load model function (handles HF Space vs Local GGUF)
 def load_model():
-    primary_checkpoint = "ibm-granite/granite-3.3-2b-instruct"
-    model_name_display = primary_checkpoint #
+    primary_checkpoint = "ibm-granite/granite-3.3-2b-instruct" # Standard HF model ID
+    model_name_display = primary_checkpoint # Default display name

     # --- Function to load and apply template ---
     def apply_template_from_file(tokenizer, template_filename):
         applied_template = False
         try:
             print(f"Attempting to load corrected chat template from: {template_filename}")
-            # Ensure the template file path is relative to the script location
             script_dir = os.path.dirname(os.path.abspath(__file__))
             template_path = os.path.join(script_dir, template_filename)

             if not os.path.exists(template_path):
                 print(f"⚠️ WARNING: Corrected template file not found at: {template_path}")
-                return False
+                return False

             with open(template_path, "r", encoding="utf-8") as f:
                 custom_chat_template_content = f.read()
@@ -55,12 +55,10 @@ def load_model():
             applied_template = True
             print(f"✅ Loaded and applied corrected chat template from: {template_filename}")
         except FileNotFoundError:
-            # This case is handled by the os.path.exists check above
             pass
         except Exception as e:
             print(f"❌ ERROR reading corrected template file '{template_filename}': {e}")

-        # Fallback / Verification print
         if not applied_template:
             print("Falling back to tokenizer's default built-in template.")
         print("--- Final Chat Template Being Used ---")
@@ -72,29 +70,28 @@ def load_model():
     if is_space:
         print(f"🚀 Running in Space. Loading HF model: {primary_checkpoint}")
         try:
-            # Load HF Tokenizer
             tokenizer = AutoTokenizer.from_pretrained(primary_checkpoint, use_fast=True)
-            # Load HF Model
             model = AutoModelForCausalLM.from_pretrained(
                 primary_checkpoint,
                 torch_dtype=torch.float16,
                 low_cpu_mem_usage=True,
-                device_map="auto"
+                device_map="auto"
             )
             print(f"✅ Loaded HF {primary_checkpoint}")
             apply_template_from_file(tokenizer, CORRECTED_TEMPLATE_FILENAME)
-            return tokenizer, model, model_name_display
+            return tokenizer, model, model_name_display # Use HF checkpoint name for display

         except Exception as e:
             print(f"❌ HF Primary load failed: {e}")
             raise RuntimeError(f"Failed to load primary HF model {primary_checkpoint} in Space.") from e

-    else: # Running Locally - Load GGUF
-        print(f"💻 Running Locally. Attempting GGUF setup.")
+    else: # Running Locally - Load GGUF from Hub
+        print(f"💻 Running Locally. Attempting GGUF setup via Hub.")
         if not CTRANSFORMERS_AVAILABLE:
             raise RuntimeError("ctransformers library is required for local GGUF execution but not installed.")

-        print(f" GGUF
+        print(f" GGUF Repo ID: {GGUF_REPO_ID}")
+        print(f" GGUF Filename: {GGUF_FILENAME}")
         print(f" Using HF tokenizer for template: {primary_checkpoint}")
         try:
             # Load HF Tokenizer (needed for apply_chat_template)
@@ -102,24 +99,24 @@ def load_model():
             print("✅ Loaded HF Tokenizer for template application.")
             apply_template_from_file(tokenizer, CORRECTED_TEMPLATE_FILENAME)

-            #
-
-
-
-            # Load GGUF Model using ctransformers
+            # Load GGUF Model using ctransformers, downloading from Hub
+            # ctransformers will download the specified model_file from the repo_id
+            # if it's not already cached locally.
             model = AutoModelForCausalLM_GGUF.from_pretrained(
-
-
-
-                gpu_layers=0 # CPU-only inference
+                GGUF_REPO_ID, # Pass the Repository ID
+                model_file=GGUF_FILENAME, # Specify the exact file to load/download
+                gpu_layers=0 # CPU-only inference
             )
-            print(f"✅ Loaded GGUF model {
-            # Display GGUF
-            model_name_display = f"GGUF: {
+            print(f"✅ Loaded GGUF model {GGUF_FILENAME} from {GGUF_REPO_ID}")
+            # Display GGUF info in UI when running locally
+            model_name_display = f"GGUF: {GGUF_FILENAME}"
             return tokenizer, model, model_name_display

         except Exception as e:
             print(f"❌ Local GGUF load failed: {e}")
+            # Add more specific error message if file not found on Hub
+            if "not found on HuggingFace Hub" in str(e):
+                print(f" Please ensure Repo ID '{GGUF_REPO_ID}' and Filename '{GGUF_FILENAME}' are correct.")
             raise RuntimeError(f"Failed to load local GGUF model or its tokenizer.") from e

 # --- Call load_model ---
@@ -127,8 +124,6 @@ try:
     tokenizer, model, model_name = load_model()
 except Exception as load_err:
     print(f"🚨 CRITICAL ERROR DURING MODEL LOADING: {load_err}")
-    # Optionally, exit or provide a dummy model/tokenizer for Gradio UI to load without crashing
-    # For now, we'll let it potentially crash Gradio if loading fails.
     raise

 # --- Load hotel docs function ---
@@ -141,8 +136,6 @@ def load_hotel_docs(hotel_id):
     try:
         with open(path, encoding="utf-8") as f:
             content = f.read().strip()
-        # Return as list of tuples: [(doc_id, content)]
-        # Using hotel_id as doc_id here
         return [(hotel_id, content)]
     except Exception as e:
         print(f"❌ Error reading knowledge file {path}: {e}")
@@ -180,20 +173,16 @@ print("Hotel scan complete.\n")

 # --- Chat function ---
 def chat(message, history, hotel_id):
-
-    if history is None: history = [] # Ensure history is a list
+    if history is None: history = []
     history_tuples = [(m['role'], m['content']) for m in history if isinstance(m, dict) and 'role' in m and 'content' in m]
     history_tuples.append(("user", message))

-    # Yield user message immediately
     ui_history = [{"role": r, "content": c} for r, c in history_tuples]
-    yield ui_history, ""
+    yield ui_history, ""

-
-    input_text = "" # Initialize to avoid potential UnboundLocalError
+    input_text = ""
     try:
-
-        default_system_prompt = "You are a helpful hotel assistant..." # Define your default
+        default_system_prompt = "You are a helpful hotel assistant..."
         system_prompt_filename = f"{hotel_id}-system.txt"
         system_prompt_path = os.path.join("knowledge", system_prompt_filename)
         system_prompt_content = default_system_prompt
@@ -204,14 +193,11 @@ def chat(message, history, hotel_id):
                 else: print(f"⚠️ System prompt file '{system_prompt_path}' is empty. Using default.")
             except Exception as e: print(f"❌ Error reading system prompt file '{system_prompt_path}': {e}. Using default.")
         else: print(f"⚠️ System prompt file not found: '{system_prompt_path}'. Using default.")
-        # --- End Load System Prompt ---

         messages = [{"role": "system", "content": system_prompt_content}]

-        # --- Load and add hotel document(s) ---
         hotel_docs = load_hotel_docs(hotel_id)
         if not hotel_docs:
-            # If no knowledge doc found, inform user and stop
             ui_history.append({"role": "assistant", "content": f"Sorry, I don't have specific information loaded for the hotel '{hotel_id}'."})
             yield ui_history, ""
             return
@@ -219,23 +205,19 @@ def chat(message, history, hotel_id):
         for hotel_doc_id, doc_content in hotel_docs:
             messages.append({
                 "role": "document",
-                "text": doc_content,
-                "doc_id": hotel_doc_id
+                "text": doc_content,
+                "doc_id": hotel_doc_id
             })
-        # --- End Load Documents ---

-        # --- Include chat history ---
+        # --- Include chat history (excluding last user message if template handles it) ---
+        # Note: The template provided seems to process all messages in loop_messages,
+        # so we might need to include the last user message here. Let's keep it simple for now.
         for role, content in history_tuples:
-
-            if role == "user" and content == message and history_tuples.index((role, content)) == len(history_tuples) - 1:
-                continue # Skip adding the very last user message again if template adds it
-            messages.append({"role": role, "content": content})
+            messages.append({"role": role, "content": content})
         # --- End Include History ---

-        # --- Set controls ---
         controls = {"length":"short","originality": "abstractive"}

-        # --- Apply the template ---
         input_text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
@@ -253,11 +235,9 @@ def chat(message, history, hotel_id):
         yield ui_history, ""
         return

-
-    response = "Sorry, an error occurred during generation." # Default error response
+    response = "Sorry, an error occurred during generation."
     try:
         if is_space:
-            # --- HF Model Generation (Space) ---
             print("🚀 Generating response using HF model...")
             inputs = tokenizer(input_text, return_tensors="pt").to(device)
             input_length = inputs.input_ids.shape[1]
@@ -269,51 +249,43 @@ def chat(message, history, hotel_id):
                 attention_mask=inputs.attention_mask,
                 max_new_tokens=1024,
                 do_sample=False,
-                eos_token_id=tokenizer.eos_token_id
+                eos_token_id=tokenizer.eos_token_id
             )
             print(f"DEBUG: Output tokens shape = {outputs.shape}")

-            # Decode using the IBM example strategy
             new_token_ids = outputs[0][input_length:]
             print(f"DEBUG: Number of new tokens generated = {len(new_token_ids)}")
             response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
             print(f"DEBUG: Decoded response (skip_special_tokens=True) = {repr(response)}")
             print("✅ HF Generation complete.")

-        else:
-            # --- GGUF Model Generation (Local) ---
+        else: # Local GGUF Generation
             print("💻 Generating response using GGUF model...")
             response = model(
                 input_text,
                 max_new_tokens=1024,
-                stop=["<|end_of_text|>"],
-                temperature=0.3
+                stop=["<|end_of_text|>"],
+                temperature=0.3
             )
             response = response.strip()
             print("✅ GGUF Generation complete.")

-        # Handle empty response after generation
         if not response:
             response = "Sorry, I encountered an issue generating a response (empty)."

     except Exception as e:
         print(f"❌ Error during model generation or decoding: {e}")
-        # Keep the default error response defined above

-    # --- Final Response Handling ---
     print(f"DEBUG: Final response variable before UI append = {repr(response)}")

-    # Add the final assistant reply to the UI history
     ui_history.append({"role": "assistant", "content": response})
-
-    # Final yield with assistant reply
-    yield ui_history, "" # Update chat, keep textbox cleared
+    yield ui_history, ""

 # --- Gradio UI ---
 with gr.Blocks() as demo:
     with gr.Column(variant="panel"):
         gr.Markdown("### 🏨 Multi-Hotel Chatbot Demo")
-        gr.Markdown(f"**Running:** {model_name}") # Displays HF name or GGUF
+        gr.Markdown(f"**Running:** {model_name}") # Displays HF name or GGUF info

         hotel_selector = gr.Dropdown(
             choices=available_hotels,
@@ -348,4 +320,4 @@ demo.queue(default_concurrency_limit=2, max_size=32)
 if __name__ == "__main__":
     print("Launching Gradio Interface...")
     demo.launch()
-    print("Gradio Interface closed.")
+    print("Gradio Interface closed.")
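
Note: the local GGUF path added above can be sanity-checked outside the Space with a standalone script. The sketch below is not part of app.py; it simply mirrors the calls this commit introduces (ctransformers downloading GGUF_FILENAME from GGUF_REPO_ID, the HF tokenizer supplying the chat template) and assumes ctransformers and transformers are installed in the local venv. The sample messages and the add_generation_prompt=True argument are illustrative assumptions, not lines from the commit.

# Standalone smoke test for the local GGUF setup (not part of app.py).
from ctransformers import AutoModelForCausalLM as AutoModelForCausalLM_GGUF
from transformers import AutoTokenizer

GGUF_REPO_ID = "ibm-granite/granite-3.3-2b-instruct-gguf"
GGUF_FILENAME = "granite-3.3-2b-instruct-Q4_K_M.gguf"
HF_CHECKPOINT = "ibm-granite/granite-3.3-2b-instruct"

# HF tokenizer provides the chat template; the GGUF file is fetched from the Hub (or local cache).
tokenizer = AutoTokenizer.from_pretrained(HF_CHECKPOINT, use_fast=True)
model = AutoModelForCausalLM_GGUF.from_pretrained(
    GGUF_REPO_ID,
    model_file=GGUF_FILENAME,  # exact file to download/load
    gpu_layers=0,              # CPU-only, matching the commit
)

# Illustrative conversation; add_generation_prompt=True is an assumption about prompt construction.
messages = [
    {"role": "system", "content": "You are a helpful hotel assistant..."},
    {"role": "user", "content": "What time is check-in?"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

print(model(prompt, max_new_tokens=256, temperature=0.3, stop=["<|end_of_text|>"]).strip())

If the filename does not exist in the repo, the failure should surface through the same "not found on HuggingFace Hub" branch that load_model() now reports explicitly.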