looker01202 committed on
Commit 083650c · 1 Parent(s): 7226a27

setup local venv to use gguf2

Files changed (1)
  1. app.py +46 -74
app.py CHANGED
@@ -14,12 +14,13 @@ except ImportError:
     AutoModelForCausalLM_GGUF = None # Define as None if import fails
     CTRANSFORMERS_AVAILABLE = False
 
-# --- Configuration for Local GGUF ---
-# Set this environment variable or replace the default path
-# Download granite-3.3-2b-instruct-Q2_K.gguf (or other) from Hugging Face
-DEFAULT_GGUF_PATH = "./models/granite-3.3-2b-instruct-Q2_K.gguf" # Example path
-GGUF_MODEL_PATH = os.environ.get("GGUF_MODEL_PATH", DEFAULT_GGUF_PATH)
-CORRECTED_TEMPLATE_FILENAME = "corrected_granite_template.jinja" # Name of your corrected template file
+# --- Configuration ---
+# HF Repo ID and Filename for the GGUF model to be used locally
+GGUF_REPO_ID = "ibm-granite/granite-3.3-2b-instruct-gguf"
+GGUF_FILENAME = "granite-3.3-2b-instruct-Q2_K.gguf" # Smallest footprint version
+GGUF_FILENAME = "granite-3.3-2b-instruct-Q4_K_M.gguf" # Try this more standard quantization
+
+CORRECTED_TEMPLATE_FILENAME = "granite3.3_2b_chat_template.jinja" # Name of your corrected template file
 # --- End Configuration ---
 
 # Detect Space environment by SPACE_ID env var
@@ -33,21 +34,20 @@ print(f"Using device: {device}")
 
 # Load model function (handles HF Space vs Local GGUF)
 def load_model():
-    primary_checkpoint = "ibm-granite/granite-3.3-2b-instruct"
-    model_name_display = primary_checkpoint # Use this for UI display always
+    primary_checkpoint = "ibm-granite/granite-3.3-2b-instruct" # Standard HF model ID
+    model_name_display = primary_checkpoint # Default display name
 
     # --- Function to load and apply template ---
     def apply_template_from_file(tokenizer, template_filename):
         applied_template = False
         try:
             print(f"Attempting to load corrected chat template from: {template_filename}")
-            # Ensure the template file path is relative to the script location
             script_dir = os.path.dirname(os.path.abspath(__file__))
             template_path = os.path.join(script_dir, template_filename)
 
             if not os.path.exists(template_path):
                 print(f"⚠️ WARNING: Corrected template file not found at: {template_path}")
-                return False # Indicate failure
+                return False
 
             with open(template_path, "r", encoding="utf-8") as f:
                 custom_chat_template_content = f.read()
@@ -55,12 +55,10 @@ def load_model():
             applied_template = True
             print(f"✅ Loaded and applied corrected chat template from: {template_filename}")
         except FileNotFoundError:
-            # This case is handled by the os.path.exists check above
             pass
         except Exception as e:
             print(f"❌ ERROR reading corrected template file '{template_filename}': {e}")
 
-        # Fallback / Verification print
         if not applied_template:
             print("Falling back to tokenizer's default built-in template.")
         print("--- Final Chat Template Being Used ---")
@@ -72,29 +70,28 @@ def load_model():
     if is_space:
         print(f"🚀 Running in Space. Loading HF model: {primary_checkpoint}")
         try:
-            # Load HF Tokenizer
             tokenizer = AutoTokenizer.from_pretrained(primary_checkpoint, use_fast=True)
-            # Load HF Model
             model = AutoModelForCausalLM.from_pretrained(
                 primary_checkpoint,
                 torch_dtype=torch.float16,
                 low_cpu_mem_usage=True,
-                device_map="auto" # Use device_map for HF model
+                device_map="auto"
             )
             print(f"✅ Loaded HF {primary_checkpoint}")
             apply_template_from_file(tokenizer, CORRECTED_TEMPLATE_FILENAME)
-            return tokenizer, model, model_name_display
+            return tokenizer, model, model_name_display # Use HF checkpoint name for display
 
         except Exception as e:
             print(f"❌ HF Primary load failed: {e}")
             raise RuntimeError(f"Failed to load primary HF model {primary_checkpoint} in Space.") from e
 
-    else: # Running Locally - Load GGUF
-        print(f"💻 Running Locally. Attempting GGUF setup.")
+    else: # Running Locally - Load GGUF from Hub
+        print(f"💻 Running Locally. Attempting GGUF setup via Hub.")
         if not CTRANSFORMERS_AVAILABLE:
             raise RuntimeError("ctransformers library is required for local GGUF execution but not installed.")
 
-        print(f" GGUF model path: {GGUF_MODEL_PATH}")
+        print(f" GGUF Repo ID: {GGUF_REPO_ID}")
+        print(f" GGUF Filename: {GGUF_FILENAME}")
         print(f" Using HF tokenizer for template: {primary_checkpoint}")
         try:
             # Load HF Tokenizer (needed for apply_chat_template)
@@ -102,24 +99,24 @@ def load_model():
             print("✅ Loaded HF Tokenizer for template application.")
             apply_template_from_file(tokenizer, CORRECTED_TEMPLATE_FILENAME)
 
-            # Check if GGUF file exists before attempting to load
-            if not os.path.exists(GGUF_MODEL_PATH):
-                raise FileNotFoundError(f"GGUF model file not found at specified path: {GGUF_MODEL_PATH}. Please download the model or set the GGUF_MODEL_PATH environment variable.")
-
-            # Load GGUF Model using ctransformers
+            # Load GGUF Model using ctransformers, downloading from Hub
+            # ctransformers will download the specified model_file from the repo_id
+            # if it's not already cached locally.
             model = AutoModelForCausalLM_GGUF.from_pretrained(
-                GGUF_MODEL_PATH,
-                model_type="llama", # Adjust if needed based on model card
-                context_length=4096, # Can be adjusted
-                gpu_layers=0 # CPU-only inference
+                GGUF_REPO_ID, # Pass the Repository ID
+                model_file=GGUF_FILENAME, # Specify the exact file to load/download
+                gpu_layers=0 # CPU-only inference
             )
-            print(f"✅ Loaded GGUF model {GGUF_MODEL_PATH}")
-            # Display GGUF path in UI when running locally
-            model_name_display = f"GGUF: {os.path.basename(GGUF_MODEL_PATH)}"
+            print(f"✅ Loaded GGUF model {GGUF_FILENAME} from {GGUF_REPO_ID}")
+            # Display GGUF info in UI when running locally
+            model_name_display = f"GGUF: {GGUF_FILENAME}"
             return tokenizer, model, model_name_display
 
         except Exception as e:
             print(f"❌ Local GGUF load failed: {e}")
+            # Add more specific error message if file not found on Hub
+            if "not found on HuggingFace Hub" in str(e):
+                print(f" Please ensure Repo ID '{GGUF_REPO_ID}' and Filename '{GGUF_FILENAME}' are correct.")
             raise RuntimeError(f"Failed to load local GGUF model or its tokenizer.") from e
 
 # --- Call load_model ---
@@ -127,8 +124,6 @@ try:
     tokenizer, model, model_name = load_model()
 except Exception as load_err:
     print(f"🚨 CRITICAL ERROR DURING MODEL LOADING: {load_err}")
-    # Optionally, exit or provide a dummy model/tokenizer for Gradio UI to load without crashing
-    # For now, we'll let it potentially crash Gradio if loading fails.
     raise
 
 # --- Load hotel docs function ---
@@ -141,8 +136,6 @@ def load_hotel_docs(hotel_id):
     try:
         with open(path, encoding="utf-8") as f:
             content = f.read().strip()
-        # Return as list of tuples: [(doc_id, content)]
-        # Using hotel_id as doc_id here
        return [(hotel_id, content)]
     except Exception as e:
         print(f"❌ Error reading knowledge file {path}: {e}")
@@ -180,20 +173,16 @@ print("Hotel scan complete.\n")
 
 # --- Chat function ---
 def chat(message, history, hotel_id):
-    # Convert incoming UI history (list of dicts) to tuple list
-    if history is None: history = [] # Ensure history is a list
+    if history is None: history = []
     history_tuples = [(m['role'], m['content']) for m in history if isinstance(m, dict) and 'role' in m and 'content' in m]
     history_tuples.append(("user", message))
 
-    # Yield user message immediately
     ui_history = [{"role": r, "content": c} for r, c in history_tuples]
-    yield ui_history, "" # Update chat, clear textbox
+    yield ui_history, ""
 
-    # --- Prompt Preparation (Common for both HF/GGUF) ---
-    input_text = "" # Initialize to avoid potential UnboundLocalError
+    input_text = ""
     try:
-        # --- Load System Prompt ---
-        default_system_prompt = "You are a helpful hotel assistant..." # Define your default
+        default_system_prompt = "You are a helpful hotel assistant..."
         system_prompt_filename = f"{hotel_id}-system.txt"
         system_prompt_path = os.path.join("knowledge", system_prompt_filename)
         system_prompt_content = default_system_prompt
@@ -204,14 +193,11 @@ def chat(message, history, hotel_id):
                 else: print(f"⚠️ System prompt file '{system_prompt_path}' is empty. Using default.")
             except Exception as e: print(f"❌ Error reading system prompt file '{system_prompt_path}': {e}. Using default.")
         else: print(f"⚠️ System prompt file not found: '{system_prompt_path}'. Using default.")
-        # --- End Load System Prompt ---
 
         messages = [{"role": "system", "content": system_prompt_content}]
 
-        # --- Load and add hotel document(s) ---
         hotel_docs = load_hotel_docs(hotel_id)
         if not hotel_docs:
-            # If no knowledge doc found, inform user and stop
            ui_history.append({"role": "assistant", "content": f"Sorry, I don't have specific information loaded for the hotel '{hotel_id}'."})
            yield ui_history, ""
            return
@@ -219,23 +205,19 @@ def chat(message, history, hotel_id):
         for hotel_doc_id, doc_content in hotel_docs:
             messages.append({
                 "role": "document",
-                "text": doc_content, # Use 'text' key
-                "doc_id": hotel_doc_id # Use 'doc_id' key
+                "text": doc_content,
+                "doc_id": hotel_doc_id
             })
-        # --- End Load Documents ---
 
-        # --- Include chat history ---
+        # --- Include chat history (excluding last user message if template handles it) ---
+        # Note: The template provided seems to process all messages in loop_messages,
+        # so we might need to include the last user message here. Let's keep it simple for now.
         for role, content in history_tuples:
-            # Exclude the last user message as it's implicitly handled by template
-            if role == "user" and content == message and history_tuples.index((role, content)) == len(history_tuples) - 1:
-                continue # Skip adding the very last user message again if template adds it
-            messages.append({"role": role, "content": content})
+            messages.append({"role": role, "content": content})
         # --- End Include History ---
 
-        # --- Set controls ---
         controls = {"length":"short","originality": "abstractive"}
 
-        # --- Apply the template ---
         input_text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
@@ -253,11 +235,9 @@ def chat(message, history, hotel_id):
         yield ui_history, ""
         return
 
-    # --- Generation Logic: Space (HF) vs Local (GGUF) ---
-    response = "Sorry, an error occurred during generation." # Default error response
+    response = "Sorry, an error occurred during generation."
    try:
        if is_space:
-            # --- HF Model Generation (Space) ---
            print("🚀 Generating response using HF model...")
            inputs = tokenizer(input_text, return_tensors="pt").to(device)
            input_length = inputs.input_ids.shape[1]
@@ -269,51 +249,43 @@ def chat(message, history, hotel_id):
                 attention_mask=inputs.attention_mask,
                 max_new_tokens=1024,
                 do_sample=False,
-                eos_token_id=tokenizer.eos_token_id # Explicitly use EOS token ID
+                eos_token_id=tokenizer.eos_token_id
             )
             print(f"DEBUG: Output tokens shape = {outputs.shape}")
 
-            # Decode using the IBM example strategy
             new_token_ids = outputs[0][input_length:]
             print(f"DEBUG: Number of new tokens generated = {len(new_token_ids)}")
             response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
             print(f"DEBUG: Decoded response (skip_special_tokens=True) = {repr(response)}")
             print("✅ HF Generation complete.")
 
-        else:
-            # --- GGUF Model Generation (Local) ---
+        else: # Local GGUF Generation
             print("💻 Generating response using GGUF model...")
             response = model(
                 input_text,
                 max_new_tokens=1024,
-                stop=["<|end_of_text|>"], # Stop sequence for GGUF
-                temperature=0.3 # Example temperature
+                stop=["<|end_of_text|>"],
+                temperature=0.3
             )
             response = response.strip()
             print("✅ GGUF Generation complete.")
 
-        # Handle empty response after generation
        if not response:
            response = "Sorry, I encountered an issue generating a response (empty)."
 
    except Exception as e:
        print(f"❌ Error during model generation or decoding: {e}")
-        # Keep the default error response defined above
 
-    # --- Final Response Handling ---
    print(f"DEBUG: Final response variable before UI append = {repr(response)}")
 
-    # Add the final assistant reply to the UI history
    ui_history.append({"role": "assistant", "content": response})
-
-    # Final yield with assistant reply
-    yield ui_history, "" # Update chat, keep textbox cleared
+    yield ui_history, ""
 
 # --- Gradio UI ---
 with gr.Blocks() as demo:
    with gr.Column(variant="panel"):
        gr.Markdown("### 🏨 Multi‑Hotel Chatbot Demo")
-        gr.Markdown(f"**Running:** {model_name}") # Displays HF name or GGUF path
+        gr.Markdown(f"**Running:** {model_name}") # Displays HF name or GGUF info
 
        hotel_selector = gr.Dropdown(
            choices=available_hotels,
@@ -348,4 +320,4 @@ demo.queue(default_concurrency_limit=2, max_size=32)
 if __name__ == "__main__":
    print("Launching Gradio Interface...")
    demo.launch()
-    print("Gradio Interface closed.")
+    print("Gradio Interface closed.")