looker01202 committed
Commit: 083650c · Parent(s): 7226a27
setup local venv to use gguf2
app.py CHANGED

@@ -14,12 +14,13 @@ except ImportError:
     AutoModelForCausalLM_GGUF = None # Define as None if import fails
     CTRANSFORMERS_AVAILABLE = False

-# --- Configuration
-#
-
-
-
-
+# --- Configuration ---
+# HF Repo ID and Filename for the GGUF model to be used locally
+GGUF_REPO_ID = "ibm-granite/granite-3.3-2b-instruct-gguf"
+GGUF_FILENAME = "granite-3.3-2b-instruct-Q2_K.gguf" # Smallest footprint version
+GGUF_FILENAME = "granite-3.3-2b-instruct-Q4_K_M.gguf" # Try this more standard quantization
+
+CORRECTED_TEMPLATE_FILENAME = "granite3.3_2b_chat_template.jinja" # Name of your corrected template file
 # --- End Configuration ---

 # Detect Space environment by SPACE_ID env var
@@ -33,21 +34,20 @@ print(f"Using device: {device}")

 # Load model function (handles HF Space vs Local GGUF)
 def load_model():
-    primary_checkpoint = "ibm-granite/granite-3.3-2b-instruct"
-    model_name_display = primary_checkpoint #
+    primary_checkpoint = "ibm-granite/granite-3.3-2b-instruct" # Standard HF model ID
+    model_name_display = primary_checkpoint # Default display name

     # --- Function to load and apply template ---
     def apply_template_from_file(tokenizer, template_filename):
         applied_template = False
         try:
             print(f"Attempting to load corrected chat template from: {template_filename}")
-            # Ensure the template file path is relative to the script location
             script_dir = os.path.dirname(os.path.abspath(__file__))
             template_path = os.path.join(script_dir, template_filename)

             if not os.path.exists(template_path):
                 print(f"⚠️ WARNING: Corrected template file not found at: {template_path}")
-                return False
+                return False

             with open(template_path, "r", encoding="utf-8") as f:
                 custom_chat_template_content = f.read()
@@ -55,12 +55,10 @@ def load_model():
             applied_template = True
             print(f"✅ Loaded and applied corrected chat template from: {template_filename}")
         except FileNotFoundError:
-            # This case is handled by the os.path.exists check above
             pass
         except Exception as e:
             print(f"❌ ERROR reading corrected template file '{template_filename}': {e}")

-        # Fallback / Verification print
         if not applied_template:
             print("Falling back to tokenizer's default built-in template.")
         print("--- Final Chat Template Being Used ---")
@@ -72,29 +70,28 @@ def load_model():
     if is_space:
         print(f"🚀 Running in Space. Loading HF model: {primary_checkpoint}")
         try:
-            # Load HF Tokenizer
             tokenizer = AutoTokenizer.from_pretrained(primary_checkpoint, use_fast=True)
-            # Load HF Model
             model = AutoModelForCausalLM.from_pretrained(
                 primary_checkpoint,
                 torch_dtype=torch.float16,
                 low_cpu_mem_usage=True,
-                device_map="auto"
+                device_map="auto"
             )
             print(f"✅ Loaded HF {primary_checkpoint}")
             apply_template_from_file(tokenizer, CORRECTED_TEMPLATE_FILENAME)
-            return tokenizer, model, model_name_display
+            return tokenizer, model, model_name_display # Use HF checkpoint name for display

         except Exception as e:
             print(f"❌ HF Primary load failed: {e}")
             raise RuntimeError(f"Failed to load primary HF model {primary_checkpoint} in Space.") from e

-    else: # Running Locally - Load GGUF
-        print(f"💻 Running Locally. Attempting GGUF setup.")
+    else: # Running Locally - Load GGUF from Hub
+        print(f"💻 Running Locally. Attempting GGUF setup via Hub.")
         if not CTRANSFORMERS_AVAILABLE:
             raise RuntimeError("ctransformers library is required for local GGUF execution but not installed.")

-        print(f" GGUF
+        print(f" GGUF Repo ID: {GGUF_REPO_ID}")
+        print(f" GGUF Filename: {GGUF_FILENAME}")
         print(f" Using HF tokenizer for template: {primary_checkpoint}")
         try:
             # Load HF Tokenizer (needed for apply_chat_template)
@@ -102,24 +99,24 @@ def load_model():
             print("✅ Loaded HF Tokenizer for template application.")
             apply_template_from_file(tokenizer, CORRECTED_TEMPLATE_FILENAME)

-            #
-
-
-
-            # Load GGUF Model using ctransformers
+            # Load GGUF Model using ctransformers, downloading from Hub
+            # ctransformers will download the specified model_file from the repo_id
+            # if it's not already cached locally.
             model = AutoModelForCausalLM_GGUF.from_pretrained(
-
-
-
-                gpu_layers=0 # CPU-only inference
+                GGUF_REPO_ID, # Pass the Repository ID
+                model_file=GGUF_FILENAME, # Specify the exact file to load/download
+                gpu_layers=0 # CPU-only inference
             )
-            print(f"✅ Loaded GGUF model {
-            # Display GGUF
-            model_name_display = f"GGUF: {
+            print(f"✅ Loaded GGUF model {GGUF_FILENAME} from {GGUF_REPO_ID}")
+            # Display GGUF info in UI when running locally
+            model_name_display = f"GGUF: {GGUF_FILENAME}"
             return tokenizer, model, model_name_display

         except Exception as e:
             print(f"❌ Local GGUF load failed: {e}")
+            # Add more specific error message if file not found on Hub
+            if "not found on HuggingFace Hub" in str(e):
+                print(f" Please ensure Repo ID '{GGUF_REPO_ID}' and Filename '{GGUF_FILENAME}' are correct.")
             raise RuntimeError(f"Failed to load local GGUF model or its tokenizer.") from e

 # --- Call load_model ---
@@ -127,8 +124,6 @@ try:
     tokenizer, model, model_name = load_model()
 except Exception as load_err:
     print(f"🚨 CRITICAL ERROR DURING MODEL LOADING: {load_err}")
-    # Optionally, exit or provide a dummy model/tokenizer for Gradio UI to load without crashing
-    # For now, we'll let it potentially crash Gradio if loading fails.
     raise

 # --- Load hotel docs function ---
@@ -141,8 +136,6 @@ def load_hotel_docs(hotel_id):
     try:
         with open(path, encoding="utf-8") as f:
             content = f.read().strip()
-        # Return as list of tuples: [(doc_id, content)]
-        # Using hotel_id as doc_id here
         return [(hotel_id, content)]
     except Exception as e:
         print(f"❌ Error reading knowledge file {path}: {e}")
@@ -180,20 +173,16 @@ print("Hotel scan complete.\n")

 # --- Chat function ---
 def chat(message, history, hotel_id):
-
-    if history is None: history = [] # Ensure history is a list
+    if history is None: history = []
     history_tuples = [(m['role'], m['content']) for m in history if isinstance(m, dict) and 'role' in m and 'content' in m]
     history_tuples.append(("user", message))

-    # Yield user message immediately
     ui_history = [{"role": r, "content": c} for r, c in history_tuples]
-    yield ui_history, ""
+    yield ui_history, ""

-
-    input_text = "" # Initialize to avoid potential UnboundLocalError
+    input_text = ""
     try:
-
-        default_system_prompt = "You are a helpful hotel assistant..." # Define your default
+        default_system_prompt = "You are a helpful hotel assistant..."
         system_prompt_filename = f"{hotel_id}-system.txt"
         system_prompt_path = os.path.join("knowledge", system_prompt_filename)
         system_prompt_content = default_system_prompt
@@ -204,14 +193,11 @@ def chat(message, history, hotel_id):
                 else: print(f"⚠️ System prompt file '{system_prompt_path}' is empty. Using default.")
             except Exception as e: print(f"❌ Error reading system prompt file '{system_prompt_path}': {e}. Using default.")
         else: print(f"⚠️ System prompt file not found: '{system_prompt_path}'. Using default.")
-        # --- End Load System Prompt ---

         messages = [{"role": "system", "content": system_prompt_content}]

-        # --- Load and add hotel document(s) ---
         hotel_docs = load_hotel_docs(hotel_id)
         if not hotel_docs:
-            # If no knowledge doc found, inform user and stop
             ui_history.append({"role": "assistant", "content": f"Sorry, I don't have specific information loaded for the hotel '{hotel_id}'."})
             yield ui_history, ""
             return
@@ -219,23 +205,19 @@ def chat(message, history, hotel_id):
         for hotel_doc_id, doc_content in hotel_docs:
             messages.append({
                 "role": "document",
-                "text": doc_content,
-                "doc_id": hotel_doc_id
+                "text": doc_content,
+                "doc_id": hotel_doc_id
             })
-        # --- End Load Documents ---

-        # --- Include chat history ---
+        # --- Include chat history (excluding last user message if template handles it) ---
+        # Note: The template provided seems to process all messages in loop_messages,
+        # so we might need to include the last user message here. Let's keep it simple for now.
         for role, content in history_tuples:
-
-            if role == "user" and content == message and history_tuples.index((role, content)) == len(history_tuples) - 1:
-                continue # Skip adding the very last user message again if template adds it
-            messages.append({"role": role, "content": content})
+            messages.append({"role": role, "content": content})
         # --- End Include History ---

-        # --- Set controls ---
         controls = {"length":"short","originality": "abstractive"}

-        # --- Apply the template ---
         input_text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
@@ -253,11 +235,9 @@ def chat(message, history, hotel_id):
         yield ui_history, ""
         return

-
-    response = "Sorry, an error occurred during generation." # Default error response
+    response = "Sorry, an error occurred during generation."
     try:
         if is_space:
-            # --- HF Model Generation (Space) ---
             print("🚀 Generating response using HF model...")
             inputs = tokenizer(input_text, return_tensors="pt").to(device)
             input_length = inputs.input_ids.shape[1]
@@ -269,51 +249,43 @@ def chat(message, history, hotel_id):
                 attention_mask=inputs.attention_mask,
                 max_new_tokens=1024,
                 do_sample=False,
-                eos_token_id=tokenizer.eos_token_id
+                eos_token_id=tokenizer.eos_token_id
             )
             print(f"DEBUG: Output tokens shape = {outputs.shape}")

-            # Decode using the IBM example strategy
             new_token_ids = outputs[0][input_length:]
             print(f"DEBUG: Number of new tokens generated = {len(new_token_ids)}")
             response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
             print(f"DEBUG: Decoded response (skip_special_tokens=True) = {repr(response)}")
             print("✅ HF Generation complete.")

-        else:
-            # --- GGUF Model Generation (Local) ---
+        else: # Local GGUF Generation
             print("💻 Generating response using GGUF model...")
             response = model(
                 input_text,
                 max_new_tokens=1024,
-                stop=["<|end_of_text|>"],
-                temperature=0.3
+                stop=["<|end_of_text|>"],
+                temperature=0.3
             )
             response = response.strip()
             print("✅ GGUF Generation complete.")

-        # Handle empty response after generation
         if not response:
             response = "Sorry, I encountered an issue generating a response (empty)."

     except Exception as e:
         print(f"❌ Error during model generation or decoding: {e}")
-        # Keep the default error response defined above

-    # --- Final Response Handling ---
     print(f"DEBUG: Final response variable before UI append = {repr(response)}")

-    # Add the final assistant reply to the UI history
     ui_history.append({"role": "assistant", "content": response})
-
-    # Final yield with assistant reply
-    yield ui_history, "" # Update chat, keep textbox cleared
+    yield ui_history, ""

 # --- Gradio UI ---
 with gr.Blocks() as demo:
     with gr.Column(variant="panel"):
         gr.Markdown("### 🏨 Multi-Hotel Chatbot Demo")
-        gr.Markdown(f"**Running:** {model_name}") # Displays HF name or GGUF
+        gr.Markdown(f"**Running:** {model_name}") # Displays HF name or GGUF info

         hotel_selector = gr.Dropdown(
             choices=available_hotels,
@@ -348,4 +320,4 @@ demo.queue(default_concurrency_limit=2, max_size=32)
 if __name__ == "__main__":
     print("Launching Gradio Interface...")
     demo.launch()
-    print("Gradio Interface closed.")
+    print("Gradio Interface closed.")
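
Note: the local GGUF path added above can be sanity-checked outside the Space with a standalone script. The sketch below is not part of app.py; it simply mirrors the calls this commit introduces (ctransformers downloading GGUF_FILENAME from GGUF_REPO_ID, the HF tokenizer supplying the chat template) and assumes ctransformers and transformers are installed in the local venv. The sample messages and the add_generation_prompt=True argument are illustrative assumptions, not lines from the commit.

# Standalone smoke test for the local GGUF setup (not part of app.py).
from ctransformers import AutoModelForCausalLM as AutoModelForCausalLM_GGUF
from transformers import AutoTokenizer

GGUF_REPO_ID = "ibm-granite/granite-3.3-2b-instruct-gguf"
GGUF_FILENAME = "granite-3.3-2b-instruct-Q4_K_M.gguf"
HF_CHECKPOINT = "ibm-granite/granite-3.3-2b-instruct"

# HF tokenizer provides the chat template; the GGUF file is fetched from the Hub (or local cache).
tokenizer = AutoTokenizer.from_pretrained(HF_CHECKPOINT, use_fast=True)
model = AutoModelForCausalLM_GGUF.from_pretrained(
    GGUF_REPO_ID,
    model_file=GGUF_FILENAME,  # exact file to download/load
    gpu_layers=0,              # CPU-only, matching the commit
)

# Illustrative conversation; add_generation_prompt=True is an assumption about prompt construction.
messages = [
    {"role": "system", "content": "You are a helpful hotel assistant..."},
    {"role": "user", "content": "What time is check-in?"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

print(model(prompt, max_new_tokens=256, temperature=0.3, stop=["<|end_of_text|>"]).strip())

If the filename does not exist in the repo, the failure should surface through the same "not found on HuggingFace Hub" branch that load_model() now reports explicitly.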