Spaces:

Looker01202
/

hotel-chat

Running on T4

App Files Files Community

looker01202 committed 16 days ago

Commit

50aecff

1 Parent(s): c5c9847

Stable Gradio interface, but requires improvement

Browse files

Files changed (2) hide show

app.py +78 -133
app2.py +196 -0

app.py CHANGED Viewed

@@ -1,196 +1,141 @@
 import os
-import getpass
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
-# Detect execution environment: treated as a Space when the process runs as user 'user'
-is_space = (getpass.getuser() == "user")
-print("RUNNING AS USER:", getpass.getuser())
-# Choose model checkpoints based on environment
 if is_space:
     primary_checkpoint = "ibm-granite/granite-3.3-2b-instruct"
     fallback_checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
 else:
-    # Local development: use smaller Qwen model only
     primary_checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
     fallback_checkpoint = None
 # Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load tokenizer and model (with fallback on Spaces)
 def load_model():
-    print(f"🔍 Trying to load PRIMARY: {primary_checkpoint}")
     try:
-        #tokenizer = AutoTokenizer.from_pretrained(primary_checkpoint)
-        #model     = AutoModelForCausalLM.from_pretrained(primary_checkpoint).to(device)
-        # faster loading for large Granite model
         tokenizer = AutoTokenizer.from_pretrained(
             primary_checkpoint,
             use_fast=True
         )
         model = AutoModelForCausalLM.from_pretrained(
             primary_checkpoint,
-            torch_dtype=torch.float16,      # 16‑bit weights
-            low_cpu_mem_usage=True         # memory‑efficient
-            #device_map="auto"               # auto shard on GPU
         ).to(device)
-        print("✅ Loaded PRIMARY ✓")
         return tokenizer, model, primary_checkpoint
     except Exception as e:
-        print("❌ PRIMARY failed:", e)
         if fallback_checkpoint:
             print(f"🔁 Falling back to {fallback_checkpoint}")
             tokenizer = AutoTokenizer.from_pretrained(fallback_checkpoint)
-            model     = AutoModelForCausalLM.from_pretrained(fallback_checkpoint).to(device)
-            print("✅ Loaded FALLBACK ✓")
             return tokenizer, model, fallback_checkpoint
         raise
 tokenizer, model, model_name = load_model()
-# Load hotel-specific documents from disk as (document_id, content) pairs
-def load_hotel_docs(hotel_id: str):
     path = os.path.join("knowledge", f"{hotel_id}.txt")
     if not os.path.exists(path):
         return []
-    content = open(path, "r", encoding="utf-8").read().strip()
-    # Use a single document; document_id can be hotel_id
-    return [(f"{hotel_id}-info", content)]
-# Chat function integrating both local Qwen flow and IBM Granite RAG template with document roles
 def chat(message, history, hotel_id):
     if history is None:
-        history = []
-    # Append user message
-    history.append(("user", message))
-    # ==== Local development flow: simple chat via Qwen ====
-    # ==== Local development flow: simple chat via Qwen ====
-    # ==== Local development flow: simple chat via Qwen ====
-    # ==== Local development flow: simple chat via Qwen ====
     if not is_space:
-        # Build message dict list from history tuples
-        msgs = [{"role": role, "content": content} for role, content in history]
-        # Apply Qwen's chat template
         input_text = tokenizer.apply_chat_template(
             msgs,
             tokenize=False,
             add_generation_prompt=True
         )
-        print("printing templated chat (pre-tokenizes), ready for sending to the model\n")
-        print(input_text)
-        # Generate response
         inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
-        outputs = model.generate(
-            inputs,
-            max_new_tokens=1024,
-            do_sample=False
         )
         decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
-        print("RAW DECODED:\n", decoded)
-        #response = decoded.split("<|assistant|>")[-1].strip()
-        response = decoded.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0]
-        # history.append(("assistant", f"{response}\n_(Model: {model_name})_"))
-        history.append(("assistant", f"{response}"))
-        # Clear textbox by returning empty string as third output
-        return history, history, ""
-    # ==== Space production flow: IBM Granite RAG ====
-    # ==== Space production flow: IBM Granite RAG ====
-    # ==== Space production flow: IBM Granite RAG ====
-    # ==== Space production flow: IBM Granite RAG ====
-    # Prepare system prompt
-    system_prompt = (
-        "Knowledge Cutoff Date: April 2024. Today's Date: April 12, 2025. "
-        "You are Alexander, the front desk assistant at Family Village Inn in Cyprus."
-        "You only know what’s in the provided documents."
-        "Greet guests politely, but only engage in general chit‑chat if it helps answer their question about the hotel."
-        "Write the response to the user's questions about the hotel by strictly aligning with the facts in the provided documents. "
-        "If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data."
-    )
-    system_prompt = (
-        "Knowledge Cutoff Date: April 2024. Today's Date: April 12, 2025. "
-        "You are Alexander, the front desk assistant at Family Village Inn in Cyprus. "
-        "You only know what’s in the provided documents. "
-        "Greet guests politely, and only engage in general chit‑chat if it helps answer their question about the hotel."
-        "Answer their questions by strictly using the facts in the documents. "
-        "If the information isn’t available, say: "
-        "\"I'm sorry, but I don't have enough information to answer that question.\""
-    )
-    # Start building message list
-    messages = [{"role": "system", "content": system_prompt}]
-    # Inject each document with role 'document' and metadata
-    for doc_id, doc_content in load_hotel_docs(hotel_id):
-        messages.append({
-            "role": "document",
-            "content": doc_content,
-            "document_id": doc_id
-        })
-    # Finally add the user turn
-    messages.append({"role": "user", "content": message})
-    # Apply the model's chat template (IBM-trained template)
-    input_text = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-    print("printing templated chat (pre-tokenized), ready for sending to the model\n")
-    print(input_text)
-    # Tokenize, generate, and decode
-    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
-    outputs = model.generate(
-        inputs,
-        max_new_tokens=1024,
-        do_sample=False
-    )
-    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
-    print("RAW DECODED:\n", decoded)
-    # Extract the assistant's reply
-    response = decoded.split("<|start_of_role|>assistant")[-1].split("<|end_of_role|>")[0]
-    #history.append(("assistant", f"{response}\n_(Model: {model_name})_"))
-    history.append(("assistant", f"{response}"))
-    # Clear textbox by returning empty string as third output
-    return history, history, ""
 # Available hotels
-hotel_ids = [
-    "cyprus-guesthouse-family",
-    "coastal-villa-family",
-    "village-inn-family"
-]
-# Gradio interface setup
-demo = gr.Blocks()
-with demo:
-    gr.Markdown("### 🏨 Hotel Chatbot Demo")
-    gr.Markdown(f"Currently running: **{model_name}**", elem_id="model‑status")
     with gr.Row():
-        hotel_selector = gr.Dropdown(hotel_ids, label="Choose a hotel", value=hotel_ids[0])
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox(placeholder="Ask me about the hotel...", show_label=False)
     msg.submit(
         fn=chat,
         inputs=[msg, chatbot, hotel_selector],
-        outputs=[chatbot, chatbot, msg]
     )
-    gr.Markdown("⚠️ **Reminder:** Pause the Space when done to avoid GPU charges.")
 if __name__ == "__main__":
     demo.launch()

 import os
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+# Detect Space environment by SPACE_ID env var
+env = os.environ
+is_space = env.get("SPACE_ID") is not None
+print("RUNNING IN SPACE?", is_space)
+# Model selection
 if is_space:
     primary_checkpoint = "ibm-granite/granite-3.3-2b-instruct"
     fallback_checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
 else:
     primary_checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
     fallback_checkpoint = None
 # Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load model with fallback
 def load_model():
+    print(f"🔍 Loading model: {primary_checkpoint}")
     try:
         tokenizer = AutoTokenizer.from_pretrained(
             primary_checkpoint,
             use_fast=True
         )
         model = AutoModelForCausalLM.from_pretrained(
             primary_checkpoint,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True
         ).to(device)
+        print(f"✅ Loaded primary {primary_checkpoint}")
         return tokenizer, model, primary_checkpoint
     except Exception as e:
+        print(f"❌ Primary load failed: {e}")
         if fallback_checkpoint:
             print(f"🔁 Falling back to {fallback_checkpoint}")
             tokenizer = AutoTokenizer.from_pretrained(fallback_checkpoint)
+            model = AutoModelForCausalLM.from_pretrained(fallback_checkpoint).to(device)
+            print(f"✅ Loaded fallback {fallback_checkpoint}")
             return tokenizer, model, fallback_checkpoint
         raise
 tokenizer, model, model_name = load_model()
+# Load hotel docs
+def load_hotel_docs(hotel_id):
     path = os.path.join("knowledge", f"{hotel_id}.txt")
     if not os.path.exists(path):
         return []
+    content = open(path, encoding="utf-8").read().strip()
+    return [(hotel_id, content)]
+# Chat function
 def chat(message, history, hotel_id):
+    # Convert incoming UI history (list of dicts) to tuple list
     if history is None:
+        history_tuples = []
+    else:
+        history_tuples = [(m['role'], m['content']) for m in history]
+    # Append the new user turn
+    history_tuples.append(("user", message))
+    # Yield user message immediately
+    ui_history = [{"role": r, "content": c} for r, c in history_tuples]
+    yield ui_history, ""
+    # Local Qwen flow
     if not is_space:
+        # Build messages including the new user turn
+        msgs = [{"role": role, "content": content} for role, content in history_tuples]
         input_text = tokenizer.apply_chat_template(
             msgs,
             tokenize=False,
             add_generation_prompt=True
         )
         inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+        outputs = model.generate(inputs, max_new_tokens=1024, do_sample=False)
+        decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
+        # Extract assistant response
+        response = decoded.split("<|im_start|>assistant")[-1]
+        response = response.split("<|im_end|>")[0].strip()
+    else:
+        # IBM Granite RAG flow
+        system_prompt = (
+            "Knowledge Cutoff Date: April 2024. Today's Date: April 12, 2025. "
+            "You are Alexander, the front desk assistant at Family Village Inn in Cyprus. "
+            "You only know what's in the provided documents. "
+            "Greet guests politely, but only chit-chat when it helps answer hotel questions. "
+            "Answer using only facts from the documents; if unavailable, say you cannot answer."
+        )
+        messages = [{"role": "system", "content": system_prompt}]
+        for doc_id, doc_content in load_hotel_docs(hotel_id):
+            messages.append({"role": "document", "content": doc_content, "document_id": doc_id})
+        # Include full history including the new user message
+        for role, content in history_tuples:
+            messages.append({"role": role, "content": content})
+        input_text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
         )
+        inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+        outputs = model.generate(inputs, max_new_tokens=1024, do_sample=False)
         decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
+        response = decoded.split("<|start_of_role|>assistant<|end_of_role|>")[-1]
+        response = response.split("<|end_of_text|>")[0].strip()
+    # Append assistant reply to history
+    history_tuples.append(("assistant", f"{response}"))
+    # Final yield with assistant reply
+    ui_history = [{"role": r, "content": c} for r, c in history_tuples]
+    yield ui_history, ""
 # Available hotels
+hotel_ids = ["cyprus-guesthouse-family", "coastal-villa-family", "village-inn-family"]
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("### 🏨 Multi-Hotel Chatbot Demo")
+    gr.Markdown(f"**Running:** {model_name}")
+    hotel_selector = gr.Dropdown(hotel_ids, label="Hotel", value=hotel_ids[0])
+    #chatbot = gr.Chatbot(type="messages")
     with gr.Row():
+        chatbot = gr.Chatbot(type="messages")
+    msg = gr.Textbox(show_label=False, placeholder="Ask about the hotel...")
     msg.submit(
         fn=chat,
         inputs=[msg, chatbot, hotel_selector],
+        outputs=[chatbot, msg]
     )
+    gr.Markdown("⚠️ Pause the Space when done to avoid charges.")
+# Enable streaming queue for generator-based chat
+demo.queue()
 if __name__ == "__main__":
     demo.launch()

app2.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import os
+import getpass
+import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# Detect execution environment: treated as a Space when the process runs as user 'user'
+is_space = (getpass.getuser() == "user")
+print("RUNNING AS USER:", getpass.getuser())
+# Choose model checkpoints based on environment
+if is_space:
+    primary_checkpoint = "ibm-granite/granite-3.3-2b-instruct"
+    fallback_checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
+else:
+    # Local development: use smaller Qwen model only
+    primary_checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
+    fallback_checkpoint = None
+# Device setup
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load tokenizer and model (with fallback on Spaces)
+def load_model():
+    print(f"🔍 Trying to load PRIMARY: {primary_checkpoint}")
+    try:
+        #tokenizer = AutoTokenizer.from_pretrained(primary_checkpoint)
+        #model     = AutoModelForCausalLM.from_pretrained(primary_checkpoint).to(device)
+        # faster loading for large Granite model
+        tokenizer = AutoTokenizer.from_pretrained(
+            primary_checkpoint,
+            use_fast=True
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            primary_checkpoint,
+            torch_dtype=torch.float16,      # 16‑bit weights
+            low_cpu_mem_usage=True         # memory‑efficient
+            #device_map="auto"               # auto shard on GPU
+        ).to(device)
+        print("✅ Loaded PRIMARY ✓")
+        return tokenizer, model, primary_checkpoint
+    except Exception as e:
+        print("❌ PRIMARY failed:", e)
+        if fallback_checkpoint:
+            print(f"🔁 Falling back to {fallback_checkpoint}")
+            tokenizer = AutoTokenizer.from_pretrained(fallback_checkpoint)
+            model     = AutoModelForCausalLM.from_pretrained(fallback_checkpoint).to(device)
+            print("✅ Loaded FALLBACK ✓")
+            return tokenizer, model, fallback_checkpoint
+        raise
+tokenizer, model, model_name = load_model()
+# Load hotel-specific documents from disk as (document_id, content) pairs
+def load_hotel_docs(hotel_id: str):
+    path = os.path.join("knowledge", f"{hotel_id}.txt")
+    if not os.path.exists(path):
+        return []
+    content = open(path, "r", encoding="utf-8").read().strip()
+    # Use a single document; document_id can be hotel_id
+    return [(f"{hotel_id}-info", content)]
+# Chat function integrating both local Qwen flow and IBM Granite RAG template with document roles
+def chat(message, history, hotel_id):
+    if history is None:
+        history = []
+    # Append user message
+    history.append(("user", message))
+    # ==== Local development flow: simple chat via Qwen ====
+    # ==== Local development flow: simple chat via Qwen ====
+    # ==== Local development flow: simple chat via Qwen ====
+    # ==== Local development flow: simple chat via Qwen ====
+    if not is_space:
+        # Build message dict list from history tuples
+        msgs = [{"role": role, "content": content} for role, content in history]
+        # Apply Qwen's chat template
+        input_text = tokenizer.apply_chat_template(
+            msgs,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        print("printing templated chat (pre-tokenizes), ready for sending to the model\n")
+        print(input_text)
+        # Generate response
+        inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+        outputs = model.generate(
+            inputs,
+            max_new_tokens=1024,
+            do_sample=False
+        )
+        decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
+        print("RAW DECODED:\n", decoded)
+        #response = decoded.split("<|assistant|>")[-1].strip()
+        response = decoded.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0]
+        # history.append(("assistant", f"{response}\n_(Model: {model_name})_"))
+        history.append(("assistant", f"{response}"))
+        # Clear textbox by returning empty string as third output
+        return history, history, ""
+    # ==== Space production flow: IBM Granite RAG ====
+    # ==== Space production flow: IBM Granite RAG ====
+    # ==== Space production flow: IBM Granite RAG ====
+    # ==== Space production flow: IBM Granite RAG ====
+    # Prepare system prompt
+    system_prompt = (
+        "Knowledge Cutoff Date: April 2024. Today's Date: April 12, 2025. "
+        "You are Alexander, the front desk assistant at Family Village Inn in Cyprus."
+        "You only know what’s in the provided documents."
+        "Greet guests politely, but only engage in general chit‑chat if it helps answer their question about the hotel."
+        "Write the response to the user's questions about the hotel by strictly aligning with the facts in the provided documents. "
+        "If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data."
+    )
+    system_prompt = (
+        "Knowledge Cutoff Date: April 2024. Today's Date: April 12, 2025. "
+        "You are Alexander, the front desk assistant at Family Village Inn in Cyprus. "
+        "You only know what’s in the provided documents. "
+        "Greet guests politely, and only engage in general chit‑chat if it helps answer their question about the hotel."
+        "Answer their questions by strictly using the facts in the documents. "
+        "If the information isn’t available, say: "
+        "\"I'm sorry, but I don't have enough information to answer that question.\""
+    )
+    # Start building message list
+    messages = [{"role": "system", "content": system_prompt}]
+    # Inject each document with role 'document' and metadata
+    for doc_id, doc_content in load_hotel_docs(hotel_id):
+        messages.append({
+            "role": "document",
+            "content": doc_content,
+            "document_id": doc_id
+        })
+    # Finally add the user turn
+    messages.append({"role": "user", "content": message})
+    # Apply the model's chat template (IBM-trained template)
+    input_text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    print("printing templated chat (pre-tokenized), ready for sending to the model\n")
+    print(input_text)
+    # Tokenize, generate, and decode
+    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+    outputs = model.generate(
+        inputs,
+        max_new_tokens=1024,
+        do_sample=False
+    )
+    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
+    print("RAW DECODED:\n", decoded)
+    # Extract the assistant's reply
+    response = decoded.split("<|start_of_role|>assistant")[-1].split("<|end_of_role|>")[0]
+    #history.append(("assistant", f"{response}\n_(Model: {model_name})_"))
+    history.append(("assistant", f"{response}"))
+    # Clear textbox by returning empty string as third output
+    return history, history, ""
+# Available hotels
+hotel_ids = [
+    "cyprus-guesthouse-family",
+    "coastal-villa-family",
+    "village-inn-family"
+]
+# Gradio interface setup
+demo = gr.Blocks()
+with demo:
+    gr.Markdown("### 🏨 Hotel Chatbot Demo")
+    gr.Markdown(f"Currently running: **{model_name}**", elem_id="model‑status")
+    with gr.Row():
+        hotel_selector = gr.Dropdown(hotel_ids, label="Choose a hotel", value=hotel_ids[0])
+    chatbot = gr.Chatbot()
+    msg = gr.Textbox(placeholder="Ask me about the hotel...", show_label=False)
+    msg.submit(
+        fn=chat,
+        inputs=[msg, chatbot, hotel_selector],
+        outputs=[chatbot, chatbot, msg]
+    )
+    gr.Markdown("⚠️ **Reminder:** Pause the Space when done to avoid GPU charges.")
+if __name__ == "__main__":
+    demo.launch()