Update app.py
app.py
CHANGED
@@ -7,8 +7,8 @@ from llama_cpp import Llama
 st.set_page_config(page_title="Phi-3 Mini Chatbot", layout="centered")
 
 # ✅ Define model path
-MODEL_PATH = "./Phi-3-mini-4k-instruct-
-MODEL_URL = "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-
+MODEL_PATH = "./Phi-3-mini-4k-instruct-q3.gguf"
+MODEL_URL = "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q3.gguf"
 
 # ✅ Check if model exists, otherwise download
 if not os.path.exists(MODEL_PATH):
@@ -29,10 +29,10 @@ try:
     if "model" not in st.session_state:
         st.session_state["model"] = Llama(
             model_path=MODEL_PATH,
-            n_ctx=
+            n_ctx=256, # ✅ Lower memory usage, speeds up responses
             n_threads=2, # Matches available vCPUs
             numa=True,
-            n_batch=
+            n_batch=64 # ✅ Faster token processing
         )
         st.write("✅ Model loaded successfully!")
 except Exception as e:
@@ -65,11 +65,11 @@ if st.button("Send") and user_input:
     # ✅ Use a minimal prompt format (no system message)
     formatted_messages = [{"role": "user", "content": user_input}]
 
-    # ✅
+    # ✅ Speed improvements: Reduce response length & force short answers
     response_data = st.session_state["model"].create_chat_completion(
         messages=formatted_messages,
-        max_tokens=
-        stream=False #
+        max_tokens=64, temperature=0.5, top_p=0.8,
+        stream=False # ✅ No streaming for debugging
     )
 
     # ✅ Debugging output
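For context, here is a minimal, self-contained sketch of roughly what the updated app.py does after this commit. Only the model path/URL, the Llama() arguments, and the create_chat_completion() parameters come from the diff above; the download step, the input widget, and the response handling are assumptions, since those parts of the file fall outside these hunks.

# Hedged sketch of the updated app.py, not the exact Space code.
import os
import urllib.request

import streamlit as st
from llama_cpp import Llama

st.set_page_config(page_title="Phi-3 Mini Chatbot", layout="centered")

MODEL_PATH = "./Phi-3-mini-4k-instruct-q3.gguf"
MODEL_URL = "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q3.gguf"

# Download the GGUF file on first run (the Space's actual download code is not
# shown in these hunks; urlretrieve is just one straightforward way to do it).
if not os.path.exists(MODEL_PATH):
    with st.spinner("Downloading model..."):
        urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)

try:
    # Load the model once and cache it in the Streamlit session.
    if "model" not in st.session_state:
        st.session_state["model"] = Llama(
            model_path=MODEL_PATH,
            n_ctx=256,    # small context window -> lower memory use
            n_threads=2,  # matches the Space's 2 vCPUs
            numa=True,
            n_batch=64,   # batch size for prompt processing
        )
        st.write("Model loaded successfully!")
except Exception as e:
    st.error(f"Failed to load model: {e}")
    st.stop()

user_input = st.text_input("Your message")  # assumed widget

if st.button("Send") and user_input:
    # Minimal prompt format: a single user message, no system prompt.
    formatted_messages = [{"role": "user", "content": user_input}]

    response_data = st.session_state["model"].create_chat_completion(
        messages=formatted_messages,
        max_tokens=64, temperature=0.5, top_p=0.8,
        stream=False,  # one complete response object, easier to debug
    )

    # With stream=False the result is a single OpenAI-style dict.
    st.write(response_data["choices"][0]["message"]["content"])

With stream=False, llama-cpp-python returns the whole completion as one OpenAI-style dict, so the reply text is read from response_data["choices"][0]["message"]["content"]; switching to stream=True would instead yield an iterator of partial chunks.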