Tanifh committed on
Commit 9e36cc1 · verified · 1 Parent(s): dd6665f

Update app.py

Files changed (1)
  1. app.py +28 -21
app.py CHANGED
@@ -24,10 +24,15 @@ if not os.path.exists(MODEL_PATH):
         st.error(f"🚨 Model download failed: {e}")
         st.stop()
 
-# ✅ Load model with reduced context length to reduce memory usage
+# ✅ Load optimized model
 try:
     if "model" not in st.session_state:
-        st.session_state["model"] = Llama(model_path=MODEL_PATH, n_ctx=2048)  # Reduced from 4096
+        st.session_state["model"] = Llama(
+            model_path=MODEL_PATH,
+            n_ctx=1024,     # Reduce context window for faster inference
+            n_threads=2,    # Match available CPU cores (2 vCPUs)
+            numa=True       # Enable NUMA optimization
+        )
     st.write("✅ Model loaded successfully!")
 except Exception as e:
     st.error(f"🚨 Error loading model: {e}")
@@ -55,30 +60,32 @@ if st.button("Send") and user_input:
     # Add user input to chat history
     st.session_state["messages"].append(("user", user_input))
     st.chat_message("user").write(user_input)
-
+
     # ✅ Format messages using Phi-3 chat template
     formatted_messages = [
        {"role": "system", "content": "You are an AI assistant. Provide clear and concise answers."},
        {"role": "user", "content": user_input}
     ]
-
-    # Generate response
-    try:
-        response = st.session_state["model"].create_chat_completion(
-            messages=formatted_messages,
-            max_tokens=1024, temperature=0.7, top_p=0.9
-        )
-
-        # ✅ Debugging output
-        st.write("🔍 Debug: Raw Model Response:", response)
-
-        response_text = response["choices"][0]["message"]["content"].strip()
-        st.session_state["messages"].append(("assistant", response_text))
-        st.chat_message("assistant").write(response_text)
-    except Exception as e:
-        st.error(f"🚨 Error generating response: {e}")
-
-# Run the app with: streamlit run app.py
+
+    # ✅ Streamed response for faster user experience
+    response_data = st.session_state["model"].create_chat_completion(
+        messages=formatted_messages,
+        max_tokens=256, temperature=0.7, top_p=0.9,
+        stream=True  # ✅ Enables real-time streaming
+    )
+
+    response_text = ""
+    response_container = st.empty()  # Placeholder for live updates
+
+    for chunk in response_data:
+        choice = chunk["choices"][0]
+        delta = choice.get("delta", {})  # streamed chunks carry "delta", not "message"
+        if "content" in delta:
+            response_text += delta["content"]
+            response_container.markdown(f"**AI:** {response_text}")
+        if choice.get("finish_reason") == "stop":
+            break
+
 
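For reference, with stream=True, llama-cpp-python yields OpenAI-style chunks whose text arrives under choices[0]["delta"]["content"] rather than under a "message" key (the first chunk typically carries only the role), which is why the loop above reads the delta. A self-contained sketch under that assumption; the model path is a placeholder:

import streamlit as st
from llama_cpp import Llama

llm = Llama(model_path="model.gguf", n_ctx=1024)  # placeholder path
stream = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello."}],
    max_tokens=64,
    stream=True,  # returns an iterator of incremental chunks
)

placeholder = st.empty()
text = ""
for chunk in stream:
    delta = chunk["choices"][0].get("delta", {})
    if "content" in delta:          # the first chunk carries only {"role": ...}
        text += delta["content"]
        placeholder.markdown(text)  # repaint the placeholder as tokens arrive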