Spaces:
Sleeping
Sleeping
File size: 7,938 Bytes
185bd57 3489832 185bd57 3489832 185bd57 3489832 185bd57 37bd78c 3489832 37bd78c 3489832 37bd78c 185bd57 37bd78c 3489832 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
import streamlit as st
import os
from openai import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, TextLoader
import tempfile
# Page configuration.
# The robot emoji had been mis-decoded into "π€" (UTF-8 bytes read with a
# single-byte Greek code page); it is restored here so the browser tab icon
# and the page title render as intended.
st.set_page_config(page_title="DeepSeek RAG Chatbot", page_icon="🤖", layout="wide")

# App title and description
st.title("🤖 DeepSeek RAG Chatbot")
st.subheader("A chatbot that uses your documents to give informed answers")
# Set up API key input.
# The key is requested only while the session has none; once stored, the
# script reruns so the rest of the page is drawn with the key available.
if 'DEEPSEEK_API_KEY' not in st.session_state:
    api_key = st.text_input("Enter your DeepSeek API Key:", type="password")
    # Pasted keys frequently carry stray spaces or a trailing newline, which
    # would be stored verbatim and sent to the API; normalise before saving
    # and treat whitespace-only input as "nothing entered".
    api_key = api_key.strip() if api_key else ""
    if api_key:
        st.session_state['DEEPSEEK_API_KEY'] = api_key
        os.environ['DEEPSEEK_API_KEY'] = api_key
        st.success("API Key saved!")
        st.rerun()
# Initialize session state variables (each one only on the first run of the
# session; later reruns keep whatever is already stored).
if 'memory' not in st.session_state:
    st.session_state.memory = ConversationBufferMemory(return_messages=True)
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'vectorstore' not in st.session_state:
    st.session_state.vectorstore = None

# Load the chat client and the embeddings model once an API key is present.
# Both handles are checked: guarding on 'client' alone meant that a failure
# while loading the embeddings (after the client had already been stored)
# was never retried, leaving st.session_state.embeddings undefined for
# document processing.
models_ready = (
    'client' in st.session_state
    and 'embeddings' in st.session_state
)
if not models_ready and 'DEEPSEEK_API_KEY' in st.session_state:
    try:
        # Initialize DeepSeek client for chat
        client = OpenAI(
            api_key=st.session_state['DEEPSEEK_API_KEY'],
            base_url="https://api.deepseek.com"
        )
        # Initialize small HuggingFace embeddings model
        # (paraphrase-MiniLM-L3-v2, chosen for its small download size)
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"
        )
    except Exception as e:
        st.error(f"Error initializing API: {str(e)}")
    else:
        # Publish both handles together so the session never holds one
        # without the other.
        st.session_state.client = client
        st.session_state.embeddings = embeddings
        st.success("Models loaded successfully!")
# Function to process uploaded documents
def process_documents(uploaded_files) -> int:
    """Load uploaded PDF/text files, chunk them and add them to the vector store.

    Each upload is written to a temporary directory, loaded with
    ``PyPDFLoader`` (``.pdf``) or ``TextLoader`` (``.txt``), split into
    1000-character chunks with 200 characters of overlap, and indexed in
    ``st.session_state.vectorstore`` (created on first use, extended
    afterwards) using ``st.session_state.embeddings``.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``.name``
            and ``.getbuffer()`` (as used below). Files with any other
            extension are skipped.

    Returns:
        The number of chunks added to the vector store; ``0`` when there was
        nothing to index.
    """
    if not uploaded_files:
        return 0

    loaders_by_extension = {".pdf": PyPDFLoader, ".txt": TextLoader}
    documents = []

    # TemporaryDirectory removes the on-disk copies as soon as loading is
    # finished; the previous mkdtemp() directory was never cleaned up.
    with tempfile.TemporaryDirectory() as temp_dir:
        for index, file in enumerate(uploaded_files):
            # Case-insensitive match so "REPORT.PDF" is handled like "report.pdf".
            extension = os.path.splitext(file.name)[1].lower()
            loader_cls = loaders_by_extension.get(extension)
            if loader_cls is None:
                continue

            # basename() drops any directory part of the client-supplied name,
            # and the index prefix keeps two uploads with the same name from
            # overwriting each other (which indexed one file twice).
            safe_name = f"{index}_{os.path.basename(file.name)}"
            file_path = os.path.join(temp_dir, safe_name)
            with open(file_path, "wb") as f:
                f.write(file.getbuffer())

            loaded = loader_cls(file_path).load()
            for doc in loaded:
                # The temp path is gone after this function returns, so record
                # the original upload name as the document source instead.
                doc.metadata["source"] = file.name
            documents.extend(loaded)

    if not documents:
        return 0

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    document_chunks = text_splitter.split_documents(documents)
    if not document_chunks:
        # Nothing to embed (e.g. only empty files); leave the store untouched
        # rather than asking FAISS to build an index from zero chunks.
        return 0

    # Create or update vector store
    if st.session_state.vectorstore is None:
        st.session_state.vectorstore = FAISS.from_documents(
            document_chunks,
            st.session_state.embeddings
        )
    else:
        # Add new documents to existing vectorstore
        st.session_state.vectorstore.add_documents(document_chunks)
    return len(document_chunks)
# Function to retrieve relevant context from vector database
def retrieve_context(query, k=3):
    """Return the text of the ``k`` stored chunks most similar to ``query``.

    The chunks' ``page_content`` values are joined with blank lines. An empty
    string is returned when no vector store has been built yet.
    """
    store = st.session_state.vectorstore
    if store is None:
        return ""
    matches = store.similarity_search(query, k=k)
    return "\n\n".join(match.page_content for match in matches)
# Main application layout: sidebar (upload, retrieval settings, reset
# buttons, about text) plus the rendered chat history. Drawn only once an
# API key is stored in the session.
if 'DEEPSEEK_API_KEY' in st.session_state:
    # Create a sidebar for document upload and settings
    with st.sidebar:
        st.header("Document Upload")
        uploaded_files = st.file_uploader(
            "Upload your documents",
            accept_multiple_files=True,
            type=["pdf", "txt"]
        )
        if uploaded_files:
            if st.button("Process Documents"):
                # Report loader/embedding failures in the sidebar, matching
                # the st.error handling used for the API calls elsewhere in
                # this script.
                try:
                    with st.spinner("Processing documents..."):
                        num_chunks = process_documents(uploaded_files)
                except Exception as e:
                    st.error(f"Error processing documents: {str(e)}")
                else:
                    st.success(f"Successfully processed {len(uploaded_files)} documents into {num_chunks} chunks!")

        st.header("RAG Settings")
        # Read by the chat handler further down as the retrieval depth.
        k_documents = st.slider("Number of documents to retrieve", min_value=1, max_value=10, value=3)

        # Clear conversation button
        if st.button("Clear Conversation"):
            st.session_state.memory = ConversationBufferMemory(return_messages=True)
            st.session_state.chat_history = []
            st.success("Conversation cleared!")
            st.rerun()

        # Clear knowledge base button
        if st.button("Clear Knowledge Base"):
            st.session_state.vectorstore = None
            st.success("Knowledge base cleared!")

        st.header("About")
        # Emoji restored from mojibake; blank lines separate the two lists so
        # "The chatbot can:" is not rendered as part of the preceding bullet.
        # The text stays at column 0 so Markdown does not treat it as a code
        # block.
        st.markdown("""
This RAG chatbot uses:

- 🦜 LangChain for memory and document processing
- 🔍 FAISS for vector storage and retrieval
- 🧠 HuggingFace for lightweight embeddings (paraphrase-MiniLM-L3-v2)
- 🤖 DeepSeek API for AI responses
- 🖥️ Streamlit for the web interface

The chatbot can:

- Upload and process PDF and text documents
- Retrieve relevant information from documents
- Generate informed responses using your documents
- Maintain conversation context
""")

    # Main chat area - create a container for the chat history
    chat_container = st.container()
    with chat_container:
        # Display chat history
        for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                st.write(message["content"])
# IMPORTANT: chat_input stays at the main page level, outside of every
# container and if block.
user_input = st.chat_input("Type your message here...")

# A submitted prompt is handled only when an API key is stored in the session.
if user_input and 'DEEPSEEK_API_KEY' in st.session_state:
    history = st.session_state.chat_history

    # Record the user's turn, then echo it into the chat area.
    history.append({"role": "user", "content": user_input})
    with st.chat_message("user"):
        st.write(user_input)

    # Produce and show the assistant's turn.
    with st.chat_message("assistant"), st.spinner("Thinking..."):
        try:
            # Look up supporting passages in the vector database.
            knowledge = retrieve_context(user_input, k=k_documents)

            # System prompt, extended with the retrieved passages when any
            # were found.
            system_prompt = "You are a helpful assistant with access to a knowledge base."
            if knowledge:
                system_prompt += f"\n\nRelevant information from knowledge base:\n{knowledge}\n\nUse this information to answer the user's question. If the information doesn't contain the answer, just say that you don't know based on the available information."

            # System prompt first, then every stored turn (including the one
            # just appended) in order.
            messages = [{"role": "system", "content": system_prompt}]
            messages.extend(
                {"role": turn["role"], "content": turn["content"]}
                for turn in history
            )

            # Call DeepSeek API (non-streaming).
            completion = st.session_state.client.chat.completions.create(
                model="deepseek-chat",
                messages=messages,
                stream=False
            )
            reply = completion.choices[0].message.content
            st.write(reply)

            # Keep the assistant's turn for later requests and redraws.
            history.append({"role": "assistant", "content": reply})
        except Exception as e:
            st.error(f"Error: {str(e)}")