Skier8402 committed on
Commit
8141ad0
·
verified ·
1 Parent(s): 24a1172

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -36
app.py CHANGED
@@ -17,9 +17,23 @@ from langchain.memory import ConversationBufferMemory
17
  from langchain.chains import ConversationalRetrievalChain
18
  from htmlTemplates import css, bot_template, user_template
19
  from langchain.llms import HuggingFaceHub
20
- from langchain.chains import RetrievalQA
21
 
22
  def get_pdf_text(pdf_docs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  text = ""
24
  for pdf in pdf_docs:
25
  pdf_reader = PdfReader(pdf)
@@ -27,82 +41,134 @@ def get_pdf_text(pdf_docs):
27
  text += page.extract_text()
28
  return text
29
 
 
def get_text_chunks(text):
    """
    Split raw text into chunks using ``CharacterTextSplitter``.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    list
        The chunks returned by ``CharacterTextSplitter.split_text``.

    """
    # Split on newlines, with chunk_size=1000 and chunk_overlap=200, both
    # measured with ``len`` (i.e. in characters).
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
39
 
 
def get_vectorstore(text_chunks):
    """
    Embed text chunks with a BGE model and index them in a FAISS store.

    Parameters
    ----------
    text_chunks : list
        Text chunks to embed. Must contain at least one chunk.

    Returns
    -------
    FAISS
        The vector store built by ``FAISS.from_texts`` from the chunks.

    Raises
    ------
    ValueError
        If ``text_chunks`` is empty, e.g. because no text could be
        extracted from the uploaded PDFs.

    """
    # With no chunks there is no embedding to size the FAISS index from, so
    # fail early with a clear message (and before the embedding model is
    # loaded) rather than with an opaque error from inside the library.
    if not text_chunks:
        raise ValueError(
            "No text chunks to index; the documents may contain no "
            "extractable text."
        )

    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-base-en-v1.5",
        encode_kwargs={"normalize_embeddings": True},
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
50
 
 
def get_conversation_chain(vectorstore):
    """
    Build a conversational retrieval chain over the given vector store.

    Parameters
    ----------
    vectorstore : FAISS
        Vector store whose ``as_retriever()`` supplies the retriever.

    Returns
    -------
    ConversationalRetrievalChain
        Chain that keeps the chat history in memory and also returns the
        source documents used for each answer.

    """
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-v0.3",
        model_kwargs={"temperature": 0.5, "max_length": 4000},
    )

    # With return_source_documents=True the chain produces two outputs
    # ("answer" and "source_documents"). The memory must be told which one to
    # store, otherwise saving a turn fails with "One output key expected".
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )
65
 
 
def handle_userinput(user_question):
    """
    Answer a user question and render the conversation with its references.

    Parameters
    ----------
    user_question : str
        The question typed by the user.

    Returns
    -------
    None
        All output is written to the Streamlit page.

    """
    import html  # local import: only needed to escape chat text below

    # The chain is only created once documents have been processed; calling
    # ``None`` would raise a TypeError, so ask the user to process first.
    conversation = st.session_state.conversation
    if conversation is None:
        st.warning("Please upload and process your documents before asking a question.")
        return

    response = conversation({"question": user_question})
    st.session_state.chat_history = response["chat_history"]

    # Even positions are rendered with the user template, odd with the bot one.
    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        # The templates are rendered as raw HTML, so escape the message text
        # to keep user or model output from injecting markup.
        safe_text = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_text), unsafe_allow_html=True)

    # Display references.
    # NOTE(review): the fallbacks are shown whenever a document carries no
    # "source"/"page" metadata — confirm the vector store is built with metadata.
    if "source_documents" in response:
        st.write("References:")
        for doc in response["source_documents"]:
            source = doc.metadata.get("source", "Unknown source")
            page = doc.metadata.get("page", "Unknown page")
            st.write(f"- {source}, page {page}")
81
 
82
  def main():
83
- load_dotenv()
84
- st.set_page_config(page_title="Chat with Multiple PDFs", page_icon=":books:")
 
 
 
 
 
 
 
 
 
85
  st.write(css, unsafe_allow_html=True)
86
 
 
 
 
 
 
 
 
 
 
 
87
  if "conversation" not in st.session_state:
88
  st.session_state.conversation = None
89
  if "chat_history" not in st.session_state:
90
  st.session_state.chat_history = None
91
 
92
- st.header("Chat with Multiple PDFs :books:")
93
-
94
- # Add Hugging Face token input
95
- huggingface_token = st.text_input("Enter your Hugging Face API token:", type="password")
96
- if huggingface_token:
97
- os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
98
-
99
  user_question = st.text_input("Ask a question about your documents:")
100
-
101
  if user_question:
102
- if not huggingface_token:
103
- st.error("Please enter your Hugging Face API token to proceed.")
104
- else:
105
- handle_userinput(user_question)
106
 
107
  with st.sidebar:
108
  st.subheader("Your documents")
@@ -111,10 +177,18 @@ def main():
111
  )
112
  if st.button("Process"):
113
  with st.spinner("Processing"):
 
114
  raw_text = get_pdf_text(pdf_docs)
 
 
115
  text_chunks = get_text_chunks(raw_text)
 
 
116
  vectorstore = get_vectorstore(text_chunks)
 
 
117
  st.session_state.conversation = get_conversation_chain(vectorstore)
118
 
 
119
  if __name__ == "__main__":
120
  main()
 
17
  from langchain.chains import ConversationalRetrievalChain
18
  from htmlTemplates import css, bot_template, user_template
19
  from langchain.llms import HuggingFaceHub
20
+
21
 
22
  def get_pdf_text(pdf_docs):
23
+ """
24
+ Extract text from a list of PDF documents.
25
+
26
+ Parameters
27
+ ----------
28
+ pdf_docs : list
29
+ List of PDF documents to extract text from.
30
+
31
+ Returns
32
+ -------
33
+ str
34
+ Extracted text from all the PDF documents.
35
+
36
+ """
37
  text = ""
38
  for pdf in pdf_docs:
39
  pdf_reader = PdfReader(pdf)
 
41
  text += page.extract_text()
42
  return text
43
 
44
+
def get_text_chunks(text):
    """
    Break the input text into chunks with ``CharacterTextSplitter``.

    Parameters
    ----------
    text : str
        The text to break up.

    Returns
    -------
    list
        The chunks returned by ``CharacterTextSplitter.split_text``.

    """
    # Newline-separated splitting with chunk_size=1500 and chunk_overlap=300,
    # both measured with ``len``.
    splitter = CharacterTextSplitter(
        chunk_overlap=300,
        chunk_size=1500,
        length_function=len,
        separator="\n",
    )
    return splitter.split_text(text)
65
 
66
+
def get_vectorstore(text_chunks):
    """
    Generate a FAISS vector store from text chunks using HuggingFace BGE embeddings.

    Parameters
    ----------
    text_chunks : list
        List of text chunks to be embedded. Must not be empty.

    Returns
    -------
    FAISS
        A FAISS vector store built by ``FAISS.from_texts`` from the chunks.

    Raises
    ------
    ValueError
        If ``text_chunks`` is empty, e.g. because no text could be
        extracted from the uploaded PDFs.

    """
    # With no chunks there is no embedding to size the FAISS index from, so
    # fail early with a clear message (and before the embedding model is
    # loaded) rather than with an opaque error from inside the library.
    if not text_chunks:
        raise ValueError(
            "No text chunks to index; the documents may contain no "
            "extractable text."
        )

    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-base-en-v1.5",
        # Set True to compute cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
        model_kwargs={"device": "cpu"},
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
91
 
92
+
def get_conversation_chain(vectorstore):
    """
    Assemble the conversational retrieval chain used to answer questions.

    Parameters
    ----------
    vectorstore : FAISS
        Vector store whose ``as_retriever()`` supplies the retriever.

    Returns
    -------
    ConversationalRetrievalChain
        Chain combining the language model, the retriever and a buffer
        memory stored under the ``chat_history`` key.

    """
    hub_model_kwargs = {"temperature": 0.5, "max_length": 4000}
    # NOTE: an OpenAI chat model (gpt-3.5-turbo-0613, temperature 0) was
    # previously tried here as an alternative backend.
    language_model = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-v0.3",
        model_kwargs=hub_model_kwargs,
    )

    chat_memory = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True
    )
    doc_retriever = vectorstore.as_retriever()

    return ConversationalRetrievalChain.from_llm(
        llm=language_model,
        retriever=doc_retriever,
        memory=chat_memory,
    )
119
 
120
+
def handle_userinput(user_question):
    """
    Handle user input and render the conversation returned by the chain.

    Parameters
    ----------
    user_question : str
        The user's question.

    Returns
    -------
    None
        All output is written to the Streamlit page.

    """
    # The chain is only created once documents have been processed; calling
    # ``None`` would raise a TypeError, so ask the user to process first.
    conversation = st.session_state.conversation
    if conversation is None:
        st.warning("Please upload and process your documents before asking a question.")
        return

    response = conversation({"question": user_question})
    st.session_state.chat_history = response["chat_history"]

    # Even positions are labelled as the user, odd positions as the bot.
    for i, message in enumerate(st.session_state.chat_history):
        speaker = "//_^ User: " if i % 2 == 0 else "🤖 ChatBot: "
        st.write(speaker + message.content)
137
+
 
 
 
 
 
138
 
139
  def main():
140
+ """
141
+ Putting it all together.
142
+ """
143
+ st.set_page_config(
144
+ page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
145
+ page_icon=":books:",
146
+ )
147
+
148
+ st.markdown("# Chat with a Bot")
149
+ st.markdown("This bot tries to answer questions about multiple PDFs. Let the processing of the PDF finish before adding your question. 🙏🏾")
150
+
151
  st.write(css, unsafe_allow_html=True)
152
 
153
+ # set huggingface hub token in st.text_input widget
154
+ # then hide the input
155
+ huggingface_token = st.text_input("Enter your HuggingFace Hub token", type="password")
156
+ #openai_api_key = st.text_input("Enter your OpenAI API key", type="password")
157
+
158
+ # set this key as an environment variable
159
+ os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
160
+ #os.environ["OPENAI_API_KEY"] = openai_api_key
161
+
162
+
163
  if "conversation" not in st.session_state:
164
  st.session_state.conversation = None
165
  if "chat_history" not in st.session_state:
166
  st.session_state.chat_history = None
167
 
168
+ st.header("Chat with a Bot 🤖🦾 that tries to answer questions about multiple PDFs :books:")
 
 
 
 
 
 
169
  user_question = st.text_input("Ask a question about your documents:")
 
170
  if user_question:
171
+ handle_userinput(user_question)
 
 
 
172
 
173
  with st.sidebar:
174
  st.subheader("Your documents")
 
177
  )
178
  if st.button("Process"):
179
  with st.spinner("Processing"):
180
+ # get pdf text
181
  raw_text = get_pdf_text(pdf_docs)
182
+
183
+ # get the text chunks
184
  text_chunks = get_text_chunks(raw_text)
185
+
186
+ # create vector store
187
  vectorstore = get_vectorstore(text_chunks)
188
+
189
+ # create conversation chain
190
  st.session_state.conversation = get_conversation_chain(vectorstore)
191
 
192
+
193
  if __name__ == "__main__":
194
  main()