Skier8402 committed on
Commit
1b26c07
·
verified ·
1 Parent(s): c00dbfa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -83
app.py CHANGED
@@ -17,103 +17,91 @@ from langchain.chains import ConversationalRetrievalChain
17
  from htmlTemplates import css, bot_template, user_template
18
  from langchain.llms import HuggingFaceHub
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def get_pdf_text(pdf_docs):
21
  text = ""
22
  for pdf in pdf_docs:
23
- try:
24
- pdf_reader = PdfReader(pdf)
25
- for page in pdf_reader.pages:
26
- text += page.extract_text()
27
- except Exception as e:
28
- st.error(f"Error extracting text from PDF: {e}")
29
  return text
30
 
31
  def get_text_chunks(text):
32
  text_splitter = CharacterTextSplitter(
33
- separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
 
 
 
34
  )
35
- try:
36
- chunks = text_splitter.split_text(text)
37
- except Exception as e:
38
- st.error(f"Error splitting text into chunks: {e}")
39
- chunks = []
40
  return chunks
41
 
42
  def get_vectorstore(text_chunks):
43
  model = "BAAI/bge-base-en-v1.5"
44
- encode_kwargs = {
45
- "normalize_embeddings": True
46
- }
47
- try:
48
- embeddings = HuggingFaceBgeEmbeddings(
49
- model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
50
- )
51
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
52
- except Exception as e:
53
- st.error(f"Error creating vector store: {e}")
54
- vectorstore = None
55
  return vectorstore
56
 
57
  def get_conversation_chain(vectorstore):
58
- if vectorstore is None:
59
- return None
 
 
60
 
61
- try:
62
- llm = HuggingFaceHub(
63
- repo_id="mistralai/Mistral-7B-v0.3",
64
- model_kwargs={"temperature": 0.5, "max_length": 4000},
65
- )
66
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
67
- conversation_chain = ConversationalRetrievalChain.from_llm(
68
- llm=llm, retriever=vectorstore.as_retriever(), memory=memory
69
- )
70
- except Exception as e:
71
- st.error(f"Error creating conversation chain: {e}")
72
- conversation_chain = None
73
  return conversation_chain
74
 
75
  def handle_userinput(user_question):
76
- if st.session_state.conversation is None:
77
- st.error("Please process the PDF files before asking a question.")
78
- return
79
-
80
- try:
81
- response = st.session_state.conversation({"question": user_question})
82
- st.session_state.chat_history = response["chat_history"]
83
-
84
- for i, message in enumerate(st.session_state.chat_history):
85
- if i % 2 == 0:
86
- st.write("//_^ User: " + message.content)
87
- else:
88
- st.write("🤖 ChatBot: " + message.content)
89
- except Exception as e:
90
- st.error(f"Error handling user input: {e}")
91
 
92
  def main():
93
- st.set_page_config(
94
- page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
95
- page_icon=":books:",
96
- )
97
-
98
- st.markdown("# Chat with a Bot")
99
- st.markdown("This bot tries to answer questions about multiple PDFs. Let the processing of the PDF finish before adding your question. 🙏🏾")
100
-
101
  st.write(css, unsafe_allow_html=True)
102
 
103
- huggingface_token = st.text_input("Enter your HuggingFace Hub token", type="password")
104
- #openai_api_key = st.text_input("Enter your OpenAI API key", type="password")
105
-
106
- if huggingface_token:
107
- os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
108
- #if openai_api_key:
109
- # os.environ["OPENAI_API_KEY"] = openai_api_key
110
-
111
  if "conversation" not in st.session_state:
112
  st.session_state.conversation = None
113
  if "chat_history" not in st.session_state:
114
  st.session_state.chat_history = None
115
 
116
- st.header("Chat with a Bot 🤖🦾 that tries to answer questions about multiple PDFs :books:")
117
  user_question = st.text_input("Ask a question about your documents:")
118
  if user_question:
119
  handle_userinput(user_question)
@@ -125,20 +113,10 @@ def main():
125
  )
126
  if st.button("Process"):
127
  with st.spinner("Processing"):
128
- try:
129
- # get pdf text
130
- raw_text = get_pdf_text(pdf_docs)
131
-
132
- # get the text chunks
133
- text_chunks = get_text_chunks(raw_text)
134
-
135
- # create vector store
136
- vectorstore = get_vectorstore(text_chunks)
137
-
138
- # create conversation chain
139
- st.session_state.conversation = get_conversation_chain(vectorstore)
140
- except Exception as e:
141
- st.error(f"Error processing PDF files: {e}")
142
 
143
  if __name__ == "__main__":
144
  main()
 
17
  from htmlTemplates import css, bot_template, user_template
18
  from langchain.llms import HuggingFaceHub
19
 
20
+ import os
21
+ import streamlit as st
22
+ from dotenv import load_dotenv
23
+ from PyPDF2 import PdfReader
24
+ from langchain.text_splitter import CharacterTextSplitter
25
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
26
+ from langchain.vectorstores import FAISS
27
+ from langchain.chat_models import ChatOpenAI
28
+ from langchain.memory import ConversationBufferMemory
29
+ from langchain.chains import ConversationalRetrievalChain
30
+ from htmlTemplates import css, bot_template, user_template
31
+ from langchain.llms import HuggingFaceHub
32
+ from langchain.chains import RetrievalQA
33
+
34
  def get_pdf_text(pdf_docs):
35
  text = ""
36
  for pdf in pdf_docs:
37
+ pdf_reader = PdfReader(pdf)
38
+ for page in pdf_reader.pages:
39
+ text += page.extract_text()
 
 
 
40
  return text
41
 
42
  def get_text_chunks(text):
43
  text_splitter = CharacterTextSplitter(
44
+ separator="\n",
45
+ chunk_size=1000,
46
+ chunk_overlap=200,
47
+ length_function=len
48
  )
49
+ chunks = text_splitter.split_text(text)
 
 
 
 
50
  return chunks
51
 
52
  def get_vectorstore(text_chunks):
53
  model = "BAAI/bge-base-en-v1.5"
54
+ encode_kwargs = {"normalize_embeddings": True}
55
+ embeddings = HuggingFaceBgeEmbeddings(
56
+ model_name=model,
57
+ encode_kwargs=encode_kwargs,
58
+ model_kwargs={"device": "cpu"}
59
+ )
60
+ vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
 
 
 
 
61
  return vectorstore
62
 
63
  def get_conversation_chain(vectorstore):
64
+ llm = HuggingFaceHub(
65
+ repo_id="mistralai/Mistral-7B-v0.3",
66
+ model_kwargs={"temperature": 0.5, "max_length": 4000},
67
+ )
68
 
69
+ memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
70
+ conversation_chain = ConversationalRetrievalChain.from_llm(
71
+ llm=llm,
72
+ retriever=vectorstore.as_retriever(),
73
+ memory=memory,
74
+ return_source_documents=True # Add this line to return source documents
75
+ )
 
 
 
 
 
76
  return conversation_chain
77
 
78
  def handle_userinput(user_question):
79
+ response = st.session_state.conversation({"question": user_question})
80
+ st.session_state.chat_history = response["chat_history"]
81
+
82
+ for i, message in enumerate(st.session_state.chat_history):
83
+ if i % 2 == 0:
84
+ st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
85
+ else:
86
+ st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
87
+
88
+ # Display references
89
+ if "source_documents" in response:
90
+ st.write("References:")
91
+ for doc in response["source_documents"]:
92
+ st.write(f"- {doc.metadata.get('source', 'Unknown source')}, page {doc.metadata.get('page', 'Unknown page')}")
 
93
 
94
  def main():
95
+ load_dotenv()
96
+ st.set_page_config(page_title="Chat with Multiple PDFs", page_icon=":books:")
 
 
 
 
 
 
97
  st.write(css, unsafe_allow_html=True)
98
 
 
 
 
 
 
 
 
 
99
  if "conversation" not in st.session_state:
100
  st.session_state.conversation = None
101
  if "chat_history" not in st.session_state:
102
  st.session_state.chat_history = None
103
 
104
+ st.header("Chat with Multiple PDFs :books:")
105
  user_question = st.text_input("Ask a question about your documents:")
106
  if user_question:
107
  handle_userinput(user_question)
 
113
  )
114
  if st.button("Process"):
115
  with st.spinner("Processing"):
116
+ raw_text = get_pdf_text(pdf_docs)
117
+ text_chunks = get_text_chunks(raw_text)
118
+ vectorstore = get_vectorstore(text_chunks)
119
+ st.session_state.conversation = get_conversation_chain(vectorstore)
 
 
 
 
 
 
 
 
 
 
120
 
121
  if __name__ == "__main__":
122
  main()