DrishtiSharma committed
Commit ade2e0f · verified · 1 Parent(s): 3ee5aea

Update app.py

Files changed (1)
  1. app.py +18 -4
app.py CHANGED
@@ -4,6 +4,15 @@ import re
 import shutil
 import time
 import streamlit as st
+import nltk
+
+nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
+nltk.data.path.append(nltk_data_path)
+
+if not os.path.exists(os.path.join(nltk_data_path, "tokenizers/punkt")):
+    print("Downloading NLTK 'punkt' resource...")
+    nltk.download("punkt", download_dir=nltk_data_path)
+
 sys.path.append(os.path.abspath("."))
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
@@ -32,7 +41,12 @@ check_poppler_installed()
 
 def load_docs(document_path):
     try:
-        loader = UnstructuredPDFLoader(document_path)
+        loader = UnstructuredPDFLoader(
+            document_path,
+            mode="elements",
+            strategy="fast",
+            ocr_languages=None  # Explicitly disable OCR
+        )
         documents = loader.load()
         text_splitter = NLTKTextSplitter(chunk_size=1000)
         return text_splitter.split_documents(documents)
@@ -54,11 +68,11 @@ def load_chain(file_name=None):
         embedding_function=HuggingFaceEmbeddings(),
     )
     if loaded_patent == file_name or already_indexed(vectordb, file_name):
-        st.write("Already indexed")
+        st.write("✅ Already indexed.")
     else:
         vectordb.delete_collection()
         docs = load_docs(file_name)
-        st.write("Length of Documents: ", len(docs))
+        st.write("🔍 Number of Documents: ", len(docs))
 
         vectordb = Chroma.from_documents(
             docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
@@ -128,7 +142,7 @@ if __name__ == "__main__":
         # Load the conversational chain
         st.write("🔄 Loading document into the system...")
         chain = load_chain(pdf_path)
-        st.success("Document successfully loaded! You can now start asking questions.")
+        st.success("🚀 Document successfully loaded! You can now start asking questions.")
 
         # Initialize the chat
         if "messages" not in st.session_state: