Spaces:

DrishtiSharma
/

chat-w-google-patents

Running

App Files Files Community

DrishtiSharma committed on Dec 19, 2024

Commit

ade2e0f

verified ·

1 Parent(s): 3ee5aea

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -4

app.py CHANGED Viewed

@@ -4,6 +4,15 @@ import re
 import shutil
 import time
 import streamlit as st
 sys.path.append(os.path.abspath("."))
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
@@ -32,7 +41,12 @@ check_poppler_installed()
 def load_docs(document_path):
     try:
-        loader = UnstructuredPDFLoader(document_path)
         documents = loader.load()
         text_splitter = NLTKTextSplitter(chunk_size=1000)
         return text_splitter.split_documents(documents)
@@ -54,11 +68,11 @@ def load_chain(file_name=None):
         embedding_function=HuggingFaceEmbeddings(),
     )
     if loaded_patent == file_name or already_indexed(vectordb, file_name):
-        st.write("Already indexed")
     else:
         vectordb.delete_collection()
         docs = load_docs(file_name)
-        st.write("Length of Documents: ", len(docs))
         vectordb = Chroma.from_documents(
             docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
@@ -128,7 +142,7 @@ if __name__ == "__main__":
     # Load the conversational chain
     st.write("🔄 Loading document into the system...")
     chain = load_chain(pdf_path)
-    st.success("Document successfully loaded! You can now start asking questions.")
     # Initialize the chat
     if "messages" not in st.session_state:

 import shutil
 import time
 import streamlit as st
+import nltk
+nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
+nltk.data.path.append(nltk_data_path)
+if not os.path.exists(os.path.join(nltk_data_path, "tokenizers/punkt")):
+    print("Downloading NLTK 'punkt' resource...")
+    nltk.download("punkt", download_dir=nltk_data_path)
 sys.path.append(os.path.abspath("."))
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
 def load_docs(document_path):
     try:
+        loader = UnstructuredPDFLoader(
+            document_path,
+            mode="elements",
+            strategy="fast",
+            ocr_languages=None  # Explicitly disable OCR
+        )
         documents = loader.load()
         text_splitter = NLTKTextSplitter(chunk_size=1000)
         return text_splitter.split_documents(documents)
         embedding_function=HuggingFaceEmbeddings(),
     )
     if loaded_patent == file_name or already_indexed(vectordb, file_name):
+        st.write("✅ Already indexed.")
     else:
         vectordb.delete_collection()
         docs = load_docs(file_name)
+        st.write("🔍 Number of Documents: ", len(docs))
         vectordb = Chroma.from_documents(
             docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
     # Load the conversational chain
     st.write("🔄 Loading document into the system...")
     chain = load_chain(pdf_path)
+    st.success("🚀 Document successfully loaded! You can now start asking questions.")
     # Initialize the chat
     if "messages" not in st.session_state: