Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,15 @@ import re
|
|
4 |
import shutil
|
5 |
import time
|
6 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
sys.path.append(os.path.abspath("."))
|
8 |
from langchain.chains import ConversationalRetrievalChain
|
9 |
from langchain.memory import ConversationBufferMemory
|
@@ -32,7 +41,12 @@ check_poppler_installed()
|
|
32 |
|
33 |
def load_docs(document_path):
|
34 |
try:
|
35 |
-
loader = UnstructuredPDFLoader(
|
|
|
|
|
|
|
|
|
|
|
36 |
documents = loader.load()
|
37 |
text_splitter = NLTKTextSplitter(chunk_size=1000)
|
38 |
return text_splitter.split_documents(documents)
|
@@ -54,11 +68,11 @@ def load_chain(file_name=None):
|
|
54 |
embedding_function=HuggingFaceEmbeddings(),
|
55 |
)
|
56 |
if loaded_patent == file_name or already_indexed(vectordb, file_name):
|
57 |
-
st.write("Already indexed")
|
58 |
else:
|
59 |
vectordb.delete_collection()
|
60 |
docs = load_docs(file_name)
|
61 |
-
st.write("
|
62 |
|
63 |
vectordb = Chroma.from_documents(
|
64 |
docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
|
@@ -128,7 +142,7 @@ if __name__ == "__main__":
|
|
128 |
# Load the conversational chain
|
129 |
st.write("π Loading document into the system...")
|
130 |
chain = load_chain(pdf_path)
|
131 |
-
st.success("Document successfully loaded! You can now start asking questions.")
|
132 |
|
133 |
# Initialize the chat
|
134 |
if "messages" not in st.session_state:
|
|
|
4 |
import shutil
|
5 |
import time
|
6 |
import streamlit as st
|
7 |
+
import nltk
|
8 |
+
|
9 |
+
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
|
10 |
+
nltk.data.path.append(nltk_data_path)
|
11 |
+
|
12 |
+
if not os.path.exists(os.path.join(nltk_data_path, "tokenizers/punkt")):
|
13 |
+
print("Downloading NLTK 'punkt' resource...")
|
14 |
+
nltk.download("punkt", download_dir=nltk_data_path)
|
15 |
+
|
16 |
sys.path.append(os.path.abspath("."))
|
17 |
from langchain.chains import ConversationalRetrievalChain
|
18 |
from langchain.memory import ConversationBufferMemory
|
|
|
41 |
|
42 |
def load_docs(document_path):
|
43 |
try:
|
44 |
+
loader = UnstructuredPDFLoader(
|
45 |
+
document_path,
|
46 |
+
mode="elements",
|
47 |
+
strategy="fast",
|
48 |
+
ocr_languages=None # Explicitly disable OCR
|
49 |
+
)
|
50 |
documents = loader.load()
|
51 |
text_splitter = NLTKTextSplitter(chunk_size=1000)
|
52 |
return text_splitter.split_documents(documents)
|
|
|
68 |
embedding_function=HuggingFaceEmbeddings(),
|
69 |
)
|
70 |
if loaded_patent == file_name or already_indexed(vectordb, file_name):
|
71 |
+
st.write("✅ Already indexed.")
|
72 |
else:
|
73 |
vectordb.delete_collection()
|
74 |
docs = load_docs(file_name)
|
75 |
+
st.write("π Number of Documents: ", len(docs))
|
76 |
|
77 |
vectordb = Chroma.from_documents(
|
78 |
docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
|
|
|
142 |
# Load the conversational chain
|
143 |
st.write("π Loading document into the system...")
|
144 |
chain = load_chain(pdf_path)
|
145 |
+
st.success("π Document successfully loaded! You can now start asking questions.")
|
146 |
|
147 |
# Initialize the chat
|
148 |
if "messages" not in st.session_state:
|