Update interim/app.py
Browse files- interim/app.py +11 -1
interim/app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import sys
|
2 |
import os
|
3 |
import re
|
@@ -61,7 +62,16 @@ def load_docs(document_path):
|
|
61 |
)
|
62 |
documents = loader.load()
|
63 |
text_splitter = NLTKTextSplitter(chunk_size=1000)
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
except Exception as e:
|
66 |
st.error(f"Failed to load and process PDF: {e}")
|
67 |
st.stop()
|
|
|
1 |
+
# to-do: Enable downloading multiple patent PDFs via corresponding links
|
2 |
import sys
|
3 |
import os
|
4 |
import re
|
|
|
62 |
)
|
63 |
documents = loader.load()
|
64 |
text_splitter = NLTKTextSplitter(chunk_size=1000)
|
65 |
+
split_docs = text_splitter.split_documents(documents)
|
66 |
+
|
67 |
+
# Filter metadata to only include str, int, float, or bool
|
68 |
+
for doc in split_docs:
|
69 |
+
if hasattr(doc, "metadata") and isinstance(doc.metadata, dict):
|
70 |
+
doc.metadata = {
|
71 |
+
k: v for k, v in doc.metadata.items()
|
72 |
+
if isinstance(v, (str, int, float, bool))
|
73 |
+
}
|
74 |
+
return split_docs
|
75 |
except Exception as e:
|
76 |
st.error(f"Failed to load and process PDF: {e}")
|
77 |
st.stop()
|