Spaces:
Runtime error
Commit
·
bf1769e
1
Parent(s):
93f7525
Update app.py
Browse files
app.py
CHANGED
@@ -96,7 +96,7 @@ def load_model():
|
|
96 |
return llm
|
97 |
|
98 |
|
99 |
-
def create_vector_database():
|
100 |
# DB_DIR: str = os.path.join(ABS_PATH, "db")
|
101 |
"""
|
102 |
Creates a vector database using document loaders and embeddings.
|
@@ -107,27 +107,50 @@ def create_vector_database():
|
|
107 |
|
108 |
"""
|
109 |
# Initialize loaders for different file types
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
# Load documents from all loaders
|
128 |
-
|
129 |
-
|
130 |
-
loaded_documents.extend(loader.load())
|
131 |
|
132 |
# Split loaded documents into chunks
|
133 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)
|
|
|
96 |
return llm
|
97 |
|
98 |
|
99 |
+
def create_vector_database(uploaded_files):
|
100 |
# DB_DIR: str = os.path.join(ABS_PATH, "db")
|
101 |
"""
|
102 |
Creates a vector database using document loaders and embeddings.
|
|
|
107 |
|
108 |
"""
|
109 |
# Initialize loaders for different file types
|
110 |
+
loaders = {
|
111 |
+
"pdf": PyPDFLoader,
|
112 |
+
"md": UnstructuredMarkdownLoader,
|
113 |
+
"txt": TextLoader,
|
114 |
+
"csv": CSVLoader,
|
115 |
+
"py": PythonLoader,
|
116 |
+
"epub": UnstructuredEPubLoader,
|
117 |
+
"html": UnstructuredHTMLLoader,
|
118 |
+
"ppt": UnstructuredPowerPointLoader,
|
119 |
+
"pptx": UnstructuredPowerPointLoader,
|
120 |
+
"doc": UnstructuredWordDocumentLoader,
|
121 |
+
"docx": UnstructuredWordDocumentLoader,
|
122 |
+
"odt": UnstructuredODTLoader,
|
123 |
+
"ipynb": NotebookLoader
|
124 |
+
}
|
125 |
+
# pdf_loader = DirectoryLoader("data/", glob="**/*.pdf", loader_cls=PyPDFLoader)
|
126 |
+
# markdown_loader = DirectoryLoader("data/", glob="**/*.md", loader_cls=UnstructuredMarkdownLoader)
|
127 |
+
# text_loader = DirectoryLoader("data/", glob="**/*.txt", loader_cls=TextLoader)
|
128 |
+
# csv_loader = DirectoryLoader("data/", glob="**/*.csv", loader_cls=CSVLoader)
|
129 |
+
# python_loader = DirectoryLoader("data/", glob="**/*.py", loader_cls=PythonLoader)
|
130 |
+
# epub_loader = DirectoryLoader("data/", glob="**/*.epub", loader_cls=UnstructuredEPubLoader)
|
131 |
+
# html_loader = DirectoryLoader("data/", glob="**/*.html", loader_cls=UnstructuredHTMLLoader)
|
132 |
+
# ppt_loader = DirectoryLoader("data/", glob="**/*.ppt", loader_cls=UnstructuredPowerPointLoader)
|
133 |
+
# pptx_loader = DirectoryLoader("data/", glob="**/*.pptx", loader_cls=UnstructuredPowerPointLoader)
|
134 |
+
# doc_loader = DirectoryLoader("data/", glob="**/*.doc", loader_cls=UnstructuredWordDocumentLoader)
|
135 |
+
# docx_loader = DirectoryLoader("data/", glob="**/*.docx", loader_cls=UnstructuredWordDocumentLoader)
|
136 |
+
# odt_loader = DirectoryLoader("data/", glob="**/*.odt", loader_cls=UnstructuredODTLoader)
|
137 |
+
# notebook_loader = DirectoryLoader("data/", glob="**/*.ipynb", loader_cls=NotebookLoader)
|
138 |
+
|
139 |
+
# Load documents from uploaded files using the appropriate loaders
|
140 |
+
loaded_documents = []
|
141 |
+
for uploaded_file in uploaded_files:
|
142 |
+
# file_extension = os.path.splitext(uploaded_file.name)[-1].lower()[1:]
|
143 |
+
file_extension = os.path.splitext(uploaded_file.name)[-1][1:].lower()
|
144 |
+
if file_extension in loaders:
|
145 |
+
loader_cls = loaders[file_extension]
|
146 |
+
loader = loader_cls()
|
147 |
+
loaded_documents.extend(loader.load(uploaded_file))
|
148 |
+
|
149 |
+
# all_loaders = [pdf_loader, markdown_loader, text_loader, csv_loader, python_loader, epub_loader, html_loader, ppt_loader, pptx_loader, doc_loader, docx_loader, odt_loader, notebook_loader]
|
150 |
|
151 |
# Load documents from all loaders
|
152 |
+
# for loader in all_loaders:
|
153 |
+
# loaded_documents.extend(loader.load())
|
|
|
154 |
|
155 |
# Split loaded documents into chunks
|
156 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)
|