captain-awesome committed on
Commit
bf1769e
·
1 Parent(s): 93f7525

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -20
app.py CHANGED
@@ -96,7 +96,7 @@ def load_model():
96
  return llm
97
 
98
 
99
- def create_vector_database():
100
  # DB_DIR: str = os.path.join(ABS_PATH, "db")
101
  """
102
  Creates a vector database using document loaders and embeddings.
@@ -107,27 +107,50 @@ def create_vector_database():
107
 
108
  """
109
  # Initialize loaders for different file types
110
- pdf_loader = DirectoryLoader("data/", glob="**/*.pdf", loader_cls=PyPDFLoader)
111
- markdown_loader = DirectoryLoader("data/", glob="**/*.md", loader_cls=UnstructuredMarkdownLoader)
112
- text_loader = DirectoryLoader("data/", glob="**/*.txt", loader_cls=TextLoader)
113
- csv_loader = DirectoryLoader("data/", glob="**/*.csv", loader_cls=CSVLoader)
114
- python_loader = DirectoryLoader("data/", glob="**/*.py", loader_cls=PythonLoader)
115
- epub_loader = DirectoryLoader("data/", glob="**/*.epub", loader_cls=UnstructuredEPubLoader)
116
- html_loader = DirectoryLoader("data/", glob="**/*.html", loader_cls=UnstructuredHTMLLoader)
117
- ppt_loader = DirectoryLoader("data/", glob="**/*.ppt", loader_cls=UnstructuredPowerPointLoader)
118
- pptx_loader = DirectoryLoader("data/", glob="**/*.pptx", loader_cls=UnstructuredPowerPointLoader)
119
- doc_loader = DirectoryLoader("data/", glob="**/*.doc", loader_cls=UnstructuredWordDocumentLoader)
120
- docx_loader = DirectoryLoader("data/", glob="**/*.docx", loader_cls=UnstructuredWordDocumentLoader)
121
- odt_loader = DirectoryLoader("data/", glob="**/*.odt", loader_cls=UnstructuredODTLoader)
122
- notebook_loader = DirectoryLoader("data/", glob="**/*.ipynb", loader_cls=NotebookLoader)
123
-
124
-
125
- all_loaders = [pdf_loader, markdown_loader, text_loader, csv_loader, python_loader, epub_loader, html_loader, ppt_loader, pptx_loader, doc_loader, docx_loader, odt_loader, notebook_loader]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  # Load documents from all loaders
128
- loaded_documents = []
129
- for loader in all_loaders:
130
- loaded_documents.extend(loader.load())
131
 
132
  # Split loaded documents into chunks
133
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)
 
96
  return llm
97
 
98
 
99
+ def create_vector_database(uploaded_files):
100
  # DB_DIR: str = os.path.join(ABS_PATH, "db")
101
  """
102
  Creates a vector database using document loaders and embeddings.
 
107
 
108
  """
109
  # Initialize loaders for different file types
110
+ loaders = {
111
+ "pdf": PyPDFLoader,
112
+ "md": UnstructuredMarkdownLoader,
113
+ "txt": TextLoader,
114
+ "csv": CSVLoader,
115
+ "py": PythonLoader,
116
+ "epub": UnstructuredEPubLoader,
117
+ "html": UnstructuredHTMLLoader,
118
+ "ppt": UnstructuredPowerPointLoader,
119
+ "pptx": UnstructuredPowerPointLoader,
120
+ "doc": UnstructuredWordDocumentLoader,
121
+ "docx": UnstructuredWordDocumentLoader,
122
+ "odt": UnstructuredODTLoader,
123
+ "ipynb": NotebookLoader
124
+ }
125
+ # pdf_loader = DirectoryLoader("data/", glob="**/*.pdf", loader_cls=PyPDFLoader)
126
+ # markdown_loader = DirectoryLoader("data/", glob="**/*.md", loader_cls=UnstructuredMarkdownLoader)
127
+ # text_loader = DirectoryLoader("data/", glob="**/*.txt", loader_cls=TextLoader)
128
+ # csv_loader = DirectoryLoader("data/", glob="**/*.csv", loader_cls=CSVLoader)
129
+ # python_loader = DirectoryLoader("data/", glob="**/*.py", loader_cls=PythonLoader)
130
+ # epub_loader = DirectoryLoader("data/", glob="**/*.epub", loader_cls=UnstructuredEPubLoader)
131
+ # html_loader = DirectoryLoader("data/", glob="**/*.html", loader_cls=UnstructuredHTMLLoader)
132
+ # ppt_loader = DirectoryLoader("data/", glob="**/*.ppt", loader_cls=UnstructuredPowerPointLoader)
133
+ # pptx_loader = DirectoryLoader("data/", glob="**/*.pptx", loader_cls=UnstructuredPowerPointLoader)
134
+ # doc_loader = DirectoryLoader("data/", glob="**/*.doc", loader_cls=UnstructuredWordDocumentLoader)
135
+ # docx_loader = DirectoryLoader("data/", glob="**/*.docx", loader_cls=UnstructuredWordDocumentLoader)
136
+ # odt_loader = DirectoryLoader("data/", glob="**/*.odt", loader_cls=UnstructuredODTLoader)
137
+ # notebook_loader = DirectoryLoader("data/", glob="**/*.ipynb", loader_cls=NotebookLoader)
138
+
139
+ # Load documents from uploaded files using the appropriate loaders
140
+ loaded_documents = []
141
+ for uploaded_file in uploaded_files:
142
+ # file_extension = os.path.splitext(uploaded_file.name)[-1].lower()[1:]
143
+ file_extension = os.path.splitext(uploaded_file.name)[-1][1:].lower()
144
+ if file_extension in loaders:
145
+ loader_cls = loaders[file_extension]
146
+ loader = loader_cls()
147
+ loaded_documents.extend(loader.load(uploaded_file))
148
+
149
+ # all_loaders = [pdf_loader, markdown_loader, text_loader, csv_loader, python_loader, epub_loader, html_loader, ppt_loader, pptx_loader, doc_loader, docx_loader, odt_loader, notebook_loader]
150
 
151
  # Load documents from all loaders
152
+ # for loader in all_loaders:
153
+ # loaded_documents.extend(loader.load())
 
154
 
155
  # Split loaded documents into chunks
156
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=40)