jameszokah committed on
Commit
e6bb875
·
verified ·
1 Parent(s): 17ec764

Update in_memory.py

Browse files
Files changed (1) hide show
  1. in_memory.py +105 -104
in_memory.py CHANGED
@@ -1,104 +1,105 @@
1
- from langchain_community.document_loaders.parsers.pdf import PyPDFParser
2
- from langchain_community.document_loaders.generic import GenericLoader
3
- from langchain_core.document_loaders.blob_loaders import BlobLoader
4
- from io import BytesIO
5
- from starlette.datastructures import UploadFile
6
- from typing import List, Iterable, ByteString
7
- # Ensure this is the correct path for your custom loader
8
- from custon_generic_loader import CustomGenericLoader
9
- from langchain_core.documents import Document
10
- from langchain_community.document_loaders.blob_loaders.schema import Blob
11
- from parser.msword_parser import MsWordParser
12
- from parser.pptx_parser import PptxParser
13
- from parser.xlsx_parser import XlsxParser
14
- from parser.txt_parser import TxtParser
15
- from parser.audio_parser import AudioParser
16
- from parser.video_parser import VideoParser
17
-
18
-
19
class InMemoryBlobLoader(BlobLoader):
    """Blob loader that serves the content of a single ``UploadFile``.

    The upload is read when ``yield_blobs`` is iterated and is exposed as
    exactly one ``Blob``.
    """

    def __init__(self, upload_file: UploadFile):
        # Only the handle is stored; the bytes are read in yield_blobs().
        self.upload_file = upload_file

    async def yield_blobs(self) -> Iterable[ByteString]:
        """Read the whole upload and yield it as one ``Blob``.

        The blob's MIME type is the upload's ``content_type``. Its metadata
        holds the filename under both ``name`` and ``source``, plus ``size``.
        """
        # NOTE(review): this is an async generator that yields a Blob, so the
        # Iterable[ByteString] annotation does not describe it. Confirm what
        # CustomGenericLoader expects before tightening the annotation.
        upload = self.upload_file
        payload = await upload.read()
        blob_metadata = {
            'name': upload.filename,
            'size': upload.size,
            'source': upload.filename,
        }
        yield Blob.from_data(
            payload,
            mime_type=upload.content_type,
            metadata=blob_metadata,
        )
30
-
31
-
32
def _select_parser(content_type):
    """Return ``(parser_instance, label)`` for a MIME type, or ``None``.

    ``label`` is the human-readable name used in the "Loading ..." log line.
    ``None`` is returned for a missing/empty content type and for any MIME
    type that has no parser registered here.
    """
    # A missing Content-Type cannot be dispatched on; report it as unsupported
    # instead of letting ``None.startswith`` raise AttributeError further down.
    if not content_type:
        return None

    word = (MsWordParser, 'Word Document')
    powerpoint = (PptxParser, 'PowerPoint')
    excel = (XlsxParser, 'Excel File')

    # Exact MIME type -> (parser class, label).
    exact_matches = {
        'application/pdf': (PyPDFParser, 'PDF'),
        'application/msword': word,
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': word,
        'application/vnd.ms-powerpoint': powerpoint,
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': powerpoint,
        'text/plain': (TxtParser, 'Text File'),
        # Suggested code may be subject to a license. Learn more: ~LicenseLog:3330720155.
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': excel,
        'application/vnd.ms-excel': excel,
    }
    # Whole MIME families, matched by prefix.
    prefix_matches = (
        ('audio/', AudioParser, 'Audio File'),
        ('video/', VideoParser, 'Video File'),
    )

    selected = exact_matches.get(content_type)
    if selected is None:
        for prefix, parser_cls, label in prefix_matches:
            if content_type.startswith(prefix):
                selected = (parser_cls, label)
                break
    if selected is None:
        return None

    # Instantiate only the parser that was actually chosen.
    parser_cls, label = selected
    return parser_cls(), label


async def load_document(upload_file: UploadFile) -> List[Document]:
    """Parse one uploaded file with the parser matching its content type.

    Args:
        upload_file: The upload to parse. Its ``content_type`` selects the
            parser and its ``filename`` is used in log and error messages.

    Returns:
        A one-element list wrapping the value returned by
        ``CustomGenericLoader.load_all()``. This shape is kept because
        ``load_all_documents`` extends its result with it.
        NOTE(review): ``load_all()`` presumably returns a list of
        ``Document`` objects, which would make the real return type
        ``List[List[Document]]`` -- confirm against CustomGenericLoader.

    Raises:
        ValueError: If the content type is missing or unsupported, or if the
            loader produced nothing for this file.
    """
    selection = _select_parser(upload_file.content_type)
    if selection is None:
        raise ValueError(f"Unsupported file type: {upload_file.content_type}")
    blob_parser, label = selection
    print(f'Loading {label}: {upload_file.filename}')

    blob_loader = InMemoryBlobLoader(upload_file)
    loader = CustomGenericLoader(blob_loader, blob_parser)
    loaded = await loader.load_all()

    # Check the loader's own result: a wrapper list that always holds one
    # element can never be empty, so testing it would never fire.
    if not loaded:
        raise ValueError(
            f"No documents were loaded for file: {upload_file.filename}")

    return [loaded]
84
-
85
-
86
async def load_all_documents(upload_files: List[UploadFile]) -> List[List[Document]]:
    """Load every upload in turn and gather the results in one list.

    Files for which ``load_document`` raises ``ValueError`` are reported on
    stdout and skipped; any other exception propagates to the caller.

    Raises:
        ValueError: If nothing was collected from any of the files.
    """
    collected = []
    for current in upload_files:
        try:
            loaded = await load_document(current)
        except ValueError as err:
            print(f"Error loading {current.filename}: {err}")
            continue
        collected += loaded

    if len(collected) == 0:
        raise ValueError("No documents were loaded from the provided files.")

    return collected
99
-
100
- # Example usage:
101
- # Note: You would typically run this inside an async function or an async event loop.
102
- # Example:
103
- # upload_files = [UploadFile1, UploadFile2, ...]
104
- # documents = await load_all_documents(upload_files)
 
 
1
+ from langchain_community.document_loaders.parsers.pdf import PyPDFParser
2
+ from langchain_community.document_loaders.generic import GenericLoader
3
+ from langchain_core.document_loaders.blob_loaders import BlobLoader
4
+ from io import BytesIO
5
+ from starlette.datastructures import UploadFile
6
+ from typing import List, Iterable, ByteString
7
+ # Ensure this is the correct path for your custom loader
8
+ from custon_generic_loader import CustomGenericLoader
9
+ from langchain_core.documents import Document
10
+ from langchain_community.document_loaders.blob_loaders.schema import Blob
11
+ from parser.msword_parser import MsWordParser
12
+ from parser.pptx_parser import PptxParser
13
+ from parser.xlsx_parser import XlsxParser
14
+ from parser.txt_parser import TxtParser
15
+ from parser.audio_parser import AudioParser
16
+ from parser.video_parser import VideoParser
17
+
18
+
19
class InMemoryBlobLoader(BlobLoader):
    """Blob loader that serves the content of a single ``UploadFile``.

    The upload is read when ``yield_blobs`` is iterated and is exposed as
    exactly one ``Blob``.
    """

    def __init__(self, upload_file: UploadFile):
        # Only the handle is stored; the bytes are read in yield_blobs().
        self.upload_file = upload_file

    async def yield_blobs(self) -> Iterable[ByteString]:
        """Read the whole upload and yield it as one ``Blob``.

        The blob's MIME type is the upload's ``content_type``. Its metadata
        holds the filename under both ``name`` and ``source``, plus ``size``.
        """
        # NOTE(review): this is an async generator that yields a Blob, so the
        # Iterable[ByteString] annotation does not describe it. Confirm what
        # CustomGenericLoader expects before tightening the annotation.
        upload = self.upload_file
        payload = await upload.read()
        blob_metadata = {
            'name': upload.filename,
            'size': upload.size,
            'source': upload.filename,
        }
        yield Blob.from_data(
            payload,
            mime_type=upload.content_type,
            metadata=blob_metadata,
        )
30
+
31
+
32
def _select_parser(content_type):
    """Return ``(parser_instance, label)`` for a MIME type, or ``None``.

    ``label`` is the human-readable name used in the "Loading ..." log line.
    ``None`` is returned for a missing/empty content type and for any MIME
    type that has no parser registered here.
    """
    # A missing Content-Type cannot be dispatched on; report it as unsupported
    # instead of letting ``None.startswith`` raise AttributeError further down.
    if not content_type:
        return None

    word = (MsWordParser, 'Word Document')
    powerpoint = (PptxParser, 'PowerPoint')
    excel = (XlsxParser, 'Excel File')

    # Exact MIME type -> (parser class, label).
    exact_matches = {
        'application/pdf': (PyPDFParser, 'PDF'),
        'application/msword': word,
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': word,
        'application/vnd.openxmlformats-officedocument.themeManager+xml': word,
        'application/vnd.ms-powerpoint': powerpoint,
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': powerpoint,
        'text/plain': (TxtParser, 'Text File'),
        # Suggested code may be subject to a license. Learn more: ~LicenseLog:3330720155.
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': excel,
        'application/vnd.ms-excel': excel,
    }
    # Whole MIME families, matched by prefix.
    prefix_matches = (
        ('audio/', AudioParser, 'Audio File'),
        ('video/', VideoParser, 'Video File'),
    )

    selected = exact_matches.get(content_type)
    if selected is None:
        for prefix, parser_cls, label in prefix_matches:
            if content_type.startswith(prefix):
                selected = (parser_cls, label)
                break
    if selected is None:
        return None

    # Instantiate only the parser that was actually chosen.
    parser_cls, label = selected
    return parser_cls(), label


async def load_document(upload_file: UploadFile) -> List[Document]:
    """Parse one uploaded file with the parser matching its content type.

    Args:
        upload_file: The upload to parse. Its ``content_type`` selects the
            parser and its ``filename`` is used in log and error messages.

    Returns:
        A one-element list wrapping the value returned by
        ``CustomGenericLoader.load_all()``. This shape is kept because
        ``load_all_documents`` extends its result with it.
        NOTE(review): ``load_all()`` presumably returns a list of
        ``Document`` objects, which would make the real return type
        ``List[List[Document]]`` -- confirm against CustomGenericLoader.

    Raises:
        ValueError: If the content type is missing or unsupported, or if the
            loader produced nothing for this file.
    """
    selection = _select_parser(upload_file.content_type)
    if selection is None:
        raise ValueError(f"Unsupported file type: {upload_file.content_type}")
    blob_parser, label = selection
    print(f'Loading {label}: {upload_file.filename}')

    blob_loader = InMemoryBlobLoader(upload_file)
    loader = CustomGenericLoader(blob_loader, blob_parser)
    loaded = await loader.load_all()

    # Check the loader's own result: a wrapper list that always holds one
    # element can never be empty, so testing it would never fire.
    if not loaded:
        raise ValueError(
            f"No documents were loaded for file: {upload_file.filename}")

    return [loaded]
85
+
86
+
87
async def load_all_documents(upload_files: List[UploadFile]) -> List[List[Document]]:
    """Load every upload in turn and gather the results in one list.

    Files for which ``load_document`` raises ``ValueError`` are reported on
    stdout and skipped; any other exception propagates to the caller.

    Raises:
        ValueError: If nothing was collected from any of the files.
    """
    collected = []
    for current in upload_files:
        try:
            loaded = await load_document(current)
        except ValueError as err:
            print(f"Error loading {current.filename}: {err}")
            continue
        collected += loaded

    if len(collected) == 0:
        raise ValueError("No documents were loaded from the provided files.")

    return collected
100
+
101
+ # Example usage:
102
+ # Note: You would typically run this inside an async function or an async event loop.
103
+ # Example:
104
+ # upload_files = [UploadFile1, UploadFile2, ...]
105
+ # documents = await load_all_documents(upload_files)