gavinzli committed
Commit 7a6b7b4
Parent: af61c79

Add file upload routes and implement PDF processing functionality

app/controllers/loader.py ADDED
@@ -0,0 +1,245 @@
+ """Module to extract text from PDF files and images using Azure OpenAI's GPT-4o-mini model."""
+ import base64
+ import hashlib
+ import json
+ import os
+ from io import BytesIO
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import Docx2txtLoader
+ from langchain_core.documents import Document
+ from pdf2image import convert_from_path
+ from pydantic import BaseModel
+ from pypdf import PdfReader
+
+ from models.llm import client
+ from models.db import vectorstore
+
+ text_splitter = RecursiveCharacterTextSplitter()
+
+ class ExtractionResult(BaseModel):
+     """
+     ExtractionResult is a data model that represents the result of an extraction process.
+
+     Attributes:
+         content (str): The extracted content as a string.
+     """
+     content: str
+
+ def check_image(page):
+     """
+     Checks if a given PDF page contains any images.
+
+     This function examines the /Resources dictionary of the provided PDF page
+     to determine if it contains any XObjects of subtype /Image.
+
+     Args:
+         page: A dictionary-like object representing a PDF page.
+
+     Returns:
+         bool: True if the page contains at least one image, False otherwise.
+     """
+     # Get the /Resources dictionary
+     resources = page.get("/Resources")
+     if resources is None:
+         return False
+     # Check for /XObject in resources; get_object() resolves indirect
+     # references and is a no-op on direct objects
+     xobjects = resources.get_object().get("/XObject")
+     if xobjects is None:
+         return False
+     # Iterate through the XObjects looking for one of subtype /Image
+     for obj in xobjects.get_object().values():
+         if obj.get_object().get("/Subtype") == "/Image":
+             return True
+     return False
+
+ def extract_text_from_image(image):
+     """
+     Extracts the plain-text content from an image of a document page using GPT-4o-mini.
+
+     Args:
+         image (PIL.Image.Image): The image object representing the document page.
+
+     Returns:
+         str: The extracted plain-text content of the page.
+
+     Raises:
+         Exception: If the response from the AI model is invalid or cannot be parsed.
+     """
+     # Encode the page image as Base64 so it can be sent inline in the request
+     buffer = BytesIO()
+     image.save(buffer, format="PNG")
+     base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+     prompt = """
+     You are an AI assistant that extracts data from documents and returns it as structured JSON.
+     Analyze the provided image of a document page and extract the following:
+     - Content of the page (plain text)
+     """
+     response = client.beta.chat.completions.parse(
+         model="gpt-4o-mini",
+         response_format=ExtractionResult,
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": prompt},
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/png;base64,{base64_image}"}
+                     }
+                 ]
+             }
+         ]
+     )
+     # The parse helper returns the structured ExtractionResult instance directly
+     return response.choices[0].message.parsed.content
+
+ def load_pdf(content: bytes, filename: str):
+     """
+     Extracts text from an uploaded PDF, page by page.
+
+     The raw bytes are written to a temporary file, then each page is processed:
+     pages that contain images are rendered and passed through the vision model,
+     while text-only pages are extracted directly with pypdf.
+
+     Args:
+         content (bytes): The raw bytes of the uploaded PDF file.
+         filename (str): The original filename, stored in each page's metadata.
+
+     Returns:
+         list: A list of serialized Document dicts, each holding the page content
+         and metadata (filename and page number).
+     """
+     documents = []
+     path = os.path.join("/tmp", filename)
+     with open(path, "wb") as f:
+         f.write(content)
+     try:
+         pdf = PdfReader(path)
+         for page_num, page in enumerate(pdf.pages):
+             if check_image(page):
+                 # Render just this page and extract its text with the vision model
+                 images = convert_from_path(
+                     path, first_page=page_num + 1, last_page=page_num + 1)
+                 text = extract_text_from_image(images[0])
+             else:
+                 text = page.extract_text()
+             doc = Document(
+                 page_content=text,
+                 metadata={"source": filename, "page": page_num + 1})
+             documents.append(doc.model_dump())
+         return documents
+     except (FileNotFoundError, ValueError, OSError) as e:
+         print(f"Error: {str(e)}")
+         return documents
+     finally:
+         # Remove the temporary file whether or not processing succeeded
+         os.remove(path)
+
+ def load_jsonl(directory):
+     """
+     Reads the JSONL files in a directory and converts their content into Document objects.
+
+     Args:
+         directory (str): Path to the directory containing JSONL files.
+
+     Returns:
+         list: A list of Document objects.
+     """
+     documents = []
+     for filename in os.listdir(directory):
+         if filename.endswith(".jsonl"):
+             file_path = os.path.join(directory, filename)
+             with open(file_path, "r", encoding="utf-8") as file:
+                 for line in file:
+                     # Parse each line as JSON
+                     json_obj = json.loads(line.strip())
+                     metadata = {
+                         "id": json_obj.get("id", ""),
+                         "url": json_obj.get("url", ""),
+                         "title": json_obj.get("title", ""),
+                         "ts": json_obj.get("ts", "")
+                     }
+                     # HTML records carry their text Base64-encoded
+                     if json_obj.get("mime") == "text/html":
+                         text = base64.urlsafe_b64decode(json_obj.get("text", "")).decode("utf-8")
+                     else:
+                         text = json_obj.get("text", "")
+                     documents.append(Document(page_content=text, metadata=metadata))
+     return documents
+
+ def load_docx(directory):
+     """
+     Loads and processes all .docx files from a specified directory.
+
+     This function iterates through the files in the given directory, identifies
+     files with a .docx extension, and uses the Docx2txtLoader to load and extract
+     their contents. The extracted contents are aggregated into a single list and
+     uploaded to the vector store.
+
+     Args:
+         directory (str): The path to the directory containing .docx files.
+
+     Returns:
+         list: A list containing the contents of all loaded .docx files.
+     """
+     documents = []
+     for filename in os.listdir(directory):
+         if filename.endswith(".docx"):
+             documents.extend(Docx2txtLoader(file_path=os.path.join(directory, filename)).load())
+     upload(documents)
+     return documents
+
+ def upload(docs):
+     """
+     Processes a list of documents, splits them into smaller chunks, updates their metadata,
+     generates unique IDs for each chunk, and adds them to the vector store.
+
+     Args:
+         docs (list): A list of document objects to be processed.
+
+     Metadata Processing:
+         - Extracts and updates the "page" metadata if "page_label" exists.
+         - Updates the "attachment" metadata by removing the "{FOLDER}/" prefix from the "source".
+         - Filters metadata to retain only the "attachment" and "page" keys.
+         - Generates a unique "id" for each document based on the "attachment" metadata.
+         - Constructs unique IDs for each chunk, incorporating "id", "page", and chunk index.
+
+     Raises:
+         KeyError: If required metadata keys are missing during processing.
+     """
+     documents = []
+     ids = []
+     for doc in docs:
+         for index, document in enumerate(text_splitter.split_documents([doc])):
+             if "page_label" in document.metadata:
+                 document.metadata["page"] = int(document.metadata["page_label"])
+             document.metadata["attachment"] = document.metadata["source"].replace("{FOLDER}/", "")
+             # Keep only the metadata keys the vector store needs
+             document.metadata = {
+                 key: value
+                 for key, value in document.metadata.items()
+                 if key in ["attachment", "page"]
+             }
+             document.metadata["id"] = hashlib.sha256(
+                 document.metadata["attachment"].encode()).hexdigest()
+             # Chunk IDs must be unique: combine document hash, page, and chunk index
+             if "page" in document.metadata:
+                 ids.append(f"{document.metadata['id']}-{document.metadata['page']}-{index}")
+             else:
+                 ids.append(f"{document.metadata['id']}-{index}")
+             documents.append(document)
+     vectorstore.add_documents(documents=documents, ids=ids)
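
A minimal sketch of how the new loader can be exercised outside the request path, assuming the Azure OpenAI credentials that `client` needs are already configured; the filename is hypothetical:

    from controllers.loader import load_pdf

    # Run a local PDF through the new extraction path and inspect the result.
    with open("sample.pdf", "rb") as f:  # hypothetical test file
        pages = load_pdf(f.read(), "sample.pdf")
    for page in pages:
        print(page["metadata"], page["page_content"][:80])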
app/main.py CHANGED
@@ -4,7 +4,7 @@ import logging
  from fastapi import FastAPI, Request
  from fastapi.middleware.cors import CORSMiddleware
  from jose import jwt
- from router import auth, content, service
+ from router import auth, content, service, file
  from starlette.middleware.base import BaseHTTPMiddleware

  SECRET_KEY = "your-secret-key"
@@ -67,6 +67,7 @@ app = FastAPI(docs_url="/")
  app.include_router(content.router)
  app.include_router(service.router)
  app.include_router(auth.router)
+ app.include_router(file.router)

  origins = [
      "*"
app/models/__init__.py ADDED
File without changes
app/models/llm/__init__.py CHANGED
@@ -2,8 +2,10 @@
  from typing import List
  from langchain.embeddings.base import Embeddings
  from sentence_transformers import SentenceTransformer
+ from openai import AzureOpenAI
  from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

+
  class GPTModel(AzureChatOpenAI):
      """
      GPTModel class that extends AzureChatOpenAI.
@@ -74,3 +76,5 @@ class EmbeddingsModel(Embeddings):
          List[float]: The embedded representation of the query as a list of floats.
      """
      return self.model.encode([query]).tolist()[0]
+
+ client = AzureOpenAI()
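
The new module-level `client = AzureOpenAI()` takes no arguments, so the openai SDK reads its configuration from the environment. A sketch of the equivalent explicit construction (the version string is an example, not a value from this commit):

    import os
    from openai import AzureOpenAI

    client = AzureOpenAI(
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        api_version="2024-08-01-preview",  # example API version
    )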
app/router/file.py ADDED
@@ -0,0 +1,36 @@
+ """Module for defining the file upload routes of the API."""
+ from pathlib import Path
+ from fastapi import APIRouter, File, UploadFile, HTTPException
+ from fastapi.responses import JSONResponse
+ from controllers.loader import load_pdf
+
+ router = APIRouter(prefix="/file", tags=["file"])
+
+ ALLOWED_FILE_TYPES = {
+     "application/pdf": ".pdf",
+     "text/plain": ".txt",
+     "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"
+ }
+
+ @router.post("")
+ async def upload_file(file: UploadFile = File(...)) -> JSONResponse:
+     """
+     Handles a file upload request.
+
+     Args:
+         file (UploadFile): The uploaded file; must be a PDF, TXT, or DOCX.
+
+     Returns:
+         JSONResponse: The extracted documents, serialized as JSON.
+     """
+     if file.content_type not in ALLOWED_FILE_TYPES \
+             or Path(file.filename).suffix.lower() != ALLOWED_FILE_TYPES.get(file.content_type):
+         raise HTTPException(
+             status_code=400,
+             detail="Invalid file type. Only PDF, TXT, and DOCX are allowed."
+         )
+     content = await file.read()
+     result = []
+     if file.content_type == "application/pdf":
+         result = load_pdf(content, file.filename)
+     # TXT and DOCX pass validation but are not yet processed here
+     return JSONResponse(content=result)
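
A quick way to exercise the new endpoint once the app is running, assuming the default local address; the file path is hypothetical:

    import requests

    with open("sample.pdf", "rb") as f:  # hypothetical test file
        resp = requests.post(
            "http://localhost:8000/file",
            files={"file": ("sample.pdf", f, "application/pdf")},
        )
    print(resp.status_code, resp.json())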