Add file upload routes and implement PDF processing functionality
- app/controllers/loader.py +245 -0
- app/main.py +2 -1
- app/models/__init__.py +0 -0
- app/models/llm/__init__.py +4 -0
- app/router/file.py +36 -0
app/controllers/loader.py
ADDED
@@ -0,0 +1,245 @@
+"""Module to extract text from PDF files and images using Azure OpenAI's GPT-4o-mini model."""
+import base64
+import hashlib
+import json
+import os
+from io import BytesIO
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import Docx2txtLoader
+from langchain_core.documents import Document
+from pdf2image import convert_from_path
+from pydantic import BaseModel
+from pypdf import PdfReader
+
+from models.llm import client
+from models.db import vectorstore
+
+text_splitter = RecursiveCharacterTextSplitter()
+
+class ExtractionResult(BaseModel):
+    """
+    Data model representing the result of an extraction.
+
+    Attributes:
+        content (str): The extracted content as a string.
+    """
+    content: str
+
+def check_image(page):
+    """
+    Checks whether a given PDF page contains any images.
+
+    This function examines the /Resources dictionary of the provided PDF page
+    to determine whether it contains any XObjects of subtype /Image.
+
+    Args:
+        page: A dictionary-like object representing a PDF page.
+
+    Returns:
+        bool: True if the page contains at least one image, False otherwise.
+    """
+    # Get the /Resources dictionary
+    resources = page.get("/Resources")
+    if resources is None:
+        return False
+    # Check for /XObject in resources
+    xobjects = resources.get("/XObject")
+    if xobjects is None:
+        return False
+    # Iterate through XObjects to find images
+    for obj in xobjects.values():
+        if obj.get("/Subtype") == "/Image":
+            return True
+    return False
+
+def extract_text_from_image(image):
+    """
+    Extracts the text content from an image of a document page.
+
+    The image is encoded as a Base64 PNG and sent to the GPT-4o-mini model,
+    which returns the page content as structured JSON.
+
+    Args:
+        image (PIL.Image.Image): The image object representing the document page.
+
+    Returns:
+        str: The extracted plain-text content of the page.
+
+    Raises:
+        Exception: If the response from the model is invalid or cannot be parsed.
+    """
+    buffer = BytesIO()
+    image.save(buffer, format="PNG")
+    base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+    prompt = """
+    You are an AI assistant that extracts data from documents and returns it as structured JSON.
+    Analyze the provided image of a document page and extract the following:
+    - Content of the page (plain text)
+    """
+    response = client.beta.chat.completions.parse(
+        model="gpt-4o-mini",
+        response_format=ExtractionResult,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{base64_image}"}
+                    }
+                ]
+            }
+        ]
+    )
+    return json.loads(response.choices[0].message.content)["content"]
+
+def load_pdf(content: bytes, filename: str):
+    """
+    Writes uploaded PDF bytes to a temporary file and extracts text page by page.
+
+    Text is read directly from each page; if a page contains an image, the page
+    is rendered to an image and the text is extracted from it instead.
+
+    Args:
+        content (bytes): The raw bytes of the uploaded PDF.
+        filename (str): The name of the uploaded file, used for metadata.
+
+    Returns:
+        list: A list of serialized Document objects, each holding the page
+            content and metadata (filename and page number).
+    """
+    documents = []
+    path = os.path.join("/tmp", filename)
+    with open(path, "wb") as f:
+        f.write(content)
+    try:
+        pdf = PdfReader(path)
+        for page_num, page in enumerate(pdf.pages):
+            if check_image(page):
+                images = convert_from_path(
+                    path, first_page=page_num + 1, last_page=page_num + 1)
+                text = extract_text_from_image(images[0])
+            else:
+                text = page.extract_text()
+            doc = Document(
+                page_content=text,
+                metadata={"source": filename, "page": page_num + 1})
+            documents.append(doc.model_dump())
+        os.remove(path)
+        return documents
+    except (FileNotFoundError, ValueError, OSError) as e:
+        print(f"Error: {str(e)}")
+        return documents
+
+def load_jsonl(directory):
+    """
+    Reads every JSONL file in a directory and converts its lines into Document objects.
+
+    Args:
+        directory (str): The path to the directory containing the JSONL files.
+
+    Returns:
+        list: A list of Document objects.
+    """
+    documents = []
+    for filename in os.listdir(directory):
+        if filename.endswith(".jsonl"):
+            file_path = os.path.join(directory, filename)
+            with open(file_path, "r", encoding="utf-8") as file:
+                for line in file:
+                    # Parse each line as JSON
+                    json_obj = json.loads(line.strip())
+                    metadata = {
+                        "id": json_obj.get("id", ""),
+                        "url": json_obj.get("url", ""),
+                        "title": json_obj.get("title", ""),
+                        "ts": json_obj.get("ts", "")
+                    }
+                    # HTML entries carry their text Base64-encoded
+                    if json_obj.get("mime") == "text/html":
+                        text = base64.urlsafe_b64decode(json_obj.get("text", "")).decode("utf-8")
+                    else:
+                        text = json_obj.get("text", "")
+                    documents.append(Document(page_content=text, metadata=metadata))
+    return documents
+
+def load_docx(directory):
+    """
+    Loads and processes all .docx files from a specified directory.
+
+    This function iterates through the files in the given directory, identifies
+    files with a .docx extension, and uses the Docx2txtLoader to load and extract
+    their contents. The extracted contents are aggregated, uploaded to the vector
+    store, and returned.
+
+    Args:
+        directory (str): The path to the directory containing .docx files.
+
+    Returns:
+        list: A list containing the contents of all loaded .docx files.
+    """
+    documents = []
+    for filename in os.listdir(directory):
+        if filename.endswith(".docx"):
+            documents.extend(Docx2txtLoader(file_path=os.path.join(directory, filename)).load())
+    upload(documents)
+    return documents
+
+def upload(docs):
+    """
+    Splits documents into smaller chunks, normalizes their metadata, generates
+    a unique ID for each chunk, and adds the chunks to the vector store.
+
+    Args:
+        docs (list): A list of document objects to be processed.
+
+    Metadata processing:
+        - Copies "page_label" into an integer "page" field when present.
+        - Derives "attachment" from "source" by stripping the "{FOLDER}/" prefix.
+        - Keeps only the "attachment" and "page" metadata keys.
+        - Sets "id" to the SHA-256 hash of the "attachment" value.
+        - Builds each chunk ID from "id", "page" (when present), and the chunk index.
+
+    Raises:
+        KeyError: If required metadata keys are missing during processing.
+    """
+    documents = []
+    ids = []
+    for doc in docs:
+        for index, document in enumerate(text_splitter.split_documents([doc])):
+            if "page_label" in document.metadata:
+                document.metadata["page"] = int(document.metadata["page_label"])
+            document.metadata["attachment"] = document.metadata["source"].replace("{FOLDER}/", "")
+            document.metadata = {
+                key: value
+                for key, value in document.metadata.items()
+                if key in ["attachment", "page"]
+            }
+            document.metadata["id"] = hashlib.sha256(
+                document.metadata["attachment"].encode()).hexdigest()
+            if "page" in document.metadata:
+                ids.append(f"{document.metadata['id']}-{document.metadata['page']}-{index}")
+            else:
+                ids.append(f"{document.metadata['id']}-{index}")
+            documents.append(document)
+    vectorstore.add_documents(documents=documents, ids=ids)
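
For context on how these loaders compose, here is a minimal driver sketch, not part of the commit: it assumes the app/ directory is on the import path, a vector store is wired up in models.db, and sample.pdf is a placeholder filename.

# Hypothetical driver; everything except load_pdf/upload is a placeholder
from langchain_core.documents import Document
from controllers.loader import load_pdf, upload

with open("sample.pdf", "rb") as f:
    pages = load_pdf(f.read(), "sample.pdf")  # one serialized Document per page

# load_pdf returns plain dicts (model_dump), so rebuild Documents before indexing
upload([Document(**page) for page in pages])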
app/main.py
CHANGED
@@ -4,7 +4,7 @@ import logging
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from jose import jwt
-from router import auth, content, service
+from router import auth, content, service, file
 from starlette.middleware.base import BaseHTTPMiddleware

 SECRET_KEY = "your-secret-key"
@@ -67,6 +67,7 @@ app = FastAPI(docs_url="/")
 app.include_router(content.router)
 app.include_router(service.router)
 app.include_router(auth.router)
+app.include_router(file.router)

 origins = [
     "*"
app/models/__init__.py
ADDED
File without changes
app/models/llm/__init__.py
CHANGED
@@ -2,8 +2,10 @@
 from typing import List
 from langchain.embeddings.base import Embeddings
 from sentence_transformers import SentenceTransformer
+from openai import AzureOpenAI
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

+
 class GPTModel(AzureChatOpenAI):
     """
     GPTModel class that extends AzureChatOpenAI.
@@ -74,3 +76,5 @@ class EmbeddingsModel(Embeddings):
         List[float]: The embedded representation of the query as a list of floats.
     """
     return self.model.encode([query]).tolist()[0]
+
+client = AzureOpenAI()
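
A note on the new client = AzureOpenAI(): constructed without arguments, the SDK falls back to environment configuration. A minimal setup sketch, assuming the conventional Azure variable names used by the openai package (values are placeholders):

import os

# Conventional openai-SDK variable names for Azure; confirm against your deployment
os.environ["AZURE_OPENAI_API_KEY"] = "<key>"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<resource>.openai.azure.com"
os.environ["OPENAI_API_VERSION"] = "<api-version>"

from openai import AzureOpenAI
client = AzureOpenAI()  # picks up the settings above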
app/router/file.py
ADDED
@@ -0,0 +1,36 @@
+"""Module defining the file upload routes of the API."""
+from pathlib import Path
+from fastapi import APIRouter, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+from controllers.loader import load_pdf
+
+router = APIRouter(prefix="/file", tags=["file"])
+
+ALLOWED_FILE_TYPES = {
+    "application/pdf": ".pdf",
+    "text/plain": ".txt",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"
+}
+
+@router.post("")
+async def upload_file(file: UploadFile = File(...)) -> JSONResponse:
+    """
+    Handles a file upload request.
+
+    Validates the content type and extension of the uploaded file, then
+    extracts its content. Only PDFs are processed so far; TXT and DOCX
+    files pass validation but yield an empty result.
+
+    Args:
+        file (UploadFile): The uploaded file.
+
+    Returns:
+        JSONResponse: The extracted documents, one per page.
+    """
+    content = await file.read()
+    result = []
+    if file.content_type not in ALLOWED_FILE_TYPES \
+            or Path(file.filename).suffix.lower() != ALLOWED_FILE_TYPES.get(file.content_type):
+        raise HTTPException(
+            status_code=400,
+            detail="Invalid file type. Only PDF, TXT, and DOCX are allowed."
+        )
+    if file.content_type == "application/pdf":
+        result = load_pdf(content, file.filename)
+    return JSONResponse(content=result)
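
Assuming the app is served locally on port 8000 (the commit does not pin a host, so this is illustrative), a quick smoke test of the new endpoint could look like:

# Hypothetical client-side check; sample.pdf is a placeholder file
import requests

with open("sample.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/file",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
print(response.status_code)
print(response.json())  # list of {"page_content": ..., "metadata": ...} dicts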