"""Module to extract text from PDF files and images using Azure OpenAI's GPT-4o-mini model.""" import base64 import hashlib import json import os from io import BytesIO from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.document_loaders import Docx2txtLoader from langchain_core.documents import Document from pdf2image import convert_from_path from pydantic import BaseModel from pypdf import PdfReader from models.llm import client from models.db import vectorstore text_splitter = RecursiveCharacterTextSplitter() class ExtractionResult(BaseModel): """ ExtractionResult is a data model that represents the result of an extraction process. Attributes: content (str): The extracted content as a string. """ content: str def check_image(page): """ Checks if a given PDF page contains any images. This function examines the /Resources dictionary of the provided PDF page to determine if it contains any XObjects of subtype /Image. Args: page: A dictionary-like object representing a PDF page. Returns: bool: True if the page contains at least one image, False otherwise. """ # Get the /Resources dictionary resources = page.get("/Resources") if resources is None: return False # Check for /XObject in resources xobjects = resources.get("/XObject") if xobjects is None: return False # Iterate through XObjects to find images for obj in xobjects.values(): if obj.get("/Subtype") == "/Image": return True return False def extract_text_from_image(image): """ Extracts text content from an image of a document page and returns it as structured JSON. Args: image (PIL.Image.Image): The image object representing the document page. Returns: str: The extracted plain text content of the page in JSON format. Raises: Exception: If the response from the AI model is invalid or cannot be parsed. Dependencies: - Requires the `BytesIO` module for handling image byte streams. - Requires the `base64` module for encoding the image in Base64 format. - Requires a client instance capable of interacting with the GPT-4o-mini model. """ image_bytes = BytesIO() image.save(image_bytes, format="PNG") image_bytes = image_bytes.getvalue() base64_image = base64.b64encode(image_bytes).decode("utf-8") prompt = """ You are an AI assistant that extracts data from documents and returns it as structured JSON. Analyze the provided image of a document page and extract the following: - Content of the page (plain text) """ response = client.beta.chat.completions.parse( model="gpt-4o-mini", response_format = ExtractionResult, messages=[ { "role": "user", "content": [ {"type": "text", "text": prompt}, { "type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"} } ] } ] ) return json.loads(response.choices[0].message.content)["content"] def load_pdf(content: bytes, filename: str): """ Loads and processes PDF files from a specified directory. This function iterates through all PDF files in the given directory, extracts text from each page, and creates a list of Document objects containing the extracted text and metadata. If a page contains an image, the text is extracted from the image using OCR. Args: directory (str): The path to the directory containing the PDF files. Returns: list: A list of Document objects, where each object contains the page content and metadata (filename and page number). Raises: FileNotFoundError: If a specified PDF file is not found. Exception: For any other errors encountered during processing. 

    Notes:
        - The function relies on the helper functions `check_image`,
          `convert_from_path`, and `extract_text_from_image`.
        - The `Document` class is used to store the page content and metadata.
    """
    documents = []
    path = os.path.join("/tmp", filename)
    with open(path, "wb") as f:
        f.write(content)
    try:
        pdf = PdfReader(path)
        for page_num, page in enumerate(pdf.pages):
            contains_image = check_image(page)
            if contains_image:
                # Render only this page as an image and extract its text with the vision model
                images = convert_from_path(
                    path, first_page=page_num + 1, last_page=page_num + 1)
                text = extract_text_from_image(images[0])
            else:
                text = page.extract_text()
            doc = Document(
                page_content=text,
                metadata={"source": filename, "page": page_num + 1})
            documents.append(doc.model_dump())
        os.remove(path)
        return documents
    except (FileNotFoundError, ValueError, OSError) as e:
        print(f"Error: {str(e)}")
        return documents


def load_jsonl(directory):
    """
    Reads the JSONL files in a specified directory and converts their content
    into a list of Document objects.

    Args:
        directory (str): Path to the directory containing the JSONL files.

    Returns:
        list: A list of Document objects.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".jsonl"):
            file_path = os.path.join(directory, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    # Parse each line as JSON
                    json_obj = json.loads(line.strip())
                    metadata = {
                        "id": json_obj.get("id", ""),
                        "url": json_obj.get("url", ""),
                        "title": json_obj.get("title", ""),
                        "ts": json_obj.get("ts", "")
                    }
                    # HTML records carry their text Base64-encoded
                    if json_obj.get("mime") == "text/html":
                        text = base64.urlsafe_b64decode(json_obj.get("text", "")).decode("utf-8")
                    else:
                        text = json_obj.get("text", "")
                    doc = Document(page_content=text, metadata=metadata)
                    documents.append(doc)
    return documents


def load_docx(directory):
    """
    Loads and processes all .docx files from a specified directory.

    This function iterates through the files in the given directory, identifies
    files with a .docx extension, and uses the Docx2txtLoader to load and
    extract their contents. The extracted contents are aggregated into a single
    list and uploaded to the vector store.

    Args:
        directory (str): The path to the directory containing .docx files.

    Returns:
        list: A list containing the contents of all loaded .docx files.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".docx"):
            documents.extend(
                Docx2txtLoader(file_path=os.path.join(directory, filename)).load())
    upload(documents)
    return documents


def upload(docs):
    """
    Processes a list of documents, splits them into smaller chunks, updates their
    metadata, generates unique IDs for each chunk, and adds them to a vector store.

    Args:
        docs (list): A list of Document objects to be processed.

    Metadata Processing:
        - Extracts and updates the "page" metadata if "page_label" exists.
        - Updates the "attachment" metadata by removing the "{FOLDER}/" prefix from the "source".
        - Filters metadata to retain only the "attachment" and "page" keys.
        - Generates a unique "id" for each document based on the "attachment" metadata.
        - Constructs unique IDs for each document chunk, incorporating "id", "page", and chunk index.

    Operations:
        - Splits each document into smaller chunks using `text_splitter.split_documents`.
        - Appends processed document chunks and their IDs to the `documents` and `ids` lists.
        - Adds the processed documents and their IDs to the `vectorstore`.

    Raises:
        KeyError: If required metadata keys are missing during processing.
""" documents = [] ids = [] for doc in docs: for index, document in enumerate(text_splitter.split_documents([doc])): if "page_label" in document.metadata: document.metadata["page"] = int(document.metadata["page_label"]) document.metadata["attachment"] = document.metadata["source"].replace("{FOLDER}/", "") document.metadata = { key: value for key, value in document.metadata.items() if key in ["attachment", "page"] } document.metadata["id"] = str( hashlib.sha256(document.metadata['attachment'].encode()).hexdigest()) if "page" in document.metadata: ids.append(f"{document.metadata['id']}-{document.metadata['page']}-{index}") else: ids.append(f"{document.metadata['id']}-{index}") documents.append(document) vectorstore.add_documents(documents=documents, ids=ids)