"""Module to extract text from PDF files and images using Azure OpenAI's GPT-4o-mini model."""
import base64
import hashlib
import json
import os
from io import BytesIO
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import Docx2txtLoader
from langchain_core.documents import Document
from pdf2image import convert_from_path
from pydantic import BaseModel
from pypdf import PdfReader
from models.llm import client
from models.db import vectorstore
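
# The text splitter below runs with RecursiveCharacterTextSplitter's default
# chunk size and overlap; pass chunk_size/chunk_overlap explicitly if the
# chunks come out too large for the embedding model.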
text_splitter = RecursiveCharacterTextSplitter()
class ExtractionResult(BaseModel):
"""
ExtractionResult is a data model that represents the result of an extraction process.
Attributes:
content (str): The extracted content as a string.
"""
content: str
def check_image(page):
"""
Checks if a given PDF page contains any images.
This function examines the /Resources dictionary of the provided PDF page
to determine if it contains any XObjects of subtype /Image.
Args:
page: A dictionary-like object representing a PDF page.
Returns:
bool: True if the page contains at least one image, False otherwise.
"""
# Get the /Resources dictionary
resources = page.get("/Resources")
if resources is None:
return False
# Check for /XObject in resources
xobjects = resources.get("/XObject")
if xobjects is None:
return False
# Iterate through XObjects to find images
for obj in xobjects.values():
if obj.get("/Subtype") == "/Image":
return True
return False
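
# Usage sketch (assumes a local file "sample.pdf"): collect the 1-based page
# numbers that contain at least one embedded image.
#
#   reader = PdfReader("sample.pdf")
#   image_pages = [i + 1 for i, p in enumerate(reader.pages) if check_image(p)]
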
def extract_text_from_image(image):
"""
Extracts text content from an image of a document page and returns it as structured JSON.
Args:
image (PIL.Image.Image): The image object representing the document page.
    Returns:
        str: The extracted plain text content of the page.
    Raises:
        Exception: If the response from the AI model is invalid or cannot be parsed.
    Dependencies:
        - Requires `io.BytesIO` for handling image byte streams.
        - Requires the `base64` module for encoding the image in Base64 format.
        - Requires a client instance capable of interacting with the GPT-4o-mini model.
"""
image_bytes = BytesIO()
image.save(image_bytes, format="PNG")
image_bytes = image_bytes.getvalue()
base64_image = base64.b64encode(image_bytes).decode("utf-8")
prompt = """
You are an AI assistant that extracts data from documents and returns it as structured JSON.
Analyze the provided image of a document page and extract the following:
- Content of the page (plain text)
"""
response = client.beta.chat.completions.parse(
model="gpt-4o-mini",
        response_format=ExtractionResult,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{base64_image}"}
}
]
}
]
)
return json.loads(response.choices[0].message.content)["content"]
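
# Usage sketch (assumes a scanned file "scan.pdf" and a poppler install for
# pdf2image): render the first page and run it through the vision model.
#
#   pages = convert_from_path("scan.pdf", first_page=1, last_page=1)
#   page_text = extract_text_from_image(pages[0])
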
def load_pdf(content: bytes, filename: str):
"""
Loads and processes PDF files from a specified directory.
This function iterates through all PDF files in the given directory, extracts text
from each page, and creates a list of Document objects containing the extracted text
and metadata. If a page contains an image, the text is extracted from the image using
OCR.
Args:
directory (str): The path to the directory containing the PDF files.
Returns:
list: A list of Document objects, where each object contains the page content
and metadata (filename and page number).
Raises:
FileNotFoundError: If a specified PDF file is not found.
Exception: For any other errors encountered during processing.
Notes:
- The function assumes the presence of helper functions `check_image`,
`convert_from_path`, and `extract_text_from_image`.
- The `Document` class is used to store the page content and metadata.
"""
documents = []
path = os.path.join("/tmp", filename)
with open(path, "wb") as f:
f.write(content)
    try:
        pdf = PdfReader(path)
        for page_num, page in enumerate(pdf.pages):
            # Pages with embedded images go through the vision model;
            # text-only pages are extracted directly with pypdf.
            if check_image(page):
                images = convert_from_path(
                    path, first_page=page_num + 1, last_page=page_num + 1)
                text = extract_text_from_image(images[0])
            else:
                text = page.extract_text()
            doc = Document(
                page_content=text,
                metadata={"source": filename, "page": page_num + 1})
            documents.append(doc.model_dump())
    except (FileNotFoundError, ValueError, OSError) as e:
        print(f"Error: {e}")
    finally:
        # Clean up the temporary file even if extraction fails part-way.
        if os.path.exists(path):
            os.remove(path)
    return documents
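
# Usage sketch (hypothetical upload handler): pass the raw bytes of an
# uploaded file plus its filename to get serialized Document dicts back.
#
#   with open("report.pdf", "rb") as f:
#       docs = load_pdf(f.read(), "report.pdf")
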
def load_jsonl(directory):
    """
    Reads every JSONL file in a directory and converts its lines into Document objects.
    Args:
        directory (str): Path to the directory containing .jsonl files.
    Returns:
        list: A list of Document objects, one per JSONL record.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".jsonl"):
            file_path = os.path.join(directory, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    # Parse each line as a standalone JSON record
                    json_obj = json.loads(line.strip())
                    metadata = {
                        "id": json_obj.get("id", ""),
                        "url": json_obj.get("url", ""),
                        "title": json_obj.get("title", ""),
                        "ts": json_obj.get("ts", "")
                    }
                    # HTML payloads are stored base64-encoded; plain text is stored as-is
                    if json_obj.get("mime") == "text/html":
                        text = base64.urlsafe_b64decode(json_obj.get("text", "")).decode("utf-8")
                    else:
                        text = json_obj.get("text", "")
                    documents.append(Document(page_content=text, metadata=metadata))
    return documents
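
# Usage sketch (assumes a directory "/data/crawl" of .jsonl exports):
#
#   records = load_jsonl("/data/crawl")
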
def load_docx(directory):
"""
Loads and processes all .docx files from a specified directory.
This function iterates through the files in the given directory, identifies
files with a .docx extension, and uses the Docx2txtLoader to load and extract
their contents. The extracted contents are aggregated into a single list.
Args:
directory (str): The path to the directory containing .docx files.
Returns:
list: A list containing the contents of all loaded .docx files.
"""
documents = []
for filename in os.listdir(directory):
if filename.endswith(".docx"):
documents.extend(Docx2txtLoader(file_path=os.path.join(directory, filename)).load())
upload(documents)
return documents
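
# Usage sketch (assumes a directory "/data/docs"): loading also indexes the
# extracted documents as a side effect, via upload().
#
#   docx_docs = load_docx("/data/docs")
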
def upload(docs):
"""
Processes a list of documents, splits them into smaller chunks, updates their metadata,
generates unique IDs for each chunk, and adds them to a vector store.
Args:
docs (list): A list of document objects to be processed.
Metadata Processing:
- Extracts and updates the "page" metadata if "page_label" exists.
- Updates the "attachment" metadata by removing the "{FOLDER}/" prefix from the "source".
- Filters metadata to retain only "attachment" and "page" keys.
- Generates a unique "id" for each document based on the "attachment" metadata.
- Constructs unique IDs for each document chunk, incorporating "id", "page", and chunk index.
Operations:
- Splits each document into smaller chunks using `text_splitter.split_documents`.
- Appends processed document chunks and their IDs to the `documents` and `ids` lists.
    - Adds the processed documents and their IDs to the `vectorstore`.
Raises:
KeyError: If required metadata keys are missing during processing.
"""
documents = []
ids = []
for doc in docs:
for index, document in enumerate(text_splitter.split_documents([doc])):
if "page_label" in document.metadata:
document.metadata["page"] = int(document.metadata["page_label"])
document.metadata["attachment"] = document.metadata["source"].replace("{FOLDER}/", "")
document.metadata = {
key: value
for key, value in document.metadata.items()
if key in ["attachment", "page"]
}
document.metadata["id"] = str(
hashlib.sha256(document.metadata['attachment'].encode()).hexdigest())
if "page" in document.metadata:
ids.append(f"{document.metadata['id']}-{document.metadata['page']}-{index}")
else:
ids.append(f"{document.metadata['id']}-{index}")
documents.append(document)
vectorstore.add_documents(documents=documents, ids=ids)
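
# Usage sketch: index a single document. Chunk IDs combine the SHA-256 of the
# attachment name with the page number (when present) and the chunk index,
# e.g. "<sha256>-3-0" for the first chunk of page 3.
#
#   doc = Document(page_content="...", metadata={"source": "report.pdf", "page": 3})
#   upload([doc])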