Add file upload routes and implement PDF processing functionality
- app/controllers/loader.py +245 -0
- app/main.py +2 -1
- app/models/__init__.py +0 -0
- app/models/llm/__init__.py +4 -0
- app/router/file.py +36 -0
app/controllers/loader.py
ADDED
@@ -0,0 +1,245 @@
+"""Module to extract text from PDF files and images using Azure OpenAI's GPT-4o-mini model."""
+import base64
+import hashlib
+import json
+import os
+from io import BytesIO
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import Docx2txtLoader
+from langchain_core.documents import Document
+from pdf2image import convert_from_path
+from pydantic import BaseModel
+from pypdf import PdfReader
+
+from models.llm import client
+from models.db import vectorstore
+
+text_splitter = RecursiveCharacterTextSplitter()
+
+class ExtractionResult(BaseModel):
+    """
+    Data model representing the result of an extraction.
+
+    Attributes:
+        content (str): The extracted content as a string.
+    """
+    content: str
+
+def check_image(page):
+    """
+    Checks whether a given PDF page contains any images.
+
+    This function examines the /Resources dictionary of the provided PDF page
+    to determine whether it contains any XObjects of subtype /Image.
+
+    Args:
+        page: A dictionary-like object representing a PDF page.
+
+    Returns:
+        bool: True if the page contains at least one image, False otherwise.
+    """
+    # Get the /Resources dictionary
+    resources = page.get("/Resources")
+    if resources is None:
+        return False
+    # Check for /XObject in resources
+    xobjects = resources.get("/XObject")
+    if xobjects is None:
+        return False
+    # Iterate through XObjects to find images
+    for obj in xobjects.values():
+        if obj.get("/Subtype") == "/Image":
+            return True
+    return False
+
+def extract_text_from_image(image):
+    """
+    Extracts the text content from an image of a document page.
+
+    The image is encoded as a Base64 PNG and sent to the GPT-4o-mini model,
+    which returns the page content as structured JSON.
+
+    Args:
+        image (PIL.Image.Image): The image object representing the document page.
+
+    Returns:
+        str: The extracted plain-text content of the page.
+
+    Raises:
+        Exception: If the response from the model is invalid or cannot be parsed.
+    """
+    buffer = BytesIO()
+    image.save(buffer, format="PNG")
+    base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
+    prompt = """
+    You are an AI assistant that extracts data from documents and returns it as structured JSON.
+    Analyze the provided image of a document page and extract the following:
+    - Content of the page (plain text)
+    """
+    response = client.beta.chat.completions.parse(
+        model="gpt-4o-mini",
+        response_format=ExtractionResult,
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{base64_image}"}
+                    }
+                ]
+            }
+        ]
+    )
+    return json.loads(response.choices[0].message.content)["content"]
+
+def load_pdf(content: bytes, filename: str):
+    """
+    Writes uploaded PDF bytes to a temporary file and extracts text page by page.
+
+    Text is read directly from each page; if a page contains an image, the page
+    is rendered to an image and the text is extracted from it instead.
+
+    Args:
+        content (bytes): The raw bytes of the uploaded PDF.
+        filename (str): The name of the uploaded file, used for metadata.
+
+    Returns:
+        list: A list of serialized Document objects, each holding the page
+            content and metadata (filename and page number).
+    """
+    documents = []
+    path = os.path.join("/tmp", filename)
+    with open(path, "wb") as f:
+        f.write(content)
+    try:
+        pdf = PdfReader(path)
+        for page_num, page in enumerate(pdf.pages):
+            if check_image(page):
+                images = convert_from_path(
+                    path, first_page=page_num + 1, last_page=page_num + 1)
+                text = extract_text_from_image(images[0])
+            else:
+                text = page.extract_text()
+            doc = Document(
+                page_content=text,
+                metadata={"source": filename, "page": page_num + 1})
+            documents.append(doc.model_dump())
+        os.remove(path)
+        return documents
+    except (FileNotFoundError, ValueError, OSError) as e:
+        print(f"Error: {str(e)}")
+        return documents
+
+def load_jsonl(directory):
+    """
+    Reads every JSONL file in a directory and converts its lines into Document objects.
+
+    Args:
+        directory (str): The path to the directory containing the JSONL files.
+
+    Returns:
+        list: A list of Document objects.
+    """
+    documents = []
+    for filename in os.listdir(directory):
+        if filename.endswith(".jsonl"):
+            file_path = os.path.join(directory, filename)
+            with open(file_path, "r", encoding="utf-8") as file:
+                for line in file:
+                    # Parse each line as JSON
+                    json_obj = json.loads(line.strip())
+                    metadata = {
+                        "id": json_obj.get("id", ""),
+                        "url": json_obj.get("url", ""),
+                        "title": json_obj.get("title", ""),
+                        "ts": json_obj.get("ts", "")
+                    }
+                    # HTML entries carry their text Base64-encoded
+                    if json_obj.get("mime") == "text/html":
+                        text = base64.urlsafe_b64decode(json_obj.get("text", "")).decode("utf-8")
+                    else:
+                        text = json_obj.get("text", "")
+                    documents.append(Document(page_content=text, metadata=metadata))
+    return documents
+
+def load_docx(directory):
+    """
+    Loads and processes all .docx files from a specified directory.
+
+    This function iterates through the files in the given directory, identifies
+    files with a .docx extension, and uses the Docx2txtLoader to load and extract
+    their contents. The extracted contents are aggregated, uploaded to the vector
+    store, and returned.
+
+    Args:
+        directory (str): The path to the directory containing .docx files.
+
+    Returns:
+        list: A list containing the contents of all loaded .docx files.
+    """
+    documents = []
+    for filename in os.listdir(directory):
+        if filename.endswith(".docx"):
+            documents.extend(Docx2txtLoader(file_path=os.path.join(directory, filename)).load())
+    upload(documents)
+    return documents
+
+def upload(docs):
+    """
+    Splits documents into smaller chunks, normalizes their metadata, generates
+    a unique ID for each chunk, and adds the chunks to the vector store.
+
+    Args:
+        docs (list): A list of document objects to be processed.
+
+    Metadata processing:
+        - Copies "page_label" into an integer "page" field when present.
+        - Derives "attachment" from "source" by stripping the "{FOLDER}/" prefix.
+        - Keeps only the "attachment" and "page" metadata keys.
+        - Sets "id" to the SHA-256 hash of the "attachment" value.
+        - Builds each chunk ID from "id", "page" (when present), and the chunk index.
+
+    Raises:
+        KeyError: If required metadata keys are missing during processing.
+    """
+    documents = []
+    ids = []
+    for doc in docs:
+        for index, document in enumerate(text_splitter.split_documents([doc])):
+            if "page_label" in document.metadata:
+                document.metadata["page"] = int(document.metadata["page_label"])
+            document.metadata["attachment"] = document.metadata["source"].replace("{FOLDER}/", "")
+            document.metadata = {
+                key: value
+                for key, value in document.metadata.items()
+                if key in ["attachment", "page"]
+            }
+            document.metadata["id"] = hashlib.sha256(
+                document.metadata["attachment"].encode()).hexdigest()
+            if "page" in document.metadata:
+                ids.append(f"{document.metadata['id']}-{document.metadata['page']}-{index}")
+            else:
+                ids.append(f"{document.metadata['id']}-{index}")
+            documents.append(document)
+    vectorstore.add_documents(documents=documents, ids=ids)
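
For context on how these loaders compose, here is a minimal driver sketch, not part of the commit: it assumes the app/ directory is on the import path, a vector store is wired up in models.db, and sample.pdf is a placeholder filename.

# Hypothetical driver; everything except load_pdf/upload is a placeholder
from langchain_core.documents import Document
from controllers.loader import load_pdf, upload

with open("sample.pdf", "rb") as f:
    pages = load_pdf(f.read(), "sample.pdf")  # one serialized Document per page

# load_pdf returns plain dicts (model_dump), so rebuild Documents before indexing
upload([Document(**page) for page in pages])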
app/main.py
CHANGED
@@ -4,7 +4,7 @@ import logging
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from jose import jwt
-from router import auth, content, service
+from router import auth, content, service, file
 from starlette.middleware.base import BaseHTTPMiddleware

 SECRET_KEY = "your-secret-key"
@@ -67,6 +67,7 @@ app = FastAPI(docs_url="/")
 app.include_router(content.router)
 app.include_router(service.router)
 app.include_router(auth.router)
+app.include_router(file.router)

 origins = [
     "*"
app/models/__init__.py
ADDED
File without changes
app/models/llm/__init__.py
CHANGED
@@ -2,8 +2,10 @@
 from typing import List
 from langchain.embeddings.base import Embeddings
 from sentence_transformers import SentenceTransformer
+from openai import AzureOpenAI
 from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

+
 class GPTModel(AzureChatOpenAI):
     """
     GPTModel class that extends AzureChatOpenAI.
@@ -74,3 +76,5 @@ class EmbeddingsModel(Embeddings):
         List[float]: The embedded representation of the query as a list of floats.
     """
     return self.model.encode([query]).tolist()[0]
+
+client = AzureOpenAI()
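
A note on the new client = AzureOpenAI(): constructed without arguments, the SDK falls back to environment configuration. A minimal setup sketch, assuming the conventional Azure variable names used by the openai package (values are placeholders):

import os

# Conventional openai-SDK variable names for Azure; confirm against your deployment
os.environ["AZURE_OPENAI_API_KEY"] = "<key>"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<resource>.openai.azure.com"
os.environ["OPENAI_API_VERSION"] = "<api-version>"

from openai import AzureOpenAI
client = AzureOpenAI()  # picks up the settings above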
app/router/file.py
ADDED
@@ -0,0 +1,36 @@
+"""Module defining the file upload routes of the API."""
+from pathlib import Path
+from fastapi import APIRouter, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+from controllers.loader import load_pdf
+
+router = APIRouter(prefix="/file", tags=["file"])
+
+ALLOWED_FILE_TYPES = {
+    "application/pdf": ".pdf",
+    "text/plain": ".txt",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx"
+}
+
+@router.post("")
+async def upload_file(file: UploadFile = File(...)) -> JSONResponse:
+    """
+    Handles a file upload request.
+
+    Validates the content type and extension of the uploaded file, then
+    extracts its content. Only PDFs are processed so far; TXT and DOCX
+    files pass validation but yield an empty result.
+
+    Args:
+        file (UploadFile): The uploaded file.
+
+    Returns:
+        JSONResponse: The extracted documents, one per page.
+    """
+    content = await file.read()
+    result = []
+    if file.content_type not in ALLOWED_FILE_TYPES \
+            or Path(file.filename).suffix.lower() != ALLOWED_FILE_TYPES.get(file.content_type):
+        raise HTTPException(
+            status_code=400,
+            detail="Invalid file type. Only PDF, TXT, and DOCX are allowed."
+        )
+    if file.content_type == "application/pdf":
+        result = load_pdf(content, file.filename)
+    return JSONResponse(content=result)
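
Assuming the app is served locally on port 8000 (the commit does not pin a host, so this is illustrative), a quick smoke test of the new endpoint could look like:

# Hypothetical client-side check; sample.pdf is a placeholder file
import requests

with open("sample.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/file",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
print(response.status_code)
print(response.json())  # list of {"page_content": ..., "metadata": ...} dicts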