Spaces:

YAMITEK
/

pdf_extracter_using_gemini

Sleeping

File size: 4,208 Bytes

0b4e7f3

import streamlit as st
import fitz  # PyMuPDF
from PIL import Image
import io
import pandas as pd
import json
import re
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()
genai.configure(api_key=os.getenv("api_key"))  # Secure API key loading

# Convert PIL Image to format Gemini accepts
def image_to_gemini_format(image):
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="PNG")
    return {
        "mime_type": "image/png",
        "data": img_byte_arr.getvalue()
    }

# Generate content using Gemini
def get_response(model, image_part, user_prompt, system_instruction):
    response = model.generate_content([
        system_instruction,
        image_part,
        user_prompt
    ])
    return response.text

# Convert PDF to images
def convert_pdf_to_images(pdf_bytes):
    images = []
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    for page in doc:
        pix = page.get_pixmap(dpi=300)
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        images.append(img)
    return images

# Streamlit UI
st.set_page_config(page_title="Invoice Extractor", layout="centered")
st.title("📄 Invoice Table Extractor using Gemini AI")

uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"])

if uploaded_pdf:
    with st.spinner("Converting PDF to images..."):
        images = convert_pdf_to_images(uploaded_pdf.read())

    st.image(images[0], caption="Page 1 of PDF", use_container_width=True)

    if st.button("Extract Table from Invoice"):
        with st.spinner("Extracting data with Gemini..."):
            try:
                model = genai.GenerativeModel('gemini-1.5-flash')

                system_instruction = "You are an AI specialized in extracting structured data from invoices."
                user_prompt = """
Extract the invoice table from the uploaded invoice document.
The table should include the following columns:
- CODE ARTICLE
- DESIGNATION
- QTE COMMANDÉE
- QTE LIVRÉE
- PRIX UNIT. REF
- PRIX UNIT. HT
- PRIX UNIT. TTC
- TOTAL HT
- TVA %
Also, extract and attach the following metadata fields to each row:
- N° CLIENT
- NOM CLIENT
- N° FACTURE
- DATE FACTURE
- DATE DE CDE
- Supplier/Company Name
After extraction:
- Create a clean pandas DataFrame containing all the above fields.
- Drop any rows where CODE ARTICLE is empty or missing.
- Return the data in JSON dictionary format.
"""

                image_part = image_to_gemini_format(images[0])
                response_text = get_response(model, image_part, user_prompt, system_instruction)

                # Extract JSON from Gemini response
                json_match = re.search(r"\[\s*{.*?}\s*]", response_text, re.DOTALL)
                if json_match:
                    clean_json = json_match.group()
                    data = json.loads(clean_json)
                    df = pd.DataFrame(data)

                    # Clean data
                    df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")]

                    if df.empty:
                        st.warning("No valid rows with CODE ARTICLE found.")
                    else:
                        st.success("✅ Gemini responded!")
                        st.dataframe(df)

                        # Create Excel file in memory
                        output = io.BytesIO()
                        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
                            df.to_excel(writer, index=False, sheet_name="Invoice Data")
                        output.seek(0)

                        # Download button
                        st.download_button(
                            label="📥 Download Excel",
                            data=output,
                            file_name="invoice_extracted.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                        )
                else:
                    st.error("❌ Could not find valid JSON in Gemini's response.")

            except Exception as e:
                st.error("⚠️ Failed to extract or parse data.")
                st.exception(e)