# Hugging Face Space: YAMITEK / invoice_pdf_to_excel_using_gemini (status: Running)
# File size: 3,783 Bytes

import streamlit as st
import fitz  # PyMuPDF
from PIL import Image
import io
import pandas as pd
import json
import google.generativeai as genai
from dotenv import load_dotenv
import os
import requests

# Populate the process environment from a dotenv file before any lookup below
load_dotenv()

# Hand the API key stored under the "apikey" environment variable to the
# Google Generative AI client (None is passed through when it is not set)
genai.configure(api_key=os.environ.get("apikey"))

def get_response(model, user_input, image, prompt):
    """Ask the model for a text answer about an image.

    The three content parts are sent in the order ``user_input``, ``image``,
    ``prompt`` as a single ``generate_content`` request.

    Args:
        model: Object exposing ``generate_content(parts)`` whose result has a
            ``text`` attribute (e.g. ``genai.GenerativeModel``).
        user_input: First content part of the request.
        image: Image content part of the request.
        prompt: Last content part of the request.

    Returns:
        The text of the model response, or ``None`` when the request or the
        extraction of its text failed (the error is shown with ``st.error``).
    """
    try:
        # This is where the actual interaction with the model happens
        response = model.generate_content([user_input, image, prompt])
        return response.text
    except Exception as e:
        # The previous handler caught only requests.exceptions.RequestException.
        # NOTE(review): the Gemini SDK is not expected to raise `requests`
        # errors (API failures presumably surface as google.api_core
        # exceptions, and `response.text` raises ValueError for blocked or
        # empty replies -- confirm against the SDK docs), so that handler never
        # fired. Catch broadly at this UI boundary so the documented
        # "report and return None" contract actually holds.
        st.error(f"⚠️ Error while calling the API: {e}")
        return None

def convert_pdf_to_images(pdf_bytes):
    """Render every page of a PDF to a PIL image using fitz (PyMuPDF).

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list with one PIL image per page, in page order. Each page is
        rendered at 300 dpi and decoded from an in-memory PNG.
    """
    # Open the document as a context manager so it is closed once rendering is
    # done, even when rendering one of the pages raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return [
            # The PNG bytes live in their own BytesIO buffer, so the images
            # remain usable after the document has been closed.
            Image.open(io.BytesIO(page.get_pixmap(dpi=300).tobytes("png")))
            for page in doc
        ]

# Instructions sent to Gemini together with the first page image.
INVOICE_PROMPT = """Extract the invoice table from the uploaded invoice document.
The table should include the following columns:
- CODE ARTICLE
- DESIGNATION
- QTE COMMANDÉE
- QTE LIVRÉE
- PRIX UNIT. REF
- PRIX UNIT. HT
- PRIX UNIT. TTC
- TOTAL HT
- TVA %
Also, extract and attach the following metadata fields to each row:
- N° CLIENT
- NOM CLIENT
- N° FACTURE
- DATE FACTURE
- DATE DE CDE
- Supplier/Company Name
After extraction:
- Create a clean pandas DataFrame containing all the above fields.
- Drop any rows where CODE ARTICLE is empty or missing.
- Return the data in JSON dictionary format.
"""


def _extract_json_payload(response_text):
    """Return the JSON value embedded in the model's free-form reply.

    The outermost ``[...]`` span is tried first (a list of rows); when the
    reply has no parsable array, the outermost ``{...}`` span is tried, since
    the prompt asks for a JSON dictionary.

    Raises:
        ValueError: if neither span exists or parses as JSON.
    """
    last_error = None
    for opener, closer in (("[", "]"), ("{", "}")):
        start = response_text.find(opener)
        end = response_text.rfind(closer)
        if start == -1 or end <= start:
            continue
        try:
            return json.loads(response_text[start:end + 1])
        except json.JSONDecodeError as err:
            last_error = err
    raise ValueError(
        "No JSON array or object found in the model response."
    ) from last_error


# Streamlit UI
st.set_page_config(page_title="Invoice Extractor", layout="centered")
st.title("📄 Invoice Table Extractor using Gemini AI")

uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"])

if uploaded_pdf:
    images = []
    with st.spinner("Converting PDF to images..."):
        try:
            images = convert_pdf_to_images(uploaded_pdf.read())
        except Exception as e:
            # A corrupt or unreadable upload should produce a message, not a crash.
            st.error("⚠️ Could not read the uploaded PDF.")
            st.exception(e)
        else:
            if not images:
                st.error("⚠️ The uploaded PDF contains no pages.")

    if images:
        # Only the first page is previewed and sent to the model.
        st.image(images[0], caption="Page 1 of PDF", use_column_width=True)

        if st.button("Extract Table from Invoice"):
            with st.spinner("Extracting data with Gemini..."):
                model = genai.GenerativeModel('gemini-1.5-flash')

                try:
                    # NOTE(review): the prompt is passed as both `user_input`
                    # and `prompt`, so it is sent twice; kept as-is to avoid
                    # changing what the model receives.
                    response_text = get_response(
                        model, INVOICE_PROMPT, images[0], INVOICE_PROMPT
                    )
                    if response_text:
                        st.success("✅ Gemini responded!")

                        # Parse JSON response
                        data = _extract_json_payload(response_text)
                        if isinstance(data, dict):
                            # A lone object is either column-oriented (values
                            # are lists) or a single row; pandas rejects a dict
                            # of plain scalars, so wrap the single-row case.
                            is_columnar = any(
                                isinstance(value, list) for value in data.values()
                            )
                            if not is_columnar:
                                data = [data]
                        df = pd.DataFrame(data)

                        # Clean data by removing rows with empty CODE ARTICLE
                        if "CODE ARTICLE" in df.columns:
                            has_code = df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")
                            df = df[has_code]
                        else:
                            st.warning(
                                "⚠️ Column 'CODE ARTICLE' not found in the response; "
                                "showing unfiltered data."
                            )

                        st.dataframe(df)

                        # Allow the user to download the result as an Excel file
                        output = io.BytesIO()
                        # Leaving the `with` block saves and closes the
                        # workbook. An explicit writer.save() is redundant and
                        # raises AttributeError on pandas versions that removed
                        # it (2.0+).
                        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
                            df.to_excel(writer, index=False, sheet_name="Invoice Data")
                        st.download_button(
                            label="📥 Download Excel",
                            data=output.getvalue(),
                            file_name="invoice_extracted.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                        )

                except Exception as e:
                    st.error("⚠️ Failed to extract or parse data.")
                    st.exception(e)