File size: 4,208 Bytes
0b4e7f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import streamlit as st
import fitz  # PyMuPDF
from PIL import Image
import io
import pandas as pd
import json
import re
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()
genai.configure(api_key=os.getenv("api_key"))  # Secure API key loading

# Convert PIL Image to format Gemini accepts
def image_to_gemini_format(image):
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="PNG")
    return {
        "mime_type": "image/png",
        "data": img_byte_arr.getvalue()
    }

# Generate content using Gemini
def get_response(model, image_part, user_prompt, system_instruction):
    response = model.generate_content([
        system_instruction,
        image_part,
        user_prompt
    ])
    return response.text

# Convert PDF to images
def convert_pdf_to_images(pdf_bytes):
    images = []
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    for page in doc:
        pix = page.get_pixmap(dpi=300)
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        images.append(img)
    return images

# Streamlit UI
st.set_page_config(page_title="Invoice Extractor", layout="centered")
st.title("📄 Invoice Table Extractor using Gemini AI")

uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"])

if uploaded_pdf:
    with st.spinner("Converting PDF to images..."):
        images = convert_pdf_to_images(uploaded_pdf.read())

    st.image(images[0], caption="Page 1 of PDF", use_container_width=True)

    if st.button("Extract Table from Invoice"):
        with st.spinner("Extracting data with Gemini..."):
            try:
                model = genai.GenerativeModel('gemini-1.5-flash')

                system_instruction = "You are an AI specialized in extracting structured data from invoices."
                user_prompt = """
Extract the invoice table from the uploaded invoice document.
The table should include the following columns:
- CODE ARTICLE
- DESIGNATION
- QTE COMMANDÉE
- QTE LIVRÉE
- PRIX UNIT. REF
- PRIX UNIT. HT
- PRIX UNIT. TTC
- TOTAL HT
- TVA %
Also, extract and attach the following metadata fields to each row:
- N° CLIENT
- NOM CLIENT
- N° FACTURE
- DATE FACTURE
- DATE DE CDE
- Supplier/Company Name
After extraction:
- Create a clean pandas DataFrame containing all the above fields.
- Drop any rows where CODE ARTICLE is empty or missing.
- Return the data in JSON dictionary format.
"""

                image_part = image_to_gemini_format(images[0])
                response_text = get_response(model, image_part, user_prompt, system_instruction)

                # Extract JSON from Gemini response
                json_match = re.search(r"\[\s*{.*?}\s*]", response_text, re.DOTALL)
                if json_match:
                    clean_json = json_match.group()
                    data = json.loads(clean_json)
                    df = pd.DataFrame(data)

                    # Clean data
                    df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")]

                    if df.empty:
                        st.warning("No valid rows with CODE ARTICLE found.")
                    else:
                        st.success("✅ Gemini responded!")
                        st.dataframe(df)

                        # Create Excel file in memory
                        output = io.BytesIO()
                        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
                            df.to_excel(writer, index=False, sheet_name="Invoice Data")
                        output.seek(0)

                        # Download button
                        st.download_button(
                            label="📥 Download Excel",
                            data=output,
                            file_name="invoice_extracted.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                        )
                else:
                    st.error("❌ Could not find valid JSON in Gemini's response.")

            except Exception as e:
                st.error("⚠️ Failed to extract or parse data.")
                st.exception(e)