YAMITEK's picture
Create app.py
0b4e7f3 verified
import streamlit as st
import fitz # PyMuPDF
from PIL import Image
import io
import pandas as pd
import json
import re
import google.generativeai as genai
from dotenv import load_dotenv
import os
# Load environment variables
load_dotenv()
genai.configure(api_key=os.getenv("api_key")) # Secure API key loading
# Convert PIL Image to format Gemini accepts
def image_to_gemini_format(image):
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format="PNG")
return {
"mime_type": "image/png",
"data": img_byte_arr.getvalue()
}
# Generate content using Gemini
def get_response(model, image_part, user_prompt, system_instruction):
response = model.generate_content([
system_instruction,
image_part,
user_prompt
])
return response.text
# Convert PDF to images
def convert_pdf_to_images(pdf_bytes):
images = []
doc = fitz.open(stream=pdf_bytes, filetype="pdf")
for page in doc:
pix = page.get_pixmap(dpi=300)
img = Image.open(io.BytesIO(pix.tobytes("png")))
images.append(img)
return images
# Streamlit UI
st.set_page_config(page_title="Invoice Extractor", layout="centered")
st.title("📄 Invoice Table Extractor using Gemini AI")
uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"])
if uploaded_pdf:
with st.spinner("Converting PDF to images..."):
images = convert_pdf_to_images(uploaded_pdf.read())
st.image(images[0], caption="Page 1 of PDF", use_container_width=True)
if st.button("Extract Table from Invoice"):
with st.spinner("Extracting data with Gemini..."):
try:
model = genai.GenerativeModel('gemini-1.5-flash')
system_instruction = "You are an AI specialized in extracting structured data from invoices."
user_prompt = """
Extract the invoice table from the uploaded invoice document.
The table should include the following columns:
- CODE ARTICLE
- DESIGNATION
- QTE COMMANDÉE
- QTE LIVRÉE
- PRIX UNIT. REF
- PRIX UNIT. HT
- PRIX UNIT. TTC
- TOTAL HT
- TVA %
Also, extract and attach the following metadata fields to each row:
- N° CLIENT
- NOM CLIENT
- N° FACTURE
- DATE FACTURE
- DATE DE CDE
- Supplier/Company Name
After extraction:
- Create a clean pandas DataFrame containing all the above fields.
- Drop any rows where CODE ARTICLE is empty or missing.
- Return the data in JSON dictionary format.
"""
image_part = image_to_gemini_format(images[0])
response_text = get_response(model, image_part, user_prompt, system_instruction)
# Extract JSON from Gemini response
json_match = re.search(r"\[\s*{.*?}\s*]", response_text, re.DOTALL)
if json_match:
clean_json = json_match.group()
data = json.loads(clean_json)
df = pd.DataFrame(data)
# Clean data
df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")]
if df.empty:
st.warning("No valid rows with CODE ARTICLE found.")
else:
st.success("✅ Gemini responded!")
st.dataframe(df)
# Create Excel file in memory
output = io.BytesIO()
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
df.to_excel(writer, index=False, sheet_name="Invoice Data")
output.seek(0)
# Download button
st.download_button(
label="📥 Download Excel",
data=output,
file_name="invoice_extracted.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
else:
st.error("❌ Could not find valid JSON in Gemini's response.")
except Exception as e:
st.error("⚠️ Failed to extract or parse data.")
st.exception(e)