import streamlit as st import fitz # PyMuPDF from PIL import Image import io import pandas as pd import json import re import google.generativeai as genai from dotenv import load_dotenv import os # Load environment variables load_dotenv() genai.configure(api_key=os.getenv("api_key")) # Secure API key loading # Convert PIL Image to format Gemini accepts def image_to_gemini_format(image): img_byte_arr = io.BytesIO() image.save(img_byte_arr, format="PNG") return { "mime_type": "image/png", "data": img_byte_arr.getvalue() } # Generate content using Gemini def get_response(model, image_part, user_prompt, system_instruction): response = model.generate_content([ system_instruction, image_part, user_prompt ]) return response.text # Convert PDF to images def convert_pdf_to_images(pdf_bytes): images = [] doc = fitz.open(stream=pdf_bytes, filetype="pdf") for page in doc: pix = page.get_pixmap(dpi=300) img = Image.open(io.BytesIO(pix.tobytes("png"))) images.append(img) return images # Streamlit UI st.set_page_config(page_title="Invoice Extractor", layout="centered") st.title("📄 Invoice Table Extractor using Gemini AI") uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"]) if uploaded_pdf: with st.spinner("Converting PDF to images..."): images = convert_pdf_to_images(uploaded_pdf.read()) st.image(images[0], caption="Page 1 of PDF", use_container_width=True) if st.button("Extract Table from Invoice"): with st.spinner("Extracting data with Gemini..."): try: model = genai.GenerativeModel('gemini-1.5-flash') system_instruction = "You are an AI specialized in extracting structured data from invoices." user_prompt = """ Extract the invoice table from the uploaded invoice document. The table should include the following columns: - CODE ARTICLE - DESIGNATION - QTE COMMANDÉE - QTE LIVRÉE - PRIX UNIT. REF - PRIX UNIT. HT - PRIX UNIT. TTC - TOTAL HT - TVA % Also, extract and attach the following metadata fields to each row: - N° CLIENT - NOM CLIENT - N° FACTURE - DATE FACTURE - DATE DE CDE - Supplier/Company Name After extraction: - Create a clean pandas DataFrame containing all the above fields. - Drop any rows where CODE ARTICLE is empty or missing. - Return the data in JSON dictionary format. """ image_part = image_to_gemini_format(images[0]) response_text = get_response(model, image_part, user_prompt, system_instruction) # Extract JSON from Gemini response json_match = re.search(r"\[\s*{.*?}\s*]", response_text, re.DOTALL) if json_match: clean_json = json_match.group() data = json.loads(clean_json) df = pd.DataFrame(data) # Clean data df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")] if df.empty: st.warning("No valid rows with CODE ARTICLE found.") else: st.success("✅ Gemini responded!") st.dataframe(df) # Create Excel file in memory output = io.BytesIO() with pd.ExcelWriter(output, engine="xlsxwriter") as writer: df.to_excel(writer, index=False, sheet_name="Invoice Data") output.seek(0) # Download button st.download_button( label="📥 Download Excel", data=output, file_name="invoice_extracted.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) else: st.error("❌ Could not find valid JSON in Gemini's response.") except Exception as e: st.error("⚠️ Failed to extract or parse data.") st.exception(e)