import streamlit as st import fitz # PyMuPDF from PIL import Image import io import pandas as pd import json import google.generativeai as genai from dotenv import load_dotenv import os import requests # Load environment variables load_dotenv() # Configure Google Generative AI API genai.configure(api_key=os.getenv("apikey")) def get_response(model, user_input, image, prompt): """Generate response from the model using input and image data.""" try: # This is where the actual interaction with the model happens response = model.generate_content([user_input, image, prompt]) return response.text except requests.exceptions.RequestException as e: st.error(f"⚠️ Error while calling the API: {e}") return None def convert_pdf_to_images(pdf_bytes): """Convert PDF to images using fitz (PyMuPDF).""" images = [] doc = fitz.open(stream=pdf_bytes, filetype="pdf") for page in doc: pix = page.get_pixmap(dpi=300) img = Image.open(io.BytesIO(pix.tobytes("png"))) images.append(img) return images # Streamlit UI st.set_page_config(page_title="Invoice Extractor", layout="centered") st.title("📄 Invoice Table Extractor using Gemini AI") uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"]) if uploaded_pdf: with st.spinner("Converting PDF to images..."): images = convert_pdf_to_images(uploaded_pdf.read()) st.image(images[0], caption="Page 1 of PDF", use_column_width=True) if st.button("Extract Table from Invoice"): with st.spinner("Extracting data with Gemini..."): model = genai.GenerativeModel('gemini-1.5-flash') prompt = """Extract the invoice table from the uploaded invoice document. The table should include the following columns: - CODE ARTICLE - DESIGNATION - QTE COMMANDÉE - QTE LIVRÉE - PRIX UNIT. REF - PRIX UNIT. HT - PRIX UNIT. TTC - TOTAL HT - TVA % Also, extract and attach the following metadata fields to each row: - N° CLIENT - NOM CLIENT - N° FACTURE - DATE FACTURE - DATE DE CDE - Supplier/Company Name After extraction: - Create a clean pandas DataFrame containing all the above fields. - Drop any rows where CODE ARTICLE is empty or missing. - Return the data in JSON dictionary format. """ try: response_text = get_response(model, prompt, images[0], prompt) if response_text: st.success("✅ Gemini responded!") # Parse JSON response start_index = response_text.find('[') end_index = response_text.rfind(']') + 1 clean_json = response_text[start_index:end_index] data = json.loads(clean_json) df = pd.DataFrame(data) # Clean data by removing rows with empty CODE ARTICLE df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")] st.dataframe(df) # Allow the user to download the result as an Excel file output = io.BytesIO() with pd.ExcelWriter(output, engine="xlsxwriter") as writer: df.to_excel(writer, index=False, sheet_name="Invoice Data") writer.save() st.download_button( label="📥 Download Excel", data=output.getvalue(), file_name="invoice_extracted.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) except Exception as e: st.error("⚠️ Failed to extract or parse data.") st.exception(e)