Spaces:
Sleeping
Sleeping
import streamlit as st | |
import fitz # PyMuPDF | |
from PIL import Image | |
import io | |
import pandas as pd | |
import json | |
import re | |
import google.generativeai as genai | |
from dotenv import load_dotenv | |
import os | |
# Load environment variables | |
load_dotenv() | |
genai.configure(api_key=os.getenv("api_key")) # Secure API key loading | |
# Convert PIL Image to format Gemini accepts | |
def image_to_gemini_format(image): | |
img_byte_arr = io.BytesIO() | |
image.save(img_byte_arr, format="PNG") | |
return { | |
"mime_type": "image/png", | |
"data": img_byte_arr.getvalue() | |
} | |
# Generate content using Gemini | |
def get_response(model, image_part, user_prompt, system_instruction): | |
response = model.generate_content([ | |
system_instruction, | |
image_part, | |
user_prompt | |
]) | |
return response.text | |
# Convert PDF to images | |
def convert_pdf_to_images(pdf_bytes): | |
images = [] | |
doc = fitz.open(stream=pdf_bytes, filetype="pdf") | |
for page in doc: | |
pix = page.get_pixmap(dpi=300) | |
img = Image.open(io.BytesIO(pix.tobytes("png"))) | |
images.append(img) | |
return images | |
# Streamlit UI | |
st.set_page_config(page_title="Invoice Extractor", layout="centered") | |
st.title("📄 Invoice Table Extractor using Gemini AI") | |
uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"]) | |
if uploaded_pdf: | |
with st.spinner("Converting PDF to images..."): | |
images = convert_pdf_to_images(uploaded_pdf.read()) | |
st.image(images[0], caption="Page 1 of PDF", use_container_width=True) | |
if st.button("Extract Table from Invoice"): | |
with st.spinner("Extracting data with Gemini..."): | |
try: | |
model = genai.GenerativeModel('gemini-1.5-flash') | |
system_instruction = "You are an AI specialized in extracting structured data from invoices." | |
user_prompt = """ | |
Extract the invoice table from the uploaded invoice document. | |
The table should include the following columns: | |
- CODE ARTICLE | |
- DESIGNATION | |
- QTE COMMANDÉE | |
- QTE LIVRÉE | |
- PRIX UNIT. REF | |
- PRIX UNIT. HT | |
- PRIX UNIT. TTC | |
- TOTAL HT | |
- TVA % | |
Also, extract and attach the following metadata fields to each row: | |
- N° CLIENT | |
- NOM CLIENT | |
- N° FACTURE | |
- DATE FACTURE | |
- DATE DE CDE | |
- Supplier/Company Name | |
After extraction: | |
- Create a clean pandas DataFrame containing all the above fields. | |
- Drop any rows where CODE ARTICLE is empty or missing. | |
- Return the data in JSON dictionary format. | |
""" | |
image_part = image_to_gemini_format(images[0]) | |
response_text = get_response(model, image_part, user_prompt, system_instruction) | |
# Extract JSON from Gemini response | |
json_match = re.search(r"\[\s*{.*?}\s*]", response_text, re.DOTALL) | |
if json_match: | |
clean_json = json_match.group() | |
data = json.loads(clean_json) | |
df = pd.DataFrame(data) | |
# Clean data | |
df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")] | |
if df.empty: | |
st.warning("No valid rows with CODE ARTICLE found.") | |
else: | |
st.success("✅ Gemini responded!") | |
st.dataframe(df) | |
# Create Excel file in memory | |
output = io.BytesIO() | |
with pd.ExcelWriter(output, engine="xlsxwriter") as writer: | |
df.to_excel(writer, index=False, sheet_name="Invoice Data") | |
output.seek(0) | |
# Download button | |
st.download_button( | |
label="📥 Download Excel", | |
data=output, | |
file_name="invoice_extracted.xlsx", | |
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
) | |
else: | |
st.error("❌ Could not find valid JSON in Gemini's response.") | |
except Exception as e: | |
st.error("⚠️ Failed to extract or parse data.") | |
st.exception(e) | |