|
import streamlit as st |
|
import fitz |
|
from PIL import Image |
|
import io |
|
import pandas as pd |
|
import json |
|
import google.generativeai as genai |
|
from dotenv import load_dotenv |
|
import os |
|
import requests |
|
|
|
|
|
load_dotenv() |
|
|
|
|
|
genai.configure(api_key=os.getenv("apikey")) |
|
|
|
def get_response(model, user_input, image, prompt): |
|
"""Generate response from the model using input and image data.""" |
|
try: |
|
|
|
response = model.generate_content([user_input, image, prompt]) |
|
return response.text |
|
except requests.exceptions.RequestException as e: |
|
st.error(f"⚠️ Error while calling the API: {e}") |
|
return None |
|
|
|
def convert_pdf_to_images(pdf_bytes): |
|
"""Convert PDF to images using fitz (PyMuPDF).""" |
|
images = [] |
|
doc = fitz.open(stream=pdf_bytes, filetype="pdf") |
|
for page in doc: |
|
pix = page.get_pixmap(dpi=300) |
|
img = Image.open(io.BytesIO(pix.tobytes("png"))) |
|
images.append(img) |
|
return images |
|
|
|
|
|
st.set_page_config(page_title="Invoice Extractor", layout="centered") |
|
st.title("📄 Invoice Table Extractor using Gemini AI") |
|
|
|
uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"]) |
|
|
|
if uploaded_pdf: |
|
with st.spinner("Converting PDF to images..."): |
|
images = convert_pdf_to_images(uploaded_pdf.read()) |
|
|
|
st.image(images[0], caption="Page 1 of PDF", use_column_width=True) |
|
|
|
if st.button("Extract Table from Invoice"): |
|
with st.spinner("Extracting data with Gemini..."): |
|
model = genai.GenerativeModel('gemini-1.5-flash') |
|
|
|
prompt = """Extract the invoice table from the uploaded invoice document. |
|
The table should include the following columns: |
|
- CODE ARTICLE |
|
- DESIGNATION |
|
- QTE COMMANDÉE |
|
- QTE LIVRÉE |
|
- PRIX UNIT. REF |
|
- PRIX UNIT. HT |
|
- PRIX UNIT. TTC |
|
- TOTAL HT |
|
- TVA % |
|
Also, extract and attach the following metadata fields to each row: |
|
- N° CLIENT |
|
- NOM CLIENT |
|
- N° FACTURE |
|
- DATE FACTURE |
|
- DATE DE CDE |
|
- Supplier/Company Name |
|
After extraction: |
|
- Create a clean pandas DataFrame containing all the above fields. |
|
- Drop any rows where CODE ARTICLE is empty or missing. |
|
- Return the data in JSON dictionary format. |
|
""" |
|
|
|
try: |
|
response_text = get_response(model, prompt, images[0], prompt) |
|
if response_text: |
|
st.success("✅ Gemini responded!") |
|
|
|
|
|
start_index = response_text.find('[') |
|
end_index = response_text.rfind(']') + 1 |
|
clean_json = response_text[start_index:end_index] |
|
data = json.loads(clean_json) |
|
df = pd.DataFrame(data) |
|
|
|
|
|
df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")] |
|
|
|
st.dataframe(df) |
|
|
|
|
|
output = io.BytesIO() |
|
with pd.ExcelWriter(output, engine="xlsxwriter") as writer: |
|
df.to_excel(writer, index=False, sheet_name="Invoice Data") |
|
writer.save() |
|
st.download_button( |
|
label="📥 Download Excel", |
|
data=output.getvalue(), |
|
file_name="invoice_extracted.xlsx", |
|
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" |
|
) |
|
|
|
except Exception as e: |
|
st.error("⚠️ Failed to extract or parse data.") |
|
st.exception(e) |