Spaces:

YAMITEK
/

invoice_pdf_to_excel_using_gemini

Running

App Files Files Community

invoice_pdf_to_excel_using_gemini / app.py

YAMITEK

Update app.py

2c572d0 verified 13 days ago

raw

history blame contribute delete

3.78 kB

	import streamlit as st
	import fitz # PyMuPDF
	from PIL import Image
	import io
	import pandas as pd
	import json
	import google.generativeai as genai
	from dotenv import load_dotenv
	import os
	import requests

	# Load environment variables before anything reads them
	load_dotenv()

	# Configure the Google Generative AI client with the key stored under "apikey"
	genai.configure(api_key=os.environ.get("apikey"))

	def get_response(model, user_input, image, prompt):
	    """Ask the model for a text answer and return it.

	    The three content parts are sent to the model in one request, in the
	    order ``[user_input, image, prompt]``.

	    Args:
	        model: Object exposing ``generate_content``; the caller in this file
	            passes a ``genai.GenerativeModel``.
	        user_input: First content part sent to the model.
	        image: Second content part (the caller passes a PIL image).
	        prompt: Third content part sent to the model.

	    Returns:
	        The ``text`` attribute of the model response, or ``None`` when the
	        call failed. On failure the error is reported through ``st.error``.
	    """
	    try:
	        # This is where the actual interaction with the model happens
	        response = model.generate_content([user_input, image, prompt])
	        return response.text
	    except Exception as e:
	        # Catch broadly at this API boundary: the Gemini SDK reports failures
	        # through its own exception types, and ``response.text`` raises
	        # ValueError when the response has no usable text part. Catching only
	        # ``requests`` exceptions let those bypass the "report and return
	        # None" contract of this function.
	        st.error(f"⚠️ Error while calling the API: {e}")
	        return None

	def convert_pdf_to_images(pdf_bytes, dpi=300):
	    """Convert PDF to images using fitz (PyMuPDF).

	    Args:
	        pdf_bytes: Raw bytes of the PDF document.
	        dpi: Resolution used when rasterising each page. Defaults to 300,
	            the value this function previously hard-coded.

	    Returns:
	        A list with one PIL image per page, in page order. Each page is
	        rendered to PNG bytes and reopened with PIL from an in-memory buffer.
	    """
	    # The context manager closes the document once every page is rendered;
	    # previously the document was opened and never closed.
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        return [
	            Image.open(io.BytesIO(page.get_pixmap(dpi=dpi).tobytes("png")))
	            for page in doc
	        ]

	# Streamlit UI
	#
	# Flow: upload a PDF -> render its pages to images -> preview page 1 ->
	# on button click send page 1 plus the extraction prompt to Gemini ->
	# parse the JSON array in the reply into a DataFrame -> show it and offer
	# it as an Excel download.
	st.set_page_config(page_title="Invoice Extractor", layout="centered")
	st.title("📄 Invoice Table Extractor using Gemini AI")

	uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"])

	images = []
	if uploaded_pdf:
	    with st.spinner("Converting PDF to images..."):
	        images = convert_pdf_to_images(uploaded_pdf.read())

	    # Guard the images[0] accesses below against a document with no pages.
	    if not images:
	        st.error("⚠️ The uploaded PDF contains no pages.")

	if images:
	    st.image(images[0], caption="Page 1 of PDF", use_column_width=True)

	    if st.button("Extract Table from Invoice"):
	        with st.spinner("Extracting data with Gemini..."):
	            model = genai.GenerativeModel('gemini-1.5-flash')

	            prompt = """Extract the invoice table from the uploaded invoice document.
	The table should include the following columns:
	- CODE ARTICLE
	- DESIGNATION
	- QTE COMMANDÉE
	- QTE LIVRÉE
	- PRIX UNIT. REF
	- PRIX UNIT. HT
	- PRIX UNIT. TTC
	- TOTAL HT
	- TVA %
	Also, extract and attach the following metadata fields to each row:
	- N° CLIENT
	- NOM CLIENT
	- N° FACTURE
	- DATE FACTURE
	- DATE DE CDE
	- Supplier/Company Name
	After extraction:
	- Create a clean pandas DataFrame containing all the above fields.
	- Drop any rows where CODE ARTICLE is empty or missing.
	- Return the data in JSON dictionary format.
	"""

	            try:
	                # Only the first page is sent to the model.
	                # NOTE(review): the prompt is passed as both user_input and
	                # prompt, so the model receives it twice - confirm intended.
	                response_text = get_response(model, prompt, images[0], prompt)
	                if response_text:
	                    st.success("✅ Gemini responded!")

	                    # Parse JSON response: keep only the outermost [...] span
	                    # so any text around the array is ignored.
	                    start_index = response_text.find('[')
	                    end_index = response_text.rfind(']')
	                    if start_index == -1 or end_index < start_index:
	                        # Without this check a missing bracket produced a
	                        # nonsensical slice and an opaque JSON decode error.
	                        raise ValueError("No JSON array found in the model response.")
	                    clean_json = response_text[start_index:end_index + 1]
	                    data = json.loads(clean_json)
	                    df = pd.DataFrame(data)

	                    # Clean data by removing rows with empty CODE ARTICLE
	                    df = df[df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")]

	                    st.dataframe(df)

	                    # Allow the user to download the result as an Excel file.
	                    # Leaving the with-block finalises the workbook; the
	                    # explicit writer.save() call was redundant and fails on
	                    # pandas versions where ExcelWriter.save() was removed.
	                    output = io.BytesIO()
	                    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
	                        df.to_excel(writer, index=False, sheet_name="Invoice Data")
	                    st.download_button(
	                        label="📥 Download Excel",
	                        data=output.getvalue(),
	                        file_name="invoice_extracted.xlsx",
	                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	                    )

	            except Exception as e:
	                # Top-level UI boundary: show any parsing/export failure to the
	                # user together with the traceback.
	                st.error("⚠️ Failed to extract or parse data.")
	                st.exception(e)