Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import fitz # PyMuPDF
|
3 |
+
from PIL import Image
|
4 |
+
import io
|
5 |
+
import pandas as pd
|
6 |
+
import json
|
7 |
+
import google.generativeai as genai
|
8 |
+
from dotenv import load_dotenv
|
9 |
+
import os
|
10 |
+
import requests
|
11 |
+
|
12 |
+
# Load environment variables (e.g. from a local .env file) into os.environ
load_dotenv()

# Configure Google Generative AI API.
# The key is read from the GOOGLE_API_KEY environment variable rather than
# being embedded in the source, so the secret never ends up in version
# control. Any key that was previously committed should be revoked.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
17 |
+
|
18 |
+
def get_response(model, user_input, image, prompt):
    """Ask ``model`` to generate content from two text parts and an image.

    The payload is sent as the list ``[user_input, image, prompt]``, in that
    order, through ``model.generate_content``.

    Returns:
        The ``text`` attribute of the model's response, or ``None`` when a
        ``requests`` transport error occurred (the error is shown to the
        user via ``st.error``). Any other exception propagates to the caller.
    """
    parts = [user_input, image, prompt]
    try:
        # Both the call and the ``.text`` access stay inside the ``try`` so
        # that either of them is covered by the handler below.
        reply = model.generate_content(parts)
        answer = reply.text
    except requests.exceptions.RequestException as err:
        st.error(f"⚠️ Error while calling the API: {err}")
        return None
    return answer
|
27 |
+
|
28 |
+
def convert_pdf_to_images(pdf_bytes):
    """Convert PDF to images using fitz (PyMuPDF).

    Args:
        pdf_bytes: Raw bytes of a PDF document.

    Returns:
        A list with one PIL ``Image`` per page, in page order, each rendered
        at 300 DPI and encoded as PNG.
    """
    images = []
    # Open the document as a context manager so it is closed even if
    # rendering a page raises; previously the document was never closed.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=300)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            # Image.open is lazy; load the pixel data now so the returned
            # image is fully decoded before the document is closed.
            img.load()
            images.append(img)
    return images
|
37 |
+
|
38 |
+
# Streamlit UI

# Instructions sent to Gemini together with the rendered invoice page.
INVOICE_PROMPT = """Extract the invoice table from the uploaded invoice document.
The table should include the following columns:
- CODE ARTICLE
- DESIGNATION
- QTE COMMANDÉE
- QTE LIVRÉE
- PRIX UNIT. REF
- PRIX UNIT. HT
- PRIX UNIT. TTC
- TOTAL HT
- TVA %
Also, extract and attach the following metadata fields to each row:
- N° CLIENT
- NOM CLIENT
- N° FACTURE
- DATE FACTURE
- DATE DE CDE
- Supplier/Company Name
After extraction:
- Create a clean pandas DataFrame containing all the above fields.
- Drop any rows where CODE ARTICLE is empty or missing.
- Return the data in JSON dictionary format.
"""


def _extract_json_array(response_text):
    """Parse the JSON array embedded in ``response_text``.

    The span from the first ``[`` to the last ``]`` is parsed, so any text
    the model emits before or after the array is ignored.

    Raises:
        ValueError: if no ``[ ... ]`` span exists, or if the span is not
            valid JSON (``json.JSONDecodeError`` subclasses ``ValueError``).
    """
    start_index = response_text.find("[")
    end_index = response_text.rfind("]")
    # str.find/rfind return -1 when the bracket is missing; without this
    # guard the slice below would silently produce a meaningless substring.
    if start_index == -1 or end_index < start_index:
        raise ValueError("No JSON array found in the model response.")
    return json.loads(response_text[start_index:end_index + 1])


st.set_page_config(page_title="Invoice Extractor", layout="centered")
st.title("📄 Invoice Table Extractor using Gemini AI")

uploaded_pdf = st.file_uploader("Upload a PDF Invoice", type=["pdf"])

if uploaded_pdf:
    with st.spinner("Converting PDF to images..."):
        images = convert_pdf_to_images(uploaded_pdf.read())

    # Only the first page is previewed and sent to the model.
    st.image(images[0], caption="Page 1 of PDF", use_column_width=True)

    if st.button("Extract Table from Invoice"):
        with st.spinner("Extracting data with Gemini..."):
            model = genai.GenerativeModel('gemini-1.5-flash')

            try:
                # NOTE(review): the same prompt is passed as both
                # ``user_input`` and ``prompt``, so it is sent twice.
                response_text = get_response(
                    model, INVOICE_PROMPT, images[0], INVOICE_PROMPT
                )
                if response_text:
                    st.success("✅ Gemini responded!")

                    # Parse JSON response
                    data = _extract_json_array(response_text)
                    df = pd.DataFrame(data)

                    # Clean data by removing rows with empty CODE ARTICLE
                    has_code = df["CODE ARTICLE"].notna() & (df["CODE ARTICLE"] != "")
                    df = df[has_code]

                    st.dataframe(df)

                    # Allow the user to download the result as an Excel file
                    output = io.BytesIO()
                    # Leaving the ``with`` block saves and closes the
                    # workbook. The former explicit ``writer.save()`` call
                    # was redundant and fails on pandas 2.x, where that
                    # method no longer exists.
                    with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
                        df.to_excel(writer, index=False, sheet_name="Invoice Data")
                    st.download_button(
                        label="📥 Download Excel",
                        data=output.getvalue(),
                        file_name="invoice_extracted.xlsx",
                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    )

            except Exception as e:
                # Top-level UI boundary: show any failure (API, JSON parsing,
                # missing column, Excel export) instead of crashing the app.
                st.error("⚠️ Failed to extract or parse data.")
                st.exception(e)
|