Spaces:

Alexvatti
/

PDF-Invoce-Extract

Sleeping

App Files Files Community

Alexvatti committed on Apr 19

Commit

c1f7cbc

verified ·

1 Parent(s): 2f1a090

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -53

app.py CHANGED Viewed

@@ -1,56 +1,47 @@
 import streamlit as st
-import fitz  # PyMuPDF
-import pytesseract
 from PIL import Image
-import pandas as pd
-import re
-import io
-st.set_page_config(page_title="Invoice Extractor", layout="centered")
-st.title("🧾 PDF Invoice Data Extractor")
-st.write("Upload a PDF invoice and extract details like Invoice Number, Date, Total, and more.")
-uploaded_file = st.file_uploader("Upload your invoice PDF", type=["pdf"])
-# 📌 Replaces pdf2image with fitz
-def extract_text_from_pdf(pdf_file):
-    text = ""
-    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
-    for page in doc:
-        pix = page.get_pixmap(dpi=300)  # high-res rendering
-        img = Image.open(io.BytesIO(pix.tobytes("png")))
-        text += pytesseract.image_to_string(img)
-    return text
-def parse_invoice_text(text):
-    data = {}
-    data['Invoice Number'] = re.search(r'(Invoice\s*Number|No\.?)[:\-]?\s*([A-Za-z0-9\-]+)', text, re.IGNORECASE)
-    data['Date'] = re.search(r'(Date|Invoice Date)[:\-]?\s*([0-9]{2,4}[\/\-\.][0-9]{2}[\/\-\.][0-9]{2,4})', text)
-    data['Total Amount'] = re.search(r'(Total\s*Amount|Amount\s*Due|Grand\s*Total)[:\-]?\s*[\$₹€]?\s*([0-9,]+\.\d{2})', text, re.IGNORECASE)
-    data['Supplier Name'] = re.search(r'(From|Supplier|Billed\s*By)[:\-]?\s*(.*)', text)
-    parsed_data = {
-        "Invoice Number": data['Invoice Number'].group(2) if data['Invoice Number'] else "Not found",
-        "Date": data['Date'].group(2) if data['Date'] else "Not found",
-        "Total Amount": data['Total Amount'].group(2) if data['Total Amount'] else "Not found",
-        "Supplier Name": data['Supplier Name'].group(2).split("\n")[0] if data['Supplier Name'] else "Not found"
-    }
-    return parsed_data
-if uploaded_file:
-    with st.spinner("🔍 Extracting data from invoice..."):
-        text = extract_text_from_pdf(uploaded_file)
-        extracted_data = parse_invoice_text(text)
-        st.success("✅ Extraction Complete!")
-        st.subheader("Extracted Information:")
-        st.write(pd.DataFrame([extracted_data]))
-        # Option to download Excel
-        df = pd.DataFrame([extracted_data])
-        csv = df.to_csv(index=False)
-        st.download_button("📥 Download as CSV", csv, "invoice_data.csv", "text/csv")

 import streamlit as st
 from PIL import Image
+from transformers import DonutProcessor, VisionEncoderDecoderModel
+import torch
+# Load Donut model and processor
+@st.cache_resource
+def load_model():
+    processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+    model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
+    return processor, model
+processor, model = load_model()
+st.title("🧾 Invoice Table Extractor - Hugging Face Donut")
+st.write("Upload an invoice image to extract the table (code article, designation, quantity, unit prices, totals).")
+uploaded_file = st.file_uploader("Choose an image", type=["png", "jpg", "jpeg"])
+if uploaded_file is not None:
+    image = Image.open(uploaded_file).convert("RGB")
+    st.image(image, caption="Uploaded Invoice", use_column_width=True)
+    with st.spinner("🔍 Analyzing..."):
+        # Preprocess image
+        pixel_values = processor(image, return_tensors="pt").pixel_values
+        # Prompt for table extraction
+        prompt = "<s_docvqa><question>Extract the invoice items table with code article, designation, quantity, unit prices, and totals.</question><answer>"
+        decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
+        # Generate prediction
+        outputs = model.generate(
+            pixel_values,
+            decoder_input_ids=decoder_input_ids,
+            max_length=512,
+            early_stopping=True
+        )
+        # Decode response
+        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        result = result.replace("<s_docvqa><question>", "").replace("</question><answer>", "").strip()
+    st.subheader("📋 Extracted Table Info")
+    st.code(result)