Alexvatti committed on
Commit
c1f7cbc
·
verified ·
1 Parent(s): 2f1a090

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -53
app.py CHANGED
@@ -1,56 +1,47 @@
1
  import streamlit as st
2
- import fitz # PyMuPDF
3
- import pytesseract
4
  from PIL import Image
5
- import pandas as pd
6
- import re
7
- import io
8
-
9
- st.set_page_config(page_title="Invoice Extractor", layout="centered")
10
-
11
- st.title("🧾 PDF Invoice Data Extractor")
12
- st.write("Upload a PDF invoice and extract details like Invoice Number, Date, Total, and more.")
13
-
14
- uploaded_file = st.file_uploader("Upload your invoice PDF", type=["pdf"])
15
-
16
- # 📌 Replaces pdf2image with fitz
17
- def extract_text_from_pdf(pdf_file):
18
- text = ""
19
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
20
-
21
- for page in doc:
22
- pix = page.get_pixmap(dpi=300) # high-res rendering
23
- img = Image.open(io.BytesIO(pix.tobytes("png")))
24
- text += pytesseract.image_to_string(img)
25
-
26
- return text
27
-
28
- def parse_invoice_text(text):
29
- data = {}
30
- data['Invoice Number'] = re.search(r'(Invoice\s*Number|No\.?)[:\-]?\s*([A-Za-z0-9\-]+)', text, re.IGNORECASE)
31
- data['Date'] = re.search(r'(Date|Invoice Date)[:\-]?\s*([0-9]{2,4}[\/\-\.][0-9]{2}[\/\-\.][0-9]{2,4})', text)
32
- data['Total Amount'] = re.search(r'(Total\s*Amount|Amount\s*Due|Grand\s*Total)[:\-]?\s*[\$₹€]?\s*([0-9,]+\.\d{2})', text, re.IGNORECASE)
33
- data['Supplier Name'] = re.search(r'(From|Supplier|Billed\s*By)[:\-]?\s*(.*)', text)
34
-
35
- parsed_data = {
36
- "Invoice Number": data['Invoice Number'].group(2) if data['Invoice Number'] else "Not found",
37
- "Date": data['Date'].group(2) if data['Date'] else "Not found",
38
- "Total Amount": data['Total Amount'].group(2) if data['Total Amount'] else "Not found",
39
- "Supplier Name": data['Supplier Name'].group(2).split("\n")[0] if data['Supplier Name'] else "Not found"
40
- }
41
- return parsed_data
42
-
43
- if uploaded_file:
44
- with st.spinner("🔍 Extracting data from invoice..."):
45
- text = extract_text_from_pdf(uploaded_file)
46
- extracted_data = parse_invoice_text(text)
47
-
48
- st.success("✅ Extraction Complete!")
49
- st.subheader("Extracted Information:")
50
- st.write(pd.DataFrame([extracted_data]))
51
-
52
- # Option to download Excel
53
- df = pd.DataFrame([extracted_data])
54
- csv = df.to_csv(index=False)
55
- st.download_button("📥 Download as CSV", csv, "invoice_data.csv", "text/csv")
56
 
 
1
  import streamlit as st
 
 
2
  from PIL import Image
3
+ from transformers import DonutProcessor, VisionEncoderDecoderModel
4
+ import torch
5
+
6
+ # Load Donut model and processor
7
+ @st.cache_resource
8
+ def load_model():
9
+ processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
10
+ model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
11
+ return processor, model
12
+
13
+ processor, model = load_model()
14
+
15
+ st.title("🧾 Invoice Table Extractor - Hugging Face Donut")
16
+ st.write("Upload an invoice image to extract the table (code article, designation, quantity, unit prices, totals).")
17
+
18
+ uploaded_file = st.file_uploader("Choose an image", type=["png", "jpg", "jpeg"])
19
+
20
+ if uploaded_file is not None:
21
+ image = Image.open(uploaded_file).convert("RGB")
22
+ st.image(image, caption="Uploaded Invoice", use_column_width=True)
23
+
24
+ with st.spinner("🔍 Analyzing..."):
25
+ # Preprocess image
26
+ pixel_values = processor(image, return_tensors="pt").pixel_values
27
+
28
+ # Prompt for table extraction
29
+ prompt = "<s_docvqa><question>Extract the invoice items table with code article, designation, quantity, unit prices, and totals.</question><answer>"
30
+ decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
31
+
32
+ # Generate prediction
33
+ outputs = model.generate(
34
+ pixel_values,
35
+ decoder_input_ids=decoder_input_ids,
36
+ max_length=512,
37
+ early_stopping=True
38
+ )
39
+
40
+ # Decode response
41
+ result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
42
+ result = result.replace("<s_docvqa><question>", "").replace("</question><answer>", "").strip()
43
+
44
+ st.subheader("📋 Extracted Table Info")
45
+ st.code(result)
46
+
 
 
 
 
 
 
 
47