"""Streamlit app that asks a Donut DocVQA model about an uploaded invoice image."""

import re

import streamlit as st
import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

MODEL_NAME = "naver-clova-ix/donut-base-finetuned-docvqa"

# Question sent to the model for every uploaded invoice.
QUESTION = (
    "Extract the invoice items table with code article, designation, "
    "quantity, unit prices, and totals."
)


@st.cache_resource
def load_model():
    """Load the Donut processor and model.

    Decorated with ``st.cache_resource`` so the weights are not reloaded on
    every Streamlit rerun.

    Returns:
        A ``(processor, model)`` tuple. The model is moved to CUDA when
        ``torch.cuda.is_available()`` is true, otherwise it stays on the CPU.
    """
    processor = DonutProcessor.from_pretrained(MODEL_NAME)
    model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    return processor, model


def _ask_donut(processor, model, image, question):
    """Run one DocVQA query against ``image`` and return the answer text.

    Args:
        processor: The ``DonutProcessor`` returned by ``load_model``.
        model: The ``VisionEncoderDecoderModel`` returned by ``load_model``.
        image: An RGB ``PIL.Image.Image``.
        question: Plain-text question to ask about the document.

    Returns:
        The parsed ``answer`` field when the generated sequence contains one;
        otherwise the generated sequence with its markup tags removed.
    """
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # The DocVQA checkpoint must be primed with its task prompt; a bare
    # sentence is not the format shown in the Transformers Donut docs.
    task_prompt = f"<s_docvqa><s_question>{question}</s_question><s_answer>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    tokenizer = processor.tokenizer
    outputs = model.generate(
        pixel_values.to(model.device),
        decoder_input_ids=decoder_input_ids.to(model.device),
        # Bound generation by the decoder's own position limit instead of a
        # hard-coded value that may exceed it.
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
        bad_words_ids=[[tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # Keep the markup tokens while decoding: token2json needs them to split
    # the sequence into question/answer fields.
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "")
    # Drop the leading task-start token (e.g. "<s_docvqa>").
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()

    parsed = processor.token2json(sequence)
    answer = parsed.get("answer") if isinstance(parsed, dict) else None
    if isinstance(answer, str) and answer.strip():
        return answer.strip()

    # No usable answer field: show the raw generation without markup rather
    # than an empty box.
    return re.sub(r"<[^>]*>", " ", sequence).strip()


processor, model = load_model()

# Emoji are written as escapes so a wrong file encoding cannot garble them.
st.title("\U0001F9FE Invoice Table Extractor - Hugging Face Donut")
st.write(
    "Upload an invoice image to extract the table "
    "(code article, designation, quantity, unit prices, totals)."
)

uploaded_file = st.file_uploader("Choose an image", type=["png", "jpg", "jpeg"])

if uploaded_file is not None:
    try:
        image = Image.open(uploaded_file).convert("RGB")
    except OSError:
        # PIL raises OSError (including UnidentifiedImageError) for files it
        # cannot decode; report it instead of crashing with a traceback.
        st.error("The uploaded file could not be read as an image.")
        st.stop()

    st.image(image, caption="Uploaded Invoice", use_column_width=True)

    with st.spinner("\U0001F50D Analyzing..."):
        result = _ask_donut(processor, model, image, QUESTION)

    st.subheader("\U0001F4CB Extracted Table Info")
    st.code(result)