Spaces:

kryman27
/

pdf-extractor

Running

kryman27 committed on Feb 6

Commit

cbec0a2

verified ·

1 Parent(s): b1d3718

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from transformers import LayoutLMForTokenClassification, AutoTokenizer
 # Wczytanie modelu LayoutLMv3
 model_name = "kryman27/layoutlmv3-finetuned"
 model = LayoutLMForTokenClassification.from_pretrained(model_name)
-tokenizer = AutoTokenizer.from_pretrained(model_name)  # Poprawiona linia
 # Reguły do wykrywania NIP, kwot, dat
 nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
@@ -18,8 +18,9 @@ def extract_invoice_data(pdf_file):
     with pdfplumber.open(pdf_file) as pdf:
         full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
-    # Tokenizacja danych z uwzględnieniem układu dokumentu
-    tokens = tokenizer(full_text, return_tensors="pt", truncation=True)
     # Predykcja modelu
     outputs = model(**tokens)
@@ -27,7 +28,7 @@ def extract_invoice_data(pdf_file):
     # Przetwarzanie wyników
     entities = []
-    for token, pred in zip(tokens.tokens(), predictions):
         if pred > 0:  # Pomijamy tło
             entities.append((token, model.config.id2label[pred]))

 # Wczytanie modelu LayoutLMv3
 model_name = "kryman27/layoutlmv3-finetuned"
 model = LayoutLMForTokenClassification.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)  # Poprawiona wersja
 # Reguły do wykrywania NIP, kwot, dat
 nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
     with pdfplumber.open(pdf_file) as pdf:
         full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
+    # Podział tekstu na listę słów (LayoutLMv3 wymaga tokenizacji na poziomie słów)
+    words = full_text.split()  # Nowa poprawiona linia
+    tokens = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True)  # Poprawiona linia
     # Predykcja modelu
     outputs = model(**tokens)
     # Przetwarzanie wyników
     entities = []
+    for token, pred in zip(words, predictions):  # Teraz iterujemy po `words`
         if pred > 0:  # Pomijamy tło
             entities.append((token, model.config.id2label[pred]))