kryman27 committed on
Commit
cbec0a2
verified
1 Parent(s): b1d3718

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -4
app.py CHANGED
@@ -6,7 +6,7 @@ from transformers import LayoutLMForTokenClassification, AutoTokenizer
6
  # Wczytanie modelu LayoutLMv3
7
  model_name = "kryman27/layoutlmv3-finetuned"
8
  model = LayoutLMForTokenClassification.from_pretrained(model_name)
9
- tokenizer = AutoTokenizer.from_pretrained(model_name) # Poprawiona linia
10
 
11
  # Reguły do wykrywania NIP, kwot, dat
12
  nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
@@ -18,8 +18,9 @@ def extract_invoice_data(pdf_file):
18
  with pdfplumber.open(pdf_file) as pdf:
19
  full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
20
 
21
- # Tokenizacja danych z uwzględnieniem układu dokumentu
22
- tokens = tokenizer(full_text, return_tensors="pt", truncation=True)
 
23
 
24
  # Predykcja modelu
25
  outputs = model(**tokens)
@@ -27,7 +28,7 @@ def extract_invoice_data(pdf_file):
27
 
28
  # Przetwarzanie wyników
29
  entities = []
30
- for token, pred in zip(tokens.tokens(), predictions):
31
  if pred > 0: # Pomijamy tło
32
  entities.append((token, model.config.id2label[pred]))
33
 
 
6
  # Wczytanie modelu LayoutLMv3
7
  model_name = "kryman27/layoutlmv3-finetuned"
8
  model = LayoutLMForTokenClassification.from_pretrained(model_name)
9
+ tokenizer = AutoTokenizer.from_pretrained(model_name) # Poprawiona wersja
10
 
11
  # Reguły do wykrywania NIP, kwot, dat
12
  nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
 
18
  with pdfplumber.open(pdf_file) as pdf:
19
  full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
20
 
21
+ # Podział tekstu na listę słów (LayoutLMv3 wymaga tokenizacji na poziomie słów)
22
+ words = full_text.split() # Nowa poprawiona linia
23
+ tokens = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True) # Poprawiona linia
24
 
25
  # Predykcja modelu
26
  outputs = model(**tokens)
 
28
 
29
  # Przetwarzanie wyników
30
  entities = []
31
+ for token, pred in zip(words, predictions): # Teraz iterujemy po `words`
32
  if pred > 0: # Pomijamy tło
33
  entities.append((token, model.config.id2label[pred]))
34