kryman27 committed on
Commit
b82e672
verified
1 Parent(s): cbec0a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -15
app.py CHANGED
@@ -2,11 +2,12 @@ import gradio as gr
2
  import pdfplumber
3
  import re
4
  from transformers import LayoutLMForTokenClassification, AutoTokenizer
 
5
 
6
  # Wczytanie modelu LayoutLMv3
7
  model_name = "kryman27/layoutlmv3-finetuned"
8
  model = LayoutLMForTokenClassification.from_pretrained(model_name)
9
- tokenizer = AutoTokenizer.from_pretrained(model_name) # Poprawiona wersja
10
 
11
  # Reguły do wykrywania NIP, kwot, dat
12
  nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
@@ -16,37 +17,45 @@ payment_keywords = ["data płatności", "termin płatności", "zapłata", "płat
16
 
17
  def extract_invoice_data(pdf_file):
18
  with pdfplumber.open(pdf_file) as pdf:
19
- full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
20
 
21
- # Podział tekstu na listę słów (LayoutLMv3 wymaga tokenizacji na poziomie słów)
22
- words = full_text.split() # Nowa poprawiona linia
23
- tokens = tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True) # Poprawiona linia
 
 
 
 
 
 
24
 
25
  # Predykcja modelu
26
- outputs = model(**tokens)
 
27
  predictions = outputs.logits.argmax(-1).squeeze().tolist()
28
 
29
  # Przetwarzanie wyników
30
  entities = []
31
- for token, pred in zip(words, predictions): # Teraz iterujemy po `words`
32
  if pred > 0: # Pomijamy tło
33
  entities.append((token, model.config.id2label[pred]))
34
 
35
  # Wyszukiwanie kluczowych wartości
36
  seller_name = [token for token, label in entities if "ORG" in label]
37
- seller_nip = nip_pattern.search(full_text)
38
- kwoty = kwota_pattern.findall(full_text)
39
  kwoty = [float(k.replace(",", ".")) for k in kwoty if k.replace(",", ".").replace(".", "").isdigit()]
40
  total_amount = max(kwoty) if kwoty else None
41
 
42
  # Szukamy daty płatności
43
  payment_date = None
44
- for line in full_text.split("\n"):
45
- if any(keyword in line.lower() for keyword in payment_keywords):
46
- date_match = data_pattern.search(line)
47
- if date_match:
48
- payment_date = date_match.group()
49
- break
 
50
 
51
  return {
52
  "Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
 
2
  import pdfplumber
3
  import re
4
  from transformers import LayoutLMForTokenClassification, AutoTokenizer
5
+ import torch
6
 
7
  # Wczytanie modelu LayoutLMv3
8
  model_name = "kryman27/layoutlmv3-finetuned"
9
  model = LayoutLMForTokenClassification.from_pretrained(model_name)
10
+ tokenizer = AutoTokenizer.from_pretrained(model_name) # Automatyczne wykrycie tokenizatora
11
 
12
  # Reguły do wykrywania NIP, kwot, dat
13
  nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
 
17
 
18
  def extract_invoice_data(pdf_file):
19
  with pdfplumber.open(pdf_file) as pdf:
20
+ words, boxes = [], []
21
 
22
+ for page in pdf.pages:
23
+ extracted_words = page.extract_words()
24
+ for word in extracted_words:
25
+ words.append(word['text']) # Pobieramy tekst słowa
26
+ bbox = [word['x0'], word['top'], word['x1'], word['bottom']]
27
+ boxes.append(bbox) # Pobieramy bounding box (pozycję słowa na stronie)
28
+
29
+ # Tokenizacja tekstu + dodanie bounding boxes
30
+ tokens = tokenizer(words, boxes=boxes, is_split_into_words=True, return_tensors="pt", truncation=True)
31
 
32
  # Predykcja modelu
33
+ with torch.no_grad():
34
+ outputs = model(**tokens)
35
  predictions = outputs.logits.argmax(-1).squeeze().tolist()
36
 
37
  # Przetwarzanie wyników
38
  entities = []
39
+ for token, pred in zip(words, predictions):
40
  if pred > 0: # Pomijamy tło
41
  entities.append((token, model.config.id2label[pred]))
42
 
43
  # Wyszukiwanie kluczowych wartości
44
  seller_name = [token for token, label in entities if "ORG" in label]
45
+ seller_nip = nip_pattern.search(" ".join(words))
46
+ kwoty = kwota_pattern.findall(" ".join(words))
47
  kwoty = [float(k.replace(",", ".")) for k in kwoty if k.replace(",", ".").replace(".", "").isdigit()]
48
  total_amount = max(kwoty) if kwoty else None
49
 
50
  # Szukamy daty płatności
51
  payment_date = None
52
+ for i, word in enumerate(words):
53
+ if any(keyword in word.lower() for keyword in payment_keywords):
54
+ if i + 1 < len(words):
55
+ date_match = data_pattern.search(words[i + 1])
56
+ if date_match:
57
+ payment_date = date_match.group()
58
+ break
59
 
60
  return {
61
  "Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",