Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -11,13 +11,14 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) # Automatyczne wykrycie t
|
|
11 |
|
12 |
# Reguły do wykrywania NIP, kwot, dat
|
13 |
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
|
14 |
-
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
|
15 |
-
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
|
16 |
-
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]
|
|
|
17 |
|
18 |
def extract_invoice_data(pdf_file):
|
19 |
with pdfplumber.open(pdf_file) as pdf:
|
20 |
-
words, boxes = [], []
|
21 |
|
22 |
for page in pdf.pages:
|
23 |
extracted_words = page.extract_words()
|
@@ -26,8 +27,14 @@ def extract_invoice_data(pdf_file):
|
|
26 |
bbox = [int(word['x0']), int(word['top']), int(word['x1']), int(word['bottom'])] # Zaokrąglamy wartości
|
27 |
boxes.append(bbox) # Pobieramy bounding box (pozycję słowa na stronie)
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
# Tokenizacja tekstu + dodanie bounding boxes
|
30 |
-
encoding = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt", truncation=True)
|
31 |
|
32 |
# Predykcja modelu
|
33 |
with torch.no_grad():
|
@@ -40,22 +47,32 @@ def extract_invoice_data(pdf_file):
|
|
40 |
if pred > 0: # Pomijamy tło
|
41 |
entities.append((token, model.config.id2label[pred]))
|
42 |
|
43 |
-
# Wyszukiwanie
|
44 |
seller_name = [token for token, label in entities if "ORG" in label]
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
-
#
|
51 |
payment_date = None
|
52 |
-
for
|
53 |
-
if any(keyword in
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
break
|
59 |
|
60 |
return {
|
61 |
"Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
|
|
|
11 |
|
12 |
# Reguły do wykrywania NIP, kwot, dat
|
13 |
# Polish tax ID (NIP): ten digits, optionally prefixed with "PL".
# IGNORECASE is needed because the pattern is searched in lower-cased page
# text: there the prefix appears as "pl", and without the flag "pl1234567890"
# matches nothing (no word boundary exists between "l" and the first digit).
nip_pattern = re.compile(r'\b(?:PL\s?)?\d{10}\b', re.IGNORECASE)
|
14 |
+
# Monetary amount with an optional currency suffix.
# Two capture groups are deliberate: with only the currency captured,
# findall() returns just the currency text, so the caller's k[0] never sees
# the number (and fails on the empty string when no currency is present).
# Group 1 is the numeric part, group 2 the currency ('' when absent).
# IGNORECASE lets PLN/EUR/USD match in lower-cased text as well.
kwota_pattern = re.compile(
    r'\b(\d+[\.,]?\d*)\s?(PLN|zł|EUR|USD)?\b',
    re.IGNORECASE,
)
|
15 |
+
# Dates written as DD.MM.YYYY: two-digit day and month, four-digit year,
# separated by literal dots and delimited by word boundaries.
data_pattern = re.compile(
    r'\b\d{2}\.\d{2}\.\d{4}\b'
)
|
16 |
+
# Lower-case phrases that mark a line as carrying the payment date; they are
# tested as substrings of each lower-cased text line.
payment_keywords = [
    "data płatności",
    "termin płatności",
    "zapłata",
    "zapłacono",
    "płatność",
]
|
17 |
+
# Lower-case phrases that introduce the seller's name; used as a text-based
# fallback when the model yields no ORG entity.
seller_keywords = [
    "sprzedawca",
    "faktura wystawiona przez",
    "wystawca",
    "nazwa firmy",
]
|
18 |
|
19 |
def extract_invoice_data(pdf_file):
|
20 |
with pdfplumber.open(pdf_file) as pdf:
|
21 |
+
words, boxes, full_text = [], [], []
|
22 |
|
23 |
for page in pdf.pages:
|
24 |
extracted_words = page.extract_words()
|
|
|
27 |
bbox = [int(word['x0']), int(word['top']), int(word['x1']), int(word['bottom'])] # Zaokrąglamy wartości
|
28 |
boxes.append(bbox) # Pobieramy bounding box (pozycję słowa na stronie)
|
29 |
|
30 |
+
page_text = page.extract_text()
|
31 |
+
if page_text:
|
32 |
+
full_text.append(page_text.lower())
|
33 |
+
|
34 |
+
full_text = "\n".join(full_text) # Łączymy cały tekst dokumentu
|
35 |
+
|
36 |
# Tokenizacja tekstu + dodanie bounding boxes
|
37 |
+
encoding = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt", truncation=True)
|
38 |
|
39 |
# Predykcja modelu
|
40 |
with torch.no_grad():
|
|
|
47 |
if pred > 0: # Pomijamy tło
|
48 |
entities.append((token, model.config.id2label[pred]))
|
49 |
|
50 |
+
# 🏢 Wyszukiwanie nazwy sprzedawcy
|
51 |
seller_name = [token for token, label in entities if "ORG" in label]
|
52 |
+
|
53 |
+
# Jeśli model nie znalazł, szukamy w tekście
|
54 |
+
if not seller_name:
|
55 |
+
for line in full_text.split("\n"):
|
56 |
+
if any(keyword in line for keyword in seller_keywords):
|
57 |
+
seller_name = line.split(":")[-1].strip()
|
58 |
+
break
|
59 |
+
|
60 |
+
# 🔢 Wyszukiwanie NIP
|
61 |
+
seller_nip = nip_pattern.search(full_text)
|
62 |
+
|
63 |
+
# 💰 Wyszukiwanie kwoty całkowitej (największa kwota z walutą)
|
64 |
+
kwoty = kwota_pattern.findall(full_text)
|
65 |
+
kwoty = [k[0].replace(",", ".") for k in kwoty if k[0].replace(",", ".").replace(".", "").isdigit()]
|
66 |
+
total_amount = max(map(float, kwoty), default=None) if kwoty else None
|
67 |
|
68 |
+
# 📆 Wyszukiwanie daty płatności
|
69 |
payment_date = None
|
70 |
+
for line in full_text.split("\n"):
|
71 |
+
if any(keyword in line for keyword in payment_keywords):
|
72 |
+
date_match = data_pattern.search(line)
|
73 |
+
if date_match:
|
74 |
+
payment_date = date_match.group()
|
75 |
+
break
|
|
|
76 |
|
77 |
return {
|
78 |
"Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
|