Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,59 +1,22 @@
|
|
1 |
import gradio as gr
|
2 |
import pdfplumber
|
3 |
import re
|
4 |
-
from transformers import
|
5 |
|
6 |
-
#
|
7 |
-
|
|
|
|
|
8 |
|
9 |
# Reguły do wykrywania NIP, kwot, dat
|
10 |
-
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
|
11 |
-
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
|
12 |
-
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
|
13 |
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]
|
14 |
|
15 |
def extract_invoice_data(pdf_file):
|
16 |
with pdfplumber.open(pdf_file) as pdf:
|
17 |
full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
|
18 |
|
19 |
-
#
|
20 |
-
|
21 |
-
seller_name = []
|
22 |
-
|
23 |
-
for entity in entities:
|
24 |
-
if "ORG" in entity["entity_group"]:
|
25 |
-
seller_name.append(entity["word"])
|
26 |
-
|
27 |
-
# Znajdujemy wartości numeryczne dla NIP, kwot, dat
|
28 |
-
seller_nip = nip_pattern.search(full_text)
|
29 |
-
kwoty = kwota_pattern.findall(full_text)
|
30 |
-
kwoty = [float(k.replace(",", ".")) for k in kwoty if k.replace(",", ".").replace(".", "").isdigit()]
|
31 |
-
total_amount = max(kwoty) if kwoty else None
|
32 |
-
|
33 |
-
# Szukamy daty płatności na podstawie kontekstu
|
34 |
-
payment_date = None
|
35 |
-
for line in full_text.split("\n"):
|
36 |
-
if any(keyword in line.lower() for keyword in payment_keywords):
|
37 |
-
date_match = data_pattern.search(line)
|
38 |
-
if date_match:
|
39 |
-
payment_date = date_match.group()
|
40 |
-
break
|
41 |
-
|
42 |
-
return {
|
43 |
-
"Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
|
44 |
-
"NIP": seller_nip.group() if seller_nip else "Nie znaleziono",
|
45 |
-
"Kwota całkowita": total_amount if total_amount else "Nie znaleziono",
|
46 |
-
"Data płatności": payment_date if payment_date else "Nie znaleziono"
|
47 |
-
}
|
48 |
-
|
49 |
-
# Interfejs użytkownika w Hugging Face Spaces
|
50 |
-
iface = gr.Interface(
|
51 |
-
fn=extract_invoice_data,
|
52 |
-
inputs=gr.File(label="Wybierz plik PDF"),
|
53 |
-
outputs="json",
|
54 |
-
title="Ekstrakcja danych z faktury",
|
55 |
-
description="Prześlij plik PDF, a model zwróci dane sprzedawcy, NIP, kwotę i datę płatności."
|
56 |
-
)
|
57 |
-
|
58 |
-
if __name__ == "__main__":
|
59 |
-
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import pdfplumber
|
3 |
import re
|
4 |
+
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast
|
5 |
|
6 |
+
# Wczytanie modelu LayoutLMv3
|
7 |
+
model_name = "kryman27/layoutlmv3-finetuned"
|
8 |
+
model = LayoutLMForTokenClassification.from_pretrained(model_name)
|
9 |
+
tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)
|
10 |
|
11 |
# Reguły do wykrywania NIP, kwot, dat
|
12 |
+
nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
|
13 |
+
kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
|
14 |
+
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
|
15 |
payment_keywords = ["data płatności", "termin płatności", "zapłata", "płatność"]
|
16 |
|
17 |
def extract_invoice_data(pdf_file):
|
18 |
with pdfplumber.open(pdf_file) as pdf:
|
19 |
full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
|
20 |
|
21 |
+
# Tokenizacja danych z uwzględnieniem układu dokumentu
|
22 |
+
tokens = tokenizer(full_text, return_tensors="pt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|