Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,30 +3,50 @@ import pdfplumber
|
|
3 |
import re
|
4 |
from transformers import pipeline
|
5 |
|
6 |
-
# Model do
|
7 |
-
extractor = pipeline("
|
8 |
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
10 |
with pdfplumber.open(pdf_file) as pdf:
|
11 |
-
# Pobranie całego tekstu z PDF
|
12 |
full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
|
13 |
|
14 |
-
#
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
#
|
19 |
-
|
|
|
|
|
20 |
|
21 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# Interfejs użytkownika w Hugging Face Spaces
|
24 |
iface = gr.Interface(
|
25 |
-
fn=
|
26 |
inputs=gr.File(label="Wybierz plik PDF"),
|
27 |
outputs="json",
|
28 |
-
title="Ekstrakcja danych
|
29 |
-
description="Prześlij plik PDF, a model zwróci
|
30 |
)
|
31 |
|
32 |
if __name__ == "__main__":
|
|
|
3 |
import re
|
4 |
from transformers import pipeline
|
5 |
|
6 |
+
# NER pipeline used to recognise organisation names in the invoice text; the
# numeric fields (NIP, amounts, dates) are matched by the regexes defined below.
# NOTE(review): the checkpoint name indicates an English CoNLL-03 model --
# confirm it performs acceptably on Polish invoices.
|
7 |
+
extractor = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
|
8 |
|
9 |
+
# Patterns for the numeric fields of an invoice (NIP, amounts, dates).
# Polish tax identifier (NIP): exactly 10 consecutive digits.
nip_pattern = re.compile(r'\b\d{10}\b')
# Amounts such as 123.45, 123,45 or 123. The decimal separator is consumed only
# when digits follow it, so a number followed by punctuation ("7," in "7,szt")
# yields "7" rather than the unparsable "7,".
kwota_pattern = re.compile(r'\b\d+(?:[.,]\d+)?\b')
# Dates written as two digits, two digits and four digits separated by dots,
# e.g. 21.10.2024.
data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
|
13 |
+
|
14 |
+
def _amount_value(token):
    """Return the numeric value of an amount token, or None if it is not a number.

    float() rejects a decimal comma ("123,45"), so the comma is normalised to
    a dot before parsing.
    """
    try:
        return float(token.replace(",", "."))
    except ValueError:
        return None


def extract_invoice_data(pdf_file):
    """Extract the seller name, NIP, invoice date and total amount from a PDF.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts for the uploaded PDF.

    Returns:
        A dict with the keys "Sprzedawca", "NIP", "Data faktury" and
        "Kwota całkowita". Each value is the text found in the document, or
        "Nie znaleziono" when nothing matched.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract each page once; the original called extract_text() twice per
        # page (once to filter, once to join).
        page_texts = [page.extract_text() for page in pdf.pages]
    full_text = "\n".join(text for text in page_texts if text)

    # Skip the NER model entirely when the PDF has no text layer.
    entities = extractor(full_text) if full_text else []
    seller_name = [
        entity["word"] for entity in entities if "ORG" in entity["entity_group"]
    ]

    seller_nip = nip_pattern.search(full_text)
    invoice_date = data_pattern.search(full_text)

    # The amount pattern matches any number, so the NIP and the pieces of a
    # date would otherwise compete with (and usually beat) real amounts.
    # Blank out everything already recognised as a NIP or a date first.
    amount_text = data_pattern.sub(" ", nip_pattern.sub(" ", full_text))
    # NOTE(review): other long numbers (invoice or account numbers) can still
    # be picked as the "largest amount"; verify on real invoices.
    total_amount = None
    best_value = None
    for token in kwota_pattern.findall(amount_text):
        value = _amount_value(token)
        if value is None:
            continue
        # Strict ">" keeps the first of equal candidates, as max() does.
        if best_value is None or value > best_value:
            best_value, total_amount = value, token

    return {
        "Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
        "NIP": seller_nip.group() if seller_nip else "Nie znaleziono",
        "Data faktury": invoice_date.group() if invoice_date else "Nie znaleziono",
        "Kwota całkowita": total_amount if total_amount else "Nie znaleziono",
    }
|
42 |
|
43 |
# User interface for the Hugging Face Space: a single PDF upload whose
# extracted fields are returned as JSON.
|
44 |
iface = gr.Interface(
|
45 |
+
fn=extract_invoice_data,
|
46 |
inputs=gr.File(label="Wybierz plik PDF"),
|
47 |
outputs="json",
|
48 |
+
title="Ekstrakcja danych z faktury",
|
49 |
+
description="Prześlij plik PDF, a model zwróci dane sprzedawcy, NIP, kwotę i datę faktury."
|
50 |
)
|
51 |
|
52 |
if __name__ == "__main__":
|