kryman27 committed on
Commit
3e4d13c
verified
1 Parent(s): 0f572b1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -47
app.py CHANGED
@@ -1,59 +1,22 @@
1
  import gradio as gr
2
  import pdfplumber
3
  import re
4
- from transformers import pipeline
5
 
6
- # Model do rozpoznawania nazw organizacji i wartości numerycznych
7
- extractor = pipeline("ner", model="kryman27/layoutlmv3-finetuned", aggregation_strategy="simple")
 
 
8
 
9
  # Reguły do wykrywania NIP, kwot, dat
10
- nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b') # Polski NIP (z "PL" lub bez)
11
- kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b') # Kwoty: np. 123.45 lub 123
12
- data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b') # Daty w formacie DD.MM.YYYY
13
  payment_keywords = ["data p艂atno艣ci", "termin p艂atno艣ci", "zap艂ata", "p艂atno艣膰"]
14
 
15
  def extract_invoice_data(pdf_file):
16
  with pdfplumber.open(pdf_file) as pdf:
17
  full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
18
 
19
- # Znalezienie nazw organizacji
20
- entities = extractor(full_text)
21
- seller_name = []
22
-
23
- for entity in entities:
24
- if "ORG" in entity["entity_group"]:
25
- seller_name.append(entity["word"])
26
-
27
- # Znajdujemy wartości numeryczne dla NIP, kwot, dat
28
- seller_nip = nip_pattern.search(full_text)
29
- kwoty = kwota_pattern.findall(full_text)
30
- kwoty = [float(k.replace(",", ".")) for k in kwoty if k.replace(",", ".").replace(".", "").isdigit()]
31
- total_amount = max(kwoty) if kwoty else None
32
-
33
- # Szukamy daty płatności na podstawie kontekstu
34
- payment_date = None
35
- for line in full_text.split("\n"):
36
- if any(keyword in line.lower() for keyword in payment_keywords):
37
- date_match = data_pattern.search(line)
38
- if date_match:
39
- payment_date = date_match.group()
40
- break
41
-
42
- return {
43
- "Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
44
- "NIP": seller_nip.group() if seller_nip else "Nie znaleziono",
45
- "Kwota ca艂kowita": total_amount if total_amount else "Nie znaleziono",
46
- "Data p艂atno艣ci": payment_date if payment_date else "Nie znaleziono"
47
- }
48
-
49
- # Interfejs użytkownika w Hugging Face Spaces
50
- iface = gr.Interface(
51
- fn=extract_invoice_data,
52
- inputs=gr.File(label="Wybierz plik PDF"),
53
- outputs="json",
54
- title="Ekstrakcja danych z faktury",
55
- description="Prze艣lij plik PDF, a model zwr贸ci dane sprzedawcy, NIP, kwot臋 i dat臋 p艂atno艣ci."
56
- )
57
-
58
- if __name__ == "__main__":
59
- iface.launch()
 
1
  import gradio as gr
2
  import pdfplumber
3
  import re
4
+ from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast
5
 
6
+ # Wczytanie modelu LayoutLMv3
7
+ model_name = "kryman27/layoutlmv3-finetuned"
8
+ model = LayoutLMForTokenClassification.from_pretrained(model_name)
9
+ tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)
10
 
11
  # Reguły do wykrywania NIP, kwot, dat
12
+ nip_pattern = re.compile(r'\bPL\s?\d{10}\b|\b\d{10}\b')
13
+ kwota_pattern = re.compile(r'\b\d+[\.,]?\d*\b')
14
+ data_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{4}\b')
15
  payment_keywords = ["data p艂atno艣ci", "termin p艂atno艣ci", "zap艂ata", "p艂atno艣膰"]
16
 
17
  def extract_invoice_data(pdf_file):
18
  with pdfplumber.open(pdf_file) as pdf:
19
  full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
20
 
21
+ # Tokenizacja danych z uwzględnieniem układu dokumentu
22
+ tokens = tokenizer(full_text, return_tensors="pt