# Environment bootstrap, imports and key constants for the document-layout
# inference app. Statement order matters here: the pip installs must run
# before the packages they provide are imported.
import os

# Workaround: pin a PyTorch build explicitly instead of relying on prebuilt
# detectron2 wheels (see https://github.com/facebookresearch/detectron2/issues/3158).
# Previously pinned versions, kept for reference:
# os.system('pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html')
os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html')

# Install detectron2 from source so that it is built against the PyTorch
# version installed above.
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions.
# Previous wheel-based install, kept for reference:
# os.system('pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html')
os.system('pip install git+https://github.com/facebookresearch/detectron2.git')

# These imports deliberately come after the installs above.
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import gradio as gr
import re
import string
import torch

from operator import itemgetter
import collections

import pypdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError

import pypdfium2 as pdfium
import langdetect
from langdetect import detect_langs

import pandas as pd
import numpy as np
import random
import tempfile
import itertools

from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
import cv2

import pathlib
from pathlib import Path
import shutil

# Tesseract: print OS / package information to the log before importing
# pytesseract. Each pipe is closed explicitly once its output has been read.
for _diagnostic_cmd in ('cat /etc/debian_version', 'cat /etc/issue', 'apt search tesseract'):
    with os.popen(_diagnostic_cmd) as _diagnostic_pipe:
        print(_diagnostic_pipe.read())

import pytesseract

## Key parameters

# categories colors
label2color = {
    'Caption': 'brown',
    'Footnote': 'orange',
    'Formula': 'gray',
    'List-item': 'yellow',
    'Page-footer': 'red',
    'Page-header': 'red',
    'Picture': 'violet',
    'Section-header': 'orange',
    'Table': 'green',
    'Text': 'blue',
    'Title': 'pink',
}

# bounding boxes start and end of a sequence
cls_box = [0, 0, 0, 0]
sep_box = [1000, 1000, 1000, 1000]

# model
model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
# tokenizer
tokenizer_id = "xlm-roberta-base"

# (tokenization) The maximum length of a feature (sequence), derived from the
# "ml384" / "ml512" marker in the model id.
# NOTE(review): when neither marker is present only a message is printed and
# `max_length` stays undefined — confirm whether this should be a hard error.
if "384" in model_id:
    max_length = 384
elif "512" in model_id:
    max_length = 512
else:
    print("Error with max_length of chunks!")

# (tokenization) overlap
doc_stride = 128  # The authorized overlap between two part of the context when splitting it is needed.

# max PDF page images that will be displayed
max_imgboxes = 2

# get files: download the example / helper files from the Space repository
# and copy them into the local examples directory.
examples_dir = 'files/'
Path(examples_dir).mkdir(parents=True, exist_ok=True)

from huggingface_hub import hf_hub_download

files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
for file_name in files:
    path_to_file = hf_hub_download(
        repo_id="pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2",
        filename="files/" + file_name,
        repo_type="space",
    )
    shutil.copy(path_to_file, examples_dir)

# path to files
image_wo_content = examples_dir + "wo_content.png"  # image without content
pdf_blank = examples_dir + "blank.pdf"  # blank PDF
image_blank = examples_dir + "blank.png"  # blank image

## get langdetect2Tesseract dictionary
t = "files/languages_tesseract.csv"
l = "files/languages_iso.csv"

df_t = pd.read_csv(t)
df_l = pd.read_csv(l)

# Translation table that strips ASCII punctuation; built once and reused for
# every language name.
_punctuation_table = str.maketrans('', '', string.punctuation)


def _normalize_language_name(name):
    """Return *name* lower-cased, stripped and without punctuation."""
    return name.lower().strip().translate(_punctuation_table)


langs_t = df_t["Language"].to_list()
langs_t = [_normalize_language_name(lang_t) for lang_t in langs_t]
langs_l = df_l["Language"].to_list()
langs_l = [_normalize_language_name(lang_l) for lang_l in langs_l]
langscode_t = df_t["LangCode"].to_list()
langscode_l = df_l["LangCode"].to_list()

# The Tesseract entry "Chinese - Simplified" is looked up as plain "chinese"
# in the ISO language list.
_chinese_simplified = _normalize_language_name("Chinese - Simplified")

Tesseract2langdetect = {}
for lang_t, langcode_t in zip(langs_t, langscode_t):
    if lang_t == _chinese_simplified:
        lang_t = "chinese"
    try:
        index = langs_l.index(lang_t)
    except ValueError:
        # Tesseract language with no counterpart in the ISO list: skip it.
        continue
    langcode_l = langscode_l[index]
    Tesseract2langdetect[langcode_t] = langcode_l

# Reverse mapping (ISO code -> Tesseract code).
langdetect2Tesseract = {v: k for k, v in Tesseract2langdetect.items()}

## model / feature extractor / tokenizer

# get device
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import transformers
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    LayoutLMv2FeatureExtractor,
    LayoutLMv2ForTokenClassification,  # LayoutXLMTokenizerFast,
)

# NOTE(review): `model_id_lilt`, `model_id_layoutxlm` and
# `tokenizer_id_layoutxlm` are used below, but no assignment to them is
# visible earlier in this file. The two LayoutXLM names are bound here to
# `model_id` / `tokenizer_id`, on the assumption that those hold the LayoutXLM
# checkpoint and its tokenizer — confirm. `model_id_lilt` must still be
# defined (LiLT checkpoint id) before the LiLT section can run.
model_id_layoutxlm = model_id
tokenizer_id_layoutxlm = tokenizer_id

## model LiLT
tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt)
model_lilt.to(device)

## model LayoutXLM
model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm)
model_layoutxlm.to(device)

# feature extractor (OCR is disabled via apply_ocr=False)
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

# tokenizer
tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)

# get labels
id2label_lilt = model_lilt.config.id2label
label2id_lilt = model_lilt.config.label2id
num_labels_lilt = len(id2label_lilt)

id2label_layoutxlm = model_layoutxlm.config.id2label
label2id_layoutxlm = model_layoutxlm.config.label2id
num_labels_layoutxlm = len(id2label_layoutxlm)