# Origin: Hugging Face Space file "files/functions.py" (author: pierreguillou)
# Commit a2cda1e - "Update files/functions.py" (5.41 kB)
import os
# Install PyTorch and detectron2 at start-up.
# PyTorch is pinned to 1.10.0 (torchvision 0.11, CUDA 11.1 wheels) and detectron2 is
# installed from its GitHub repository instead of a pre-built wheel.
# Background: this started as a workaround because detectron2 had not released packages
# for pytorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158);
# an earlier version of this script pinned torch 1.8.0+cu101 together with the
# cu101/torch1.8 detectron2 wheel index.
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
_install_commands = (
    'pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html',
    'pip install git+https://github.com/facebookresearch/detectron2.git',
)
for _install_command in _install_commands:
    # os.system returns the exit status of the command; report failures right away
    # instead of letting them surface later as an unrelated import error.
    _exit_status = os.system(_install_command)
    if _exit_status != 0:
        print(f"Warning: '{_install_command}' exited with status {_exit_status}")
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()
import gradio as gr
import re
import string
import torch
from operator import itemgetter
import collections
import pypdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError
import pypdfium2 as pdfium
import langdetect
from langdetect import detect_langs
import pandas as pd
import numpy as np
import random
import tempfile
import itertools
from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
import cv2
import pathlib
from pathlib import Path
import shutil
# Tesseract: print the OS release information and the Tesseract packages listed by apt.
# The output is only printed (start-up diagnostics); nothing below reads it back.
for _diagnostic_command in ('cat /etc/debian_version', 'cat /etc/issue', 'apt search tesseract'):
    # Close each pipe explicitly rather than leaving it to the garbage collector.
    with os.popen(_diagnostic_command) as _pipe:
        print(_pipe.read())
import pytesseract
## Key parameters
# categories colors (label name -> color name)
label2color = {
    'Caption': 'brown',
    'Footnote': 'orange',
    'Formula': 'gray',
    'List-item': 'yellow',
    'Page-footer': 'red',
    'Page-header': 'red',
    'Picture': 'violet',
    'Section-header': 'orange',
    'Table': 'green',
    'Text': 'blue',
    'Title': 'pink',
}

# bounding boxes start and end of a sequence
cls_box = [0, 0, 0, 0]
sep_box = [1000, 1000, 1000, 1000]

# model
model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"

# tokenizer
tokenizer_id = "xlm-roberta-base"

# (tokenization) The maximum length of a feature (sequence).
# It is derived from the model id, which must contain "384" or "512"
# (checked in that order, as before).
for _candidate_length in (384, 512):
    if str(_candidate_length) in model_id:
        max_length = _candidate_length
        break
else:
    # Fail here with a clear message instead of leaving max_length undefined,
    # which would only show up later as a NameError.
    raise ValueError(
        f"Error with max_length of chunks! Expected '384' or '512' in model_id, got {model_id!r}"
    )

# (tokenization) overlap
doc_stride = 128  # The authorized overlap between two parts of the context when splitting is needed.

# max PDF page images that will be displayed
max_imgboxes = 2
# get files: copy the data files of the Hugging Face Space into a local folder
examples_dir = 'files/'
Path(examples_dir).mkdir(parents=True, exist_ok=True)

from huggingface_hub import hf_hub_download

files = [
    "example.pdf",
    "blank.pdf",
    "blank.png",
    "languages_iso.csv",
    "languages_tesseract.csv",
    "wo_content.png",
]
for file_name in files:
    # each file is stored under "files/" in the Space repository
    filename_in_space = "files/" + file_name
    path_to_file = hf_hub_download(
        repo_id="pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2",
        filename=filename_in_space,
        repo_type="space",
    )
    shutil.copy(path_to_file, examples_dir)

# path to files: image without content, blank PDF, blank image
image_wo_content, pdf_blank, image_blank = (
    examples_dir + local_name
    for local_name in ("wo_content.png", "blank.pdf", "blank.png")
)
## get langdetect2Tesseract dictionary
# Both CSV files are read for their "Language" (name) and "LangCode" columns.
# A Tesseract code is linked to a code of languages_iso.csv when the normalized
# language names of the two files are equal.
t = "files/languages_tesseract.csv"
l = "files/languages_iso.csv"
df_t = pd.read_csv(t)
df_l = pd.read_csv(l)

# Name normalization: lower-case, trim, drop ASCII punctuation.
# The translation table is built once instead of once per language name.
_punctuation_table = str.maketrans('', '', string.punctuation)
langs_t = [lang_t.lower().strip().translate(_punctuation_table) for lang_t in df_t["Language"].to_list()]
langs_l = [lang_l.lower().strip().translate(_punctuation_table) for lang_l in df_l["Language"].to_list()]
langscode_t = df_t["LangCode"].to_list()
langscode_l = df_l["LangCode"].to_list()

# The Tesseract entry "Chinese - Simplified" is looked up as plain "chinese".
_chinese_simplified = "Chinese - Simplified".lower().strip().translate(_punctuation_table)

Tesseract2langdetect = dict()
for lang_t, langcode_t in zip(langs_t, langscode_t):
    if lang_t == _chinese_simplified:
        lang_t = "chinese"
    try:
        # first row of languages_iso.csv carrying the same normalized name
        index = langs_l.index(lang_t)
    except ValueError:
        # this Tesseract language has no counterpart in languages_iso.csv: skip it
        continue
    Tesseract2langdetect[langcode_t] = langscode_l[index]

# Reverse mapping; when several Tesseract codes share one target code, the last one wins.
langdetect2Tesseract = {v: k for k, v in Tesseract2langdetect.items()}
## model / feature extractor / tokenizer
# Loads two token-classification models (LiLT and LayoutXLM) with their tokenizers,
# moves both models to the selected device and exposes their label mappings.
# get device: CUDA when available, otherwise CPU
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
## model LiLT
# NOTE(review): model_id_lilt is not defined in the visible part of this file (only
# model_id and tokenizer_id are) - confirm where it is set, otherwise the next lines
# raise NameError.
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt);
model_lilt.to(device);
## model LayoutXLM
# NOTE(review): model_id_layoutxlm is not defined in the visible part of this file either;
# presumably it should be the LayoutXLM checkpoint stored in model_id - confirm.
from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast,
model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm);
model_layoutxlm.to(device);
# feature extractor (created with apply_ocr=False)
from transformers import LayoutLMv2FeatureExtractor
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
# tokenizer
# NOTE(review): tokenizer_id_layoutxlm is not defined in the visible part of this file;
# presumably it corresponds to tokenizer_id ("xlm-roberta-base") - confirm.
from transformers import AutoTokenizer
tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)
# get labels: id <-> label mappings and number of labels, read from each model's config
id2label_lilt = model_lilt.config.id2label
label2id_lilt = model_lilt.config.label2id
num_labels_lilt = len(id2label_lilt)
id2label_layoutxlm = model_layoutxlm.config.id2label
label2id_layoutxlm = model_layoutxlm.config.label2id
num_labels_layoutxlm = len(id2label_layoutxlm)