# Hugging Face Space source (page status when captured: Runtime error)
import os
import subprocess
import sys


def _pip_install(*pip_args):
    """Run ``pip install`` with the interpreter executing this script.

    Args:
        *pip_args: arguments placed after ``pip install`` (requirement
            specifiers, flags, URLs), one list element each so no shell
            parsing is involved.

    Returns:
        The exit status of the pip process. A non-zero status is reported on
        stdout but does not raise, so start-up stays best-effort exactly as it
        was with ``os.system``; a missing package then fails at its import.
    """
    command = [sys.executable, "-m", "pip", "install", *pip_args]
    exit_code = subprocess.run(command, check=False).returncode
    if exit_code != 0:
        print(f"WARNING: 'pip install {' '.join(pip_args)}' exited with status {exit_code}")
    return exit_code


# workaround: pin an older PyTorch because detectron2 did not publish prebuilt
# packages for newer releases
# (issue: https://github.com/facebookresearch/detectron2/issues/3158)
_pip_install(
    "-q",
    "torch==1.10.0+cu111",
    "torchvision==0.11+cu111",
    "-f",
    "https://download.pytorch.org/whl/torch_stable.html",
)
# install detectron2 from source so it is built against the torch pinned above
# See https://detectron2.readthedocs.io/tutorials/install.html for instructions
_pip_install("git+https://github.com/facebookresearch/detectron2.git")

# These imports must stay below the installs: the packages may not exist before.
import detectron2
from detectron2.utils.logger import setup_logger

setup_logger()
import gradio as gr | |
import re | |
import string | |
import torch | |
from operator import itemgetter | |
import collections | |
import pypdf | |
from pypdf import PdfReader | |
from pypdf.errors import PdfReadError | |
import pypdfium2 as pdfium | |
import langdetect | |
from langdetect import detect_langs | |
import pandas as pd | |
import numpy as np | |
import random | |
import tempfile | |
import itertools | |
from matplotlib import font_manager | |
from PIL import Image, ImageDraw, ImageFont | |
import cv2 | |
import pathlib | |
from pathlib import Path | |
import shutil | |
# Tesseract
# Print the OS release files and the result of `apt search tesseract` to the
# log as start-up diagnostics.
for _diagnostic_command in (
    'cat /etc/debian_version',
    'cat /etc/issue',
    'apt search tesseract',
):
    # The context manager closes the pipe (and reaps the child process) as
    # soon as its output has been read.
    with os.popen(_diagnostic_command) as _pipe:
        print(_pipe.read())
import pytesseract | |
## Key parameters
# categories colors: maps each layout category label to a color name
label2color = {
    'Caption': 'brown',
    'Footnote': 'orange',
    'Formula': 'gray',
    'List-item': 'yellow',
    'Page-footer': 'red',
    'Page-header': 'red',
    'Picture': 'violet',
    'Section-header': 'orange',
    'Table': 'green',
    'Text': 'blue',
    'Title': 'pink',
}
# bounding boxes used for the start (cls) and end (sep) of a sequence
cls_box = [0, 0, 0, 0]
sep_box = [1000, 1000, 1000, 1000]
# model
model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
# tokenizer
tokenizer_id = "xlm-roberta-base"
# (tokenization) The maximum length of a feature (sequence).
# It is read from the model id: the first of 384 / 512 that appears in the id
# wins (384 is checked first, as before).
for _candidate_length in (384, 512):
    if str(_candidate_length) in model_id:
        max_length = _candidate_length
        break
else:
    # Fail here with a clear message instead of leaving max_length undefined,
    # which would only surface later as an unrelated NameError.
    raise ValueError(
        "Error with max_length of chunks! "
        f"Neither 384 nor 512 appears in model_id={model_id!r}."
    )
# (tokenization) overlap
doc_stride = 128  # The authorized overlap between two parts of the context when splitting it is needed.
# max PDF page images that will be displayed
max_imgboxes = 2
# get files
# Local directory that receives a copy of every file fetched from the Hub.
examples_dir = 'files/'
Path(examples_dir).mkdir(parents=True, exist_ok=True)
from huggingface_hub import hf_hub_download
files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
for file_name in files:
    # hf_hub_download returns the path of the downloaded file; copy it into
    # examples_dir so the rest of the script can use short relative paths.
    path_to_file = hf_hub_download(
        repo_id="pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2",
        filename="files/" + file_name,
        repo_type="space",
    )
    shutil.copy(path_to_file, examples_dir)
# path to files
image_wo_content = examples_dir + "wo_content.png"  # image without content
pdf_blank = examples_dir + "blank.pdf"  # blank PDF
image_blank = examples_dir + "blank.png"  # blank image
## get langdetect2Tesseract dictionary
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_REMOVER = str.maketrans('', '', string.punctuation)


def _normalize_language_name(name):
    """Return *name* lower-cased, stripped of surrounding whitespace and punctuation-free."""
    return name.lower().strip().translate(_PUNCTUATION_REMOVER)


# Both CSV files live in examples_dir and are read through their "Language"
# and "LangCode" columns.
t = examples_dir + "languages_tesseract.csv"
l = examples_dir + "languages_iso.csv"
df_t = pd.read_csv(t)
df_l = pd.read_csv(l)
langs_t = [_normalize_language_name(lang_t) for lang_t in df_t["Language"].to_list()]
langs_l = [_normalize_language_name(lang_l) for lang_l in df_l["Language"].to_list()]
langscode_t = df_t["LangCode"].to_list()
langscode_l = df_l["LangCode"].to_list()

# The Tesseract list names this entry "Chinese - Simplified"; it is looked up
# in the ISO list under the plain name "chinese".
_chinese_simplified = _normalize_language_name("Chinese - Simplified")

# Tesseract code -> ISO code, matched on the normalized language name.
Tesseract2langdetect = {}
for lang_t, langcode_t in zip(langs_t, langscode_t):
    if lang_t == _chinese_simplified:
        lang_t = "chinese"
    try:
        # list.index returns the first match and raises ValueError when the
        # Tesseract language has no counterpart in the ISO list.
        index = langs_l.index(lang_t)
    except ValueError:
        continue
    Tesseract2langdetect[langcode_t] = langscode_l[index]

# Reverse mapping; when several Tesseract codes share one ISO code, the last
# one inserted wins.
langdetect2Tesseract = {v: k for k, v in Tesseract2langdetect.items()}
## model / feature extractor / tokenizer
# get device
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint ids used below. None of these three names was defined anywhere
# earlier in the file, so the first from_pretrained call raised NameError.
# The LayoutXLM ids reuse model_id / tokenizer_id, which already hold the
# LayoutXLM checkpoint and its tokenizer.
model_id_layoutxlm = model_id
tokenizer_id_layoutxlm = tokenizer_id
# NOTE(review): the LiLT id cannot be derived from this file; this is the
# author's LiLT checkpoint matching the LayoutXLM one (paragraph level, ml512).
# Confirm it on the Hub before deploying.
model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"

## model LiLT
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt)
model_lilt.to(device)

## model LayoutXLM
from transformers import LayoutLMv2ForTokenClassification  # LayoutXLMTokenizerFast,
model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm)
model_layoutxlm.to(device)
# feature extractor (apply_ocr=False: the extractor does not run OCR itself)
from transformers import LayoutLMv2FeatureExtractor
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
# tokenizer
tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)
# get labels
# Label maps are taken from each loaded model's config; the label count is
# the size of the id -> label map.
id2label_lilt = model_lilt.config.id2label
label2id_lilt = model_lilt.config.label2id
num_labels_lilt = len(id2label_lilt)
id2label_layoutxlm = model_layoutxlm.config.id2label
label2id_layoutxlm = model_layoutxlm.config.label2id
num_labels_layoutxlm = len(id2label_layoutxlm)