Spaces: Runtime error
uakarsh committed · Commit c017f2e
1 Parent(s): 31dbc87
Add application file
Browse files:
- app.py +148 -0
- dataset.py +150 -0
- modeling.py +251 -0
- packages.txt +1 -0
- requirements.txt +16 -0
- utils.py +116 -0
app.py
ADDED
@@ -0,0 +1,148 @@
# Requirements.txt
from torch import cuda
from transformers import T5Tokenizer, T5ForConditionalGeneration
import gradio as gr
from utils import convert_ans_to_token, convert_ques_to_token, rotate, convert_token_to_ques, convert_token_to_answer
from modeling import LaTr_for_pretraining, LaTr_for_finetuning, LaTrForVQA
from dataset import load_json_file, get_specific_file, resize_align_bbox, get_tokens_with_boxes, create_features
import torch.nn as nn
from PIL import Image, ImageDraw
import pytesseract
from tqdm.auto import tqdm
import numpy as np
import json
import os
import torch
from torchvision import transforms


# install PyTesseract
os.system('pip install -q pytesseract')
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Default Library import
# Visualization libraries

# Specific libraries of LaTr

# Setting the hyperparameters as well as primary configurations

PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512
batch_size = 2
target_size = (500, 384)
t5_model = "t5-base"


device = 'cuda' if cuda.is_available() else 'cpu'


# Configuration for the model
config = {
    't5_model': 't5-base',
    'vocab_size': 32128,
    'hidden_state': 768,
    'max_2d_position_embeddings': 1001,
    'classes': 32128,  # number of tokens
    'seq_len': 512
}

tokenizer = T5Tokenizer.from_pretrained(t5_model)
latr = LaTrForVQA(config)
url = 'https://www.kaggleusercontent.com/kf/99663112/eyJhbGciOiJkaXIiLCJlbmMiOiJBMTI4Q0JDLUhTMjU2In0..2HGa6jqeAbugMJYxSkh7eA.XkaLSf8XlITet17Bscupegw9zWLw-IEizSy1lM-_PJF_Gfj-YuinOpDw4ad0M8r-s3WlnclQhHYrd2seaZVjBmkm5WSE6Dae1fW54dnNhyWF5w5O2VafNar7QSuUTSRzacJcmtqI1ypL3OZofwXuETbXq4weeqfDptFS5luxuV0P4Vaer_xEgfsdld6v8O5jjMXwb1CVmPCjMdZUE-HTgzTDiwv3Lb-P3dkRgU7q-iI5GeYZCODYGrX-koxya9DlfzKQZXmJmvtMj45vUZ8OSRB0_hTc7UosQanA-SalWznnOuyOgwl4hMag5toTomriWsxfvJIRBn9CYgFcvUJNqO_kDzBUoAwnagjcxXeEIJTJglwAl9Rs37XyfJAZr7yQ_YTXeRW1j2QMsT_M3qtS96IKRTpsqPVibl8Vrs9Q5g_vKccIQR9t7R9ma_DZLwjWYhDvDO06AZqtdaYGfWaOrbqe8dDvJkZoHsZEO8ukpIH6YNLyCO_dqgRsE77I9jqxiUqQh1KnuNv2hGRSlQR7u8OF7lpiRS7JEwj2MaxlzD58dyhOOLDqrbLp7XWrgV79EQcRYHFSMfhDvG0zmGvHjWGAg-LGhnYIc0NMVhyRv5Pfta9WYEl4qXxCTZWe4olgV79WHLqksQMVyTteheB36n4biHZKx4KZj7k-j3aSI72DIAvj7_UFeHxUTTZ1c6MB.7BF6J5MPMuhQFU48xVZ2qQ/models/epoch=0-step=34602.ckpt'


try:
    latr = latr.load_from_checkpoint(url)
    print("Checkpoint loaded successfully")
except:
    print("Checkpoint not loaded")
    pass


image = gr.inputs.Image(type="pil")
question = gr.inputs.Textbox(label="Question")
answer = gr.outputs.Textbox(label="Predicted answer")
examples = [["remote.jpg", "what number is the button near the top left?"]]


from transformers import ViTFeatureExtractor, ViTModel
vit_feat_extract = ViTFeatureExtractor("google/vit-base-patch16-224-in21k")

import torchvision
import numpy as np


def answer_question(image, question):

    # Extracting features from the image
    image.save("sample.png")
    img, boxes, tokenized_words = create_features("sample.png",
                                                  tokenizer=tokenizer,
                                                  target_size=target_size,
                                                  max_seq_length=max_seq_len,
                                                  use_ocr=True
                                                  )

    ## Converting the boxes as per the format required for model input
    boxes = torch.as_tensor(boxes, dtype=torch.int32)
    width = (boxes[:, 2] - boxes[:, 0]).view(-1, 1)
    height = (boxes[:, 3] - boxes[:, 1]).view(-1, 1)
    boxes = torch.cat([boxes, width, height], axis=-1)

    ## Clamping the values, as some of the box values are out of bound
    boxes[:, 0] = torch.clamp(boxes[:, 0], min=0, max=0)
    boxes[:, 2] = torch.clamp(boxes[:, 2], min=1000, max=1000)
    boxes[:, 4] = torch.clamp(boxes[:, 4], min=1000, max=1000)

    boxes[:, 1] = torch.clamp(boxes[:, 1], min=0, max=0)
    boxes[:, 3] = torch.clamp(boxes[:, 3], min=1000, max=1000)
    boxes[:, 5] = torch.clamp(boxes[:, 5], min=1000, max=1000)

    ## Tensor of tokenized words
    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
    img = np.array(img)
    img = torchvision.transforms.ToTensor()(img)
    question = convert_ques_to_token(question=question, tokenizer=tokenizer)

    ## Expanding the dimension for inference
    boxes = boxes.unsqueeze(0)
    tokenized_words = tokenized_words.unsqueeze(0)
    question = question.unsqueeze(0)

    # print("Shape of Image is:", img.shape)
    img = vit_feat_extract(img, return_tensors='pt')['pixel_values']
    if int(len(img.shape)) == 3:
        img = img.unsqueeze(0)

    encoding = {'img': img, 'boxes': boxes, 'tokenized_words': tokenized_words, 'question': question}

    with torch.no_grad():
        logits = latr.forward(encoding)
        logits = logits.squeeze(0)

    _, preds = torch.max(logits, dim=1)
    preds = preds.detach().cpu()
    mask = torch.clamp(preds, min=0, max=1)
    last_non_zero_argument = (mask != 0).nonzero()[1][-1]

    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)
    return predicted_ans


# Taken from here: https://huggingface.co/spaces/nielsr/vilt-vqa/blob/main/app.py
title = "Interactive demo: LaTr (Layout Aware Transformer) for VQA"
description = "Gradio Demo for LaTr (Layout Aware Transformer), trained on the TextVQA dataset. To use it, simply upload your image, type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.12494' target='_blank'>LaTr: Layout-aware transformer for scene-text VQA, a novel multimodal architecture for Scene Text Visual Question Answering (STVQA)</a> | <a href='https://github.com/uakarsh/latr' target='_blank'>Github Repo</a></p>"
examples = [['remote.png', "Is remote present in the picture?"]]

interface = gr.Interface(fn=answer_question,
                         inputs=[image, question],
                         outputs=answer,
                         examples=examples,
                         title=title,
                         description=description,
                         article=article,
                         enable_queue=True)
interface.launch(debug=True)
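For a quick sanity check outside the Gradio UI, the prediction function above can also be called directly. A minimal sketch, assuming the checkpoint loaded successfully and the example image remote.png is present in the working directory:

# Hypothetical local test of answer_question(); assumes remote.png exists
# locally and that Tesseract (see packages.txt) is installed.
from PIL import Image

test_image = Image.open("remote.png").convert("RGB")
print(answer_question(test_image, "Is remote present in the picture?"))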
dataset.py
ADDED
@@ -0,0 +1,150 @@
import os
import json
import numpy as np
import pytesseract
from PIL import Image, ImageDraw

PAD_TOKEN_BOX = [0, 0, 0, 0]
max_seq_len = 512

## Function: 1
## Purpose: Resize and align the bounding box for a resized image

def resize_align_bbox(bbox, orig_w, orig_h, target_w, target_h):
    x_scale = target_w / orig_w
    y_scale = target_h / orig_h
    orig_left, orig_top, orig_right, orig_bottom = bbox
    x = int(np.round(orig_left * x_scale))
    y = int(np.round(orig_top * y_scale))
    xmax = int(np.round(orig_right * x_scale))
    ymax = int(np.round(orig_bottom * y_scale))
    return [x, y, xmax, ymax]

## Function: 2
## Purpose: Read a JSON file from the given path and return it as a dictionary

def load_json_file(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

## Function: 3
## Purpose: Get the path of a file with a specific extension, e.g. .pdf, .tif, and so on

def get_specific_file(path, last_entry='tif'):
    base_path = path
    for i in os.listdir(path):
        if i.endswith(last_entry):
            return os.path.join(base_path, i)

    return '-1'


## Function: 4

def get_tokens_with_boxes(unnormalized_word_boxes, list_of_words, tokenizer, pad_token_id=0, pad_token_box=[0, 0, 0, 0], max_seq_len=512):

    '''
    This function returns two items:
    1. unnormalized_token_boxes -> a list of len = max_seq_len, containing the boxes corresponding to the tokenized words;
       one box might repeat as per the tokenization procedure
    2. tokenized_words -> tokenized words corresponding to the tokenizer and the list_of_words
    '''

    assert len(unnormalized_word_boxes) == len(list_of_words), "Bounding box length != total words length"

    length_of_box = len(unnormalized_word_boxes)
    unnormalized_token_boxes = []
    tokenized_words = []

    for box, word in zip(unnormalized_word_boxes, list_of_words):
        current_tokens = tokenizer(word, add_special_tokens=False).input_ids
        unnormalized_token_boxes.extend([box] * len(current_tokens))
        tokenized_words.extend(current_tokens)

    if len(unnormalized_token_boxes) < max_seq_len:
        unnormalized_token_boxes.extend([pad_token_box] * (max_seq_len - len(unnormalized_token_boxes)))

    if len(tokenized_words) < max_seq_len:
        tokenized_words.extend([pad_token_id] * (max_seq_len - len(tokenized_words)))

    return unnormalized_token_boxes[:max_seq_len], tokenized_words[:max_seq_len]

## Function: 5
## Helper, which is only used by the OCR function below

def get_topleft_bottomright_coordinates(df_row):
    left, top, width, height = df_row["left"], df_row["top"], df_row["width"], df_row["height"]
    return [left, top, left + width, top + height]

## Function: 6
## If the OCR is not provided, this function extracts it with pytesseract

def apply_ocr(tif_path):
    """
    Returns words and their bounding boxes from an image
    """
    img = Image.open(tif_path).convert("RGB")

    ocr_df = pytesseract.image_to_data(img, output_type="data.frame")
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    float_cols = ocr_df.select_dtypes("float").columns
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
    ocr_df = ocr_df.replace(r"^\s*$", np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    words = list(ocr_df.text.apply(lambda x: str(x).strip()))
    actual_bboxes = ocr_df.apply(get_topleft_bottomright_coordinates, axis=1).values.tolist()

    # add as extra columns
    assert len(words) == len(actual_bboxes)
    return {"words": words, "bbox": actual_bboxes}


## Function: 7
## Merging all the above functions, for the purpose of extracting the image, the bounding boxes and the tokens (sentence-wise)

def create_features(
    image_path,
    tokenizer,
    target_size=(1000, 1000),
    max_seq_length=512,
    use_ocr=False,
    bounding_box=None,
    words=None
):

    '''
    We assume that the bounding boxes provided are given at the image scale (i.e. not normalized), so we just need to rescale them by the resize ratio
    '''

    img = Image.open(image_path).convert("RGB")
    width_old, height_old = img.size
    img = img.resize(target_size)
    width, height = img.size

    ## Rescaling the bounding boxes as per the new image size

    if (use_ocr == False) and (bounding_box == None or words == None):
        raise Exception('Please provide the bounding box and words or pass the argument "use_ocr" = True')

    if use_ocr == True:
        entries = apply_ocr(image_path)
        bounding_box = entries["bbox"]
        words = entries["words"]

    bounding_box = list(map(lambda x: resize_align_bbox(x, width_old, height_old, width, height), bounding_box))
    boxes, tokenized_words = get_tokens_with_boxes(unnormalized_word_boxes=bounding_box,
                                                   list_of_words=words,
                                                   tokenizer=tokenizer,
                                                   pad_token_id=0,
                                                   pad_token_box=PAD_TOKEN_BOX,
                                                   max_seq_len=max_seq_length
                                                   )

    return img, boxes, tokenized_words
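As a reference for how these helpers fit together, here is a small usage sketch of create_features with use_ocr=True (the file name document.png is hypothetical); pytesseract supplies the words and boxes, which are then rescaled to target_size and tokenized:

# Usage sketch for create_features (hypothetical input "document.png");
# requires the Tesseract binary from packages.txt.
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-base")
img, boxes, tokenized_words = create_features(
    "document.png",
    tokenizer=tokenizer,
    target_size=(500, 384),  # same target size as in app.py
    max_seq_length=512,
    use_ocr=True,
)
print(img.size, len(boxes), len(tokenized_words))  # (500, 384), 512, 512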
modeling.py
ADDED
@@ -0,0 +1,251 @@
import torch.nn as nn
import torch
from transformers import T5ForConditionalGeneration, ViTModel

import pytorch_lightning as pl

# Defining the pytorch model


class LaTr_for_pretraining(nn.Module):
    def __init__(self, config, classify=False):

        super(LaTr_for_pretraining, self).__init__()
        self.vocab_size = config['vocab_size']

        model = T5ForConditionalGeneration.from_pretrained(config['t5_model'])
        # Removing the Embedding layer
        dummy_encoder = list(nn.Sequential(
            *list(model.encoder.children())[1:]).children())
        # Removing the Embedding Layer
        dummy_decoder = list(nn.Sequential(
            *list(model.decoder.children())[1:]).children())

        # Using the T5 Encoder

        self.list_encoder = nn.Sequential(*list(dummy_encoder[0]))
        self.residue_encoder = nn.Sequential(*list(dummy_encoder[1:]))
        self.list_decoder = nn.Sequential(*list(dummy_decoder[0]))
        self.residue_decoder = nn.Sequential(*list(dummy_decoder[1:]))

        # We use the embeddings of T5 for encoding the tokenized words
        self.language_emb = nn.Embedding.from_pretrained(model.shared.weight)

        self.top_left_x = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.bottom_right_x = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.top_left_y = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.bottom_right_y = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.width_emb = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])
        self.height_emb = nn.Embedding(
            config['max_2d_position_embeddings'], config['hidden_state'])

        self.classify = classify
        self.classification_layer = nn.Linear(
            config['hidden_state'], config['classes'])

    def forward(self, tokens, coordinates, predict_proba=False, predict_class=False):

        batch_size = len(tokens)
        embeded_feature = self.language_emb(tokens)

        top_left_x_feat = self.top_left_x(coordinates[:, :, 0])
        top_left_y_feat = self.top_left_y(coordinates[:, :, 1])
        bottom_right_x_feat = self.bottom_right_x(coordinates[:, :, 2])
        bottom_right_y_feat = self.bottom_right_y(coordinates[:, :, 3])
        width_feat = self.width_emb(coordinates[:, :, 4])
        height_feat = self.height_emb(coordinates[:, :, 5])

        total_feat = embeded_feature + top_left_x_feat + top_left_y_feat + \
            bottom_right_x_feat + bottom_right_y_feat + width_feat + height_feat

        # Extracting the feature

        for layer in self.list_encoder:
            total_feat = layer(total_feat)[0]
        total_feat = self.residue_encoder(total_feat)

        for layer in self.list_decoder:
            total_feat = layer(total_feat)[0]
        total_feat = self.residue_decoder(total_feat)

        if self.classify:
            total_feat = self.classification_layer(total_feat)

        if predict_proba:
            return total_feat.softmax(axis=-1)

        if predict_class:
            return total_feat.argmax(axis=-1)

        return total_feat


class LaTr_for_finetuning(nn.Module):
    def __init__(self, config, address_to_pre_trained_weights=None):
        super(LaTr_for_finetuning, self).__init__()

        self.config = config
        self.vocab_size = config['vocab_size']

        self.pre_training_model = LaTr_for_pretraining(config)
        if address_to_pre_trained_weights is not None:
            self.pre_training_model.load_state_dict(
                torch.load(address_to_pre_trained_weights))
        self.vit = ViTModel.from_pretrained(
            "google/vit-base-patch16-224-in21k")

        # In the fine-tuning stage of ViT, all layers except the last one were frozen

        self.classification_head = nn.Linear(
            config['hidden_state'], config['classes'])

    def forward(self, lang_vect, spatial_vect, quest_vect, img_vect):

        # The block below calculates the language and spatial features
        embeded_feature = self.pre_training_model.language_emb(lang_vect)
        top_left_x_feat = self.pre_training_model.top_left_x(
            spatial_vect[:, :, 0])
        top_left_y_feat = self.pre_training_model.top_left_y(
            spatial_vect[:, :, 1])
        bottom_right_x_feat = self.pre_training_model.bottom_right_x(
            spatial_vect[:, :, 2])
        bottom_right_y_feat = self.pre_training_model.bottom_right_y(
            spatial_vect[:, :, 3])
        width_feat = self.pre_training_model.width_emb(spatial_vect[:, :, 4])
        height_feat = self.pre_training_model.height_emb(spatial_vect[:, :, 5])

        spatial_lang_feat = embeded_feature + top_left_x_feat + top_left_y_feat + \
            bottom_right_x_feat + bottom_right_y_feat + width_feat + height_feat

        # Extracting the image feature, using the Vision Transformer
        img_feat = self.vit(img_vect).last_hidden_state

        # Extracting the question vector
        quest_feat = self.pre_training_model.language_emb(quest_vect)

        # Concatenating the three features before passing them through the T5 Transformer
        final_feat = torch.cat(
            [img_feat, spatial_lang_feat, quest_feat], axis=-2)

        # Passing through the T5 Transformer
        for layer in self.pre_training_model.list_encoder:
            final_feat = layer(final_feat)[0]

        final_feat = self.pre_training_model.residue_encoder(final_feat)

        for layer in self.pre_training_model.list_decoder:
            final_feat = layer(final_feat)[0]
        final_feat = self.pre_training_model.residue_decoder(final_feat)

        answer_vector = self.classification_head(
            final_feat)[:, :self.config['seq_len'], :]

        return answer_vector


def polynomial(base_lr, iter, max_iter=1e5, power=1):
    return base_lr * ((1 - float(iter) / max_iter) ** power)


class LaTrForVQA(pl.LightningModule):
    def __init__(self, config, learning_rate=1e-4, max_steps=100000//2):
        super(LaTrForVQA, self).__init__()

        self.config = config
        self.save_hyperparameters()
        self.latr = LaTr_for_finetuning(config)
        self.training_losses = []
        self.validation_losses = []
        self.max_steps = max_steps

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])

    def forward(self, batch_dict):
        boxes = batch_dict['boxes']
        img = batch_dict['img']
        question = batch_dict['question']
        words = batch_dict['tokenized_words']
        answer_vector = self.latr(lang_vect=words,
                                  spatial_vect=boxes,
                                  img_vect=img,
                                  quest_vect=question
                                  )
        return answer_vector

    def calculate_metrics(self, prediction, labels):

        # Calculate the accuracy score between the predictions and ground-truth labels for a batch, taking the pad sequence into account
        batch_size = len(prediction)
        ac_score = 0

        for (pred, gt) in zip(prediction, labels):
            ac_score += calculate_acc_score(pred.detach().cpu(),
                                            gt.detach().cpu())
        ac_score = ac_score / batch_size
        return ac_score

    def training_step(self, batch, batch_idx):
        answer_vector = self.forward(batch)

        # https://discuss.huggingface.co/t/bertformaskedlm-s-loss-and-scores-how-the-loss-is-computed/607/2
        loss = nn.CrossEntropyLoss(ignore_index=0)(
            answer_vector.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(answer_vector, dim=-1)

        # Calculating the accuracy score
        train_acc = self.calculate_metrics(preds, batch['answer'])
        train_acc = torch.tensor(train_acc)

        # Logging
        self.log('train_ce_loss', loss, prog_bar=True)
        self.log('train_acc', train_acc, prog_bar=True)
        self.training_losses.append(loss.item())

        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = nn.CrossEntropyLoss(ignore_index=0)(
            logits.reshape(-1, self.config['classes']), batch['answer'].reshape(-1))
        _, preds = torch.max(logits, dim=-1)

        # Validation accuracy
        val_acc = self.calculate_metrics(preds.cpu(), batch['answer'].cpu())
        val_acc = torch.tensor(val_acc)

        # Logging
        self.log('val_ce_loss', loss, prog_bar=True)
        self.log('val_acc', val_acc, prog_bar=True)
        self.validation_losses.append(loss.item())
        return {'val_loss': loss, 'val_acc': val_acc}

    def optimizer_step(self, epoch_nb, batch_nb, optimizer, optimizer_i, opt_closure=None, on_tpu=False,
                       using_native_amp=False, using_lbfgs=False):

        # Warmup for 1000 steps
        if self.trainer.global_step < 1000:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 1000.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.hparams.learning_rate

        # Linear decay
        else:
            for pg in optimizer.param_groups:
                pg['lr'] = polynomial(
                    self.hparams.learning_rate, self.trainer.global_step, max_iter=self.max_steps)

        optimizer.step(opt_closure)
        optimizer.zero_grad()

    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()

        self.log('val_loss_epoch_end', val_loss, on_epoch=True, sync_dist=True)
        self.log('val_acc_epoch_end', val_acc, on_epoch=True, sync_dist=True)
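Note that calculate_acc_score, used in calculate_metrics, is neither defined nor imported in this file, so it must come from the original training code. For reference, a minimal sketch of how LaTrForVQA can be instantiated and run on dummy inputs; the shapes are inferred from app.py and the tensor values are random placeholders (the first run downloads the t5-base and ViT weights):

# Sketch: build LaTrForVQA with the config used in app.py and run a forward
# pass on random placeholder inputs.
import torch

config = {
    't5_model': 't5-base',
    'vocab_size': 32128,
    'hidden_state': 768,
    'max_2d_position_embeddings': 1001,
    'classes': 32128,
    'seq_len': 512,
}
model = LaTrForVQA(config)
model.eval()

batch = {
    'img': torch.rand(1, 3, 224, 224),                     # ViT-sized image
    'boxes': torch.randint(0, 1000, (1, 512, 6)),          # x1, y1, x2, y2, w, h
    'tokenized_words': torch.randint(0, 32128, (1, 512)),  # OCR token ids
    'question': torch.randint(0, 32128, (1, 512)),         # question token ids
}
with torch.no_grad():
    logits = model(batch)
print(logits.shape)  # expected: torch.Size([1, 512, 32128])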
packages.txt
ADDED
@@ -0,0 +1 @@
tesseract-ocr
requirements.txt
ADDED
@@ -0,0 +1,16 @@
transformers
sentencepiece==0.1.91
pytesseract
pillow
Pillow==7.1.2
pytorch-lightning
gradio
torchvision
scikit-learn
pandas
matplotlib
seaborn
numpy
torch
einops
tqdm
utils.py
ADDED
@@ -0,0 +1,116 @@
# import random
import torch
import math
from torch.nn.utils.rnn import pad_sequence


def find_pad_idx(boxes):
    for i, j in enumerate(boxes):
        if int(boxes[i].sum().item()) == 0:
            return i
    return i


# def apply_mask_on_token_bbox(boxes, tokenized_words, only_actual_words = False, span = 4, proportion_to_mask = 0.15, special_token = 103):
#
#     '''
#     code taken from here: https://www.geeksforgeeks.org/python-non-overlapping-random-ranges/
#
#     Note: A more robust solution is to be coded
#     '''
#     length_to_be_masked = int(proportion_to_mask*len(boxes))
#
#     if only_actual_words:
#         tot = find_pad_idx(tokenized_words)
#     else:
#         tot = len(boxes)
#
#     res = set()
#     for _ in range(length_to_be_masked):
#         temp = random.randint(0, tot - span)
#         while any(((temp >= idx) and (temp <= idx + span)) for idx in res):
#             temp = random.randint(0, tot - span)
#         res.add(temp)
#
#         ## Applying the mask on the token
#         tokenized_words[temp] = special_token
#
#         ## Applying the masking on the box
#         boxes[temp, 0] = torch.min(boxes[temp: temp+span, 0])
#         boxes[temp, 1] = torch.min(boxes[temp: temp+span, 1])
#         boxes[temp, 2] = torch.max(boxes[temp: temp+span, 2])
#         boxes[temp, 3] = torch.max(boxes[temp: temp+span, 3])
#         boxes[temp, 4] = boxes[temp, 2] - boxes[temp, 0]
#         boxes[temp, 5] = boxes[temp, 3] - boxes[temp, 1]
#
#     return res, boxes, tokenized_words


def convert_ans_to_token(answer, label2id, max_seq_length=512):

    ## Simple trick to pad a sequence to the desired length
    dummy_array = torch.zeros(max_seq_length)
    actual_ans_array = []

    answer = answer.split(" ")
    for token in answer:
        actual_ans_array.append(label2id[token]['id'])

    actual_ans_array = torch.tensor(actual_ans_array, dtype=torch.int32)
    actual_ans_array = pad_sequence([actual_ans_array, dummy_array], batch_first=True)[0]

    return actual_ans_array


def convert_ques_to_token(question, tokenizer, pad_token_id=0, max_seq_len=512):

    question_array = []
    question = question.split(" ")

    for token in question:
        question_array.extend(tokenizer(token, add_special_tokens=False).input_ids)

    if len(question_array) < max_seq_len:
        question_array.extend([pad_token_id] * (max_seq_len - len(question_array)))

    question_array = torch.tensor(question_array, dtype=torch.int32)
    return question_array[:max_seq_len]


## To be taken from here:
## https://logicatcore.github.io/scratchpad/lidar/sensor-fusion/jupyter/2021/04/20/3D-Oriented-Bounding-Box.html

def rotate(origin, point, angle):
    """
    Rotate a point counterclockwise by a given angle around a given origin.
    The angle should be given in radians.

    Modified from the answer here: https://stackoverflow.com/questions/34372480/rotate-point-about-another-point-in-degrees-python
    """
    # angle = np.deg2rad(angle)
    ox, oy = origin
    px, py = point

    qx = ox + math.cos(angle) * (px - ox) - math.sin(angle) * (py - oy)
    qy = oy + math.sin(angle) * (px - ox) + math.cos(angle) * (py - oy)
    return int(qx), int(qy)


def convert_token_to_ques(ques, tokenizer):
    decoded_ques = tokenizer.decode(ques, skip_special_tokens=True)
    return decoded_ques


def convert_token_to_answer(ans, id2label):
    non_zero_argument = torch.nonzero(ans, as_tuple=False).view(-1)

    actual_answer = ans[non_zero_argument].cpu().numpy()
    decoded_answer = []

    for token in actual_answer:
        decoded_answer.append(id2label[token])

    decoded_answer = " ".join(decoded_answer)
    return decoded_answer
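A small round-trip sketch of the question helpers, using the same t5-base tokenizer as app.py:

# Round-trip sketch: encode a question to a fixed-length id tensor, then decode it back.
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-base")
ids = convert_ques_to_token("what number is the button near the top left?", tokenizer)
print(ids.shape)                              # torch.Size([512]), zero-padded
print(convert_token_to_ques(ids, tokenizer))  # roughly the original question, modulo tokenization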