Update app.py
app.py
CHANGED
@@ -8,7 +8,6 @@ from dataset import load_json_file, get_specific_file, resize_align_bbox, get_to
 import torch.nn as nn
 from PIL import Image, ImageDraw
 import pytesseract
-import pandas as pd
 from tqdm.auto import tqdm
 import numpy as np
 import json
@@ -23,11 +22,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 # Default Library import
-
-
-# For the purpose of displaying the progress of map function
-tqdm.pandas()
-
 # Visualization libraries
 
 # Specific libraries of LaTr
@@ -76,11 +70,14 @@ examples = [["remote.jpg", "what number is the button near the top left?"]]
 from transformers import ViTFeatureExtractor, ViTModel
 vit_feat_extract = ViTFeatureExtractor("google/vit-base-patch16-224-in21k")
 
+import torchvision
+import numpy as np
+
 def answer_question(image, question):
-    image.save('sample_img.png')
 
     # Extracting features from the image
-    img, boxes, tokenized_words = create_features('sample_img.png',
+    image.save("sample.png")
+    img, boxes, tokenized_words = create_features("sample.png",
                                                   tokenizer=tokenizer,
                                                   target_size=target_size,
                                                   max_seq_length=max_seq_len,
@@ -103,18 +100,20 @@ def answer_question(image, question):
     boxes[:, 5] = torch.clamp(boxes[:, 5], min = 1000, max = 1000)
 
     ## Tensor tokenized words
-    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
-
-    img = transforms.ToTensor()(
+    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
+    img = np.array(img)
+    img = torchvision.transforms.ToTensor()(img)
     question = convert_ques_to_token(question = question, tokenizer = tokenizer)
 
     ## Expanding the dimension for inference
-    img = img.unsqueeze(0)
     boxes = boxes.unsqueeze(0)
     tokenized_words = tokenized_words.unsqueeze(0)
     question = question.unsqueeze(0)
 
     img = vit_feat_extract(img, return_tensors = 'pt')['pixel_values']
+    if int(len(img.shape)) == 3:
+        img = img.unsqueeze(0)
+
     encoding = {'img': img, 'boxes': boxes, 'tokenized_words': tokenized_words, 'question': question}
 
     with torch.no_grad():
@@ -126,7 +125,7 @@ def answer_question(image, question):
     mask = torch.clamp(preds, min = 0, max = 1)
     last_non_zero_argument = (mask != 0).nonzero()[1][-1]
 
-    predicted_ans = convert_token_to_ques(
+    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)
     return predicted_ans
 
 
@@ -134,6 +133,7 @@ def answer_question(image, question):
 title = "Interactive demo: laTr (Layout Aware Transformer) for VQA"
 description = "Gradio Demo for LaTr (Layout Aware Transformer),trained on TextVQA Dataset. To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.12494' target='_blank'>LaTr: Layout-aware transformer for scene-text VQA,a novel multimodal architecture for Scene Text Visual Question Answering (STVQA)</a> | <a href='https://github.com/uakarsh/latr' target='_blank'>Github Repo</a></p>"
+examples = [['remote.png', "what number is the button near the top left?"]]
 
 interface = gr.Interface(fn=answer_question,
                          inputs=[image, question],
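A note on the main functional change: the patched `answer_question` no longer calls `transforms.ToTensor()` on the image directly. It converts to a NumPy array first, then to a tensor, runs the `ViTFeatureExtractor`, and adds a batch dimension only if the returned `pixel_values` come back unbatched. A minimal sketch of that preprocessing path, assuming a blank placeholder image and loading the extractor with `from_pretrained` (the diff constructs it positionally):

```python
import numpy as np
import torchvision
from PIL import Image
from transformers import ViTFeatureExtractor

vit_feat_extract = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

# Placeholder input; in the app this comes from the Gradio image widget.
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

# PIL -> ndarray -> float tensor in [0, 1], as in the patched answer_question.
img = np.array(image)
img = torchvision.transforms.ToTensor()(img)

# The feature extractor resizes/normalizes and returns pixel values.
img = vit_feat_extract(img, return_tensors="pt")["pixel_values"]

# Defensive batching, mirroring the diff: only unsqueeze if (C, H, W).
if len(img.shape) == 3:
    img = img.unsqueeze(0)

print(img.shape)  # torch.Size([1, 3, 224, 224])
```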
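The decoding change trims the prediction at the last non-zero position before detokenizing, instead of passing the raw padded sequence. `convert_token_to_ques` is a repo helper not shown in this diff; the sketch below illustrates the same trimming idea with made-up token ids and `tokenizer.decode` standing in for it, assuming the padding id is 0 (as with T5, which LaTr builds on):

```python
import torch
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Dummy prediction: a few token ids followed by zero padding.
preds = torch.tensor([2163, 19, 80, 1, 0, 0, 0])

# Index of the last non-zero (non-padding) token.
last_non_zero = (preds != 0).nonzero()[-1].item()

# Keep ids up to and including it, then detokenize.
answer_ids = preds[: last_non_zero + 1]
print(tokenizer.decode(answer_ids, skip_special_tokens=True))
```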
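Finally, the added `examples` entry points at `remote.png` (the hunk header context shows the old list referenced `remote.jpg`), so the example file shipped with the Space must match. The `image` and `question` widgets are constructed outside the changed lines; a self-contained sketch of the Gradio wiring under that assumption, with a stub in place of the real model call:

```python
import gradio as gr

# Stub standing in for the app's answer_question, which runs LaTr.
def answer_question(image, question):
    return "a placeholder answer"

image = gr.Image(type="pil")             # assumed widget construction
question = gr.Textbox(label="Question")  # assumed widget construction

interface = gr.Interface(fn=answer_question,
                         inputs=[image, question],
                         outputs="text",
                         examples=[["remote.png", "what number is the button near the top left?"]])
interface.launch()
```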