iakarshu committed on
Commit 785c228
1 Parent(s): 0d9698c

Update app.py

Browse files
Files changed (1)
  1. app.py +13 -13
app.py CHANGED
@@ -8,7 +8,6 @@ from dataset import load_json_file, get_specific_file, resize_align_bbox, get_to
 import torch.nn as nn
 from PIL import Image, ImageDraw
 import pytesseract
-import pandas as pd
 from tqdm.auto import tqdm
 import numpy as np
 import json
@@ -23,11 +22,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 # Default Library import
-
-
-# For the purpose of displaying the progress of map function
-tqdm.pandas()
-
 # Visualization libraries
 
 # Specific libraries of LaTr
@@ -76,11 +70,14 @@ examples = [["remote.jpg", "what number is the button near the top left?"]]
 from transformers import ViTFeatureExtractor, ViTModel
 vit_feat_extract = ViTFeatureExtractor("google/vit-base-patch16-224-in21k")
 
+import torchvision
+import numpy as np
+
 def answer_question(image, question):
-    image.save('sample_img.png')
 
     # Extracting features from the image
-    dummy_img, boxes, tokenized_words = create_features(image_path='sample_img.png',
+    image.save("sample.png")
+    img, boxes, tokenized_words = create_features("sample.png",
                                                         tokenizer=tokenizer,
                                                         target_size=target_size,
                                                         max_seq_length=max_seq_len,
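Note: create_features is defined in the repo's dataset module; given the pytesseract import above, it presumably runs OCR on the saved image and returns the resized image, word bounding boxes, and tokenized words. A rough, hypothetical sketch of just that OCR step (not the repo's actual implementation; it assumes sample.png exists and the Tesseract binary is installed):

import pytesseract
from PIL import Image

image = Image.open("sample.png")
# image_to_data returns recognized words plus their pixel coordinates and sizes
ocr = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

words, boxes = [], []
for text, left, top, width, height in zip(
        ocr["text"], ocr["left"], ocr["top"], ocr["width"], ocr["height"]):
    if text.strip():                                            # drop empty OCR entries
        words.append(text)
        boxes.append([left, top, left + width, top + height])   # x0, y0, x1, y1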
@@ -103,18 +100,20 @@ def answer_question(image, question):
     boxes[:, 5] = torch.clamp(boxes[:, 5], min = 1000, max = 1000)
 
     ## Tensor tokenized words
-    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
-
-    img = transforms.ToTensor()(image)
+    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
+    img = np.array(img)
+    img = torchvision.transforms.ToTensor()(img)
     question = convert_ques_to_token(question = question, tokenizer = tokenizer)
 
     ## Expanding the dimension for inference
-    img = img.unsqueeze(0)
     boxes = boxes.unsqueeze(0)
     tokenized_words = tokenized_words.unsqueeze(0)
     question = question.unsqueeze(0)
 
     img = vit_feat_extract(img, return_tensors = 'pt')['pixel_values']
+    if int(len(img.shape)) == 3:
+        img = img.unsqueeze(0)
+
     encoding = {'img': img, 'boxes': boxes, 'tokenized_words': tokenized_words, 'question': question}
 
     with torch.no_grad():
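Note: the image path above converts the PIL upload to a tensor, hands it to the ViT feature extractor, and keeps a guard in case the returned pixel_values come back without a batch dimension. A minimal, self-contained sketch of that path (ViTFeatureExtractor.from_pretrained is used here as the standard constructor; the blank test image is a stand-in for the upload):

import numpy as np
import torchvision
from PIL import Image
from transformers import ViTFeatureExtractor

vit_feat_extract = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

image = Image.new("RGB", (640, 480), "white")               # stand-in for the uploaded image
img = torchvision.transforms.ToTensor()(np.array(image))    # (3, H, W) float tensor in [0, 1]

# The extractor resizes and normalizes, returning a batched tensor.
pixel_values = vit_feat_extract(img, return_tensors="pt")["pixel_values"]
if pixel_values.dim() == 3:                                  # same guard as in app.py
    pixel_values = pixel_values.unsqueeze(0)
print(pixel_values.shape)                                    # torch.Size([1, 3, 224, 224])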
@@ -126,7 +125,7 @@ def answer_question(image, question):
     mask = torch.clamp(preds, min = 0, max = 1)
     last_non_zero_argument = (mask != 0).nonzero()[1][-1]
 
-    predicted_ans = convert_token_to_ques(individual_ans_pred[:last_non_zero_argument], tokenizer)
+    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)
     return predicted_ans
 
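Note: the decoding step trims trailing padding from the predicted ids before convert_token_to_ques turns them back into text. A small, self-contained illustration of that trimming idea with made-up ids (0 is assumed to be the pad id; this mirrors the intent of the mask/nonzero lines above, not the repo's helper):

import torch

preds = torch.tensor([[101, 7, 42, 13, 0, 0, 0]])       # (batch=1, seq_len) predicted token ids
mask = torch.clamp(preds, min=0, max=1)                  # 1 at real ids, 0 at padding
last_non_zero = (mask != 0).nonzero()[:, 1][-1]          # column index of the last non-pad token
trimmed = preds[0, : last_non_zero + 1]
print(trimmed.tolist())                                  # [101, 7, 42, 13]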
 
@@ -134,6 +133,7 @@ def answer_question(image, question):
 title = "Interactive demo: laTr (Layout Aware Transformer) for VQA"
 description = "Gradio Demo for LaTr (Layout Aware Transformer),trained on TextVQA Dataset. To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.12494' target='_blank'>LaTr: Layout-aware transformer for scene-text VQA,a novel multimodal architecture for Scene Text Visual Question Answering (STVQA)</a> | <a href='https://github.com/uakarsh/latr' target='_blank'>Github Repo</a></p>"
+examples = [['remote.png', "what number is the button near the top left?"]]
 
 interface = gr.Interface(fn=answer_question,
                          inputs=[image, question],
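For reference, a minimal, self-contained sketch of the Gradio wiring the demo uses, with a stub standing in for answer_question (the gr.Image / gr.Textbox component names follow the current Gradio API and are assumptions; the examples entry assumes remote.png exists next to the script):

import gradio as gr

def answer_question_stub(image, question):
    # Placeholder: the real app runs LaTr inference here.
    return "placeholder answer"

demo = gr.Interface(
    fn=answer_question_stub,
    inputs=[gr.Image(type="pil", label="Image"), gr.Textbox(label="Question")],
    outputs="text",
    examples=[["remote.png", "what number is the button near the top left?"]],
    title="Interactive demo: LaTr (Layout Aware Transformer) for VQA",
)

if __name__ == "__main__":
    demo.launch()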