iakarshu committed on
Commit 785c228
1 Parent(s): 0d9698c

Update app.py

Browse files
Files changed (1)
  1. app.py +13 -13
app.py CHANGED
@@ -8,7 +8,6 @@ from dataset import load_json_file, get_specific_file, resize_align_bbox, get_to
 import torch.nn as nn
 from PIL import Image, ImageDraw
 import pytesseract
-import pandas as pd
 from tqdm.auto import tqdm
 import numpy as np
 import json
@@ -23,11 +22,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 # Default Library import
-
-
-# For the purpose of displaying the progress of map function
-tqdm.pandas()
-
 # Visualization libraries
 
 # Specific libraries of LaTr
@@ -76,11 +70,14 @@ examples = [["remote.jpg", "what number is the button near the top left?"]]
 from transformers import ViTFeatureExtractor, ViTModel
 vit_feat_extract = ViTFeatureExtractor("google/vit-base-patch16-224-in21k")
 
+import torchvision
+import numpy as np
+
 def answer_question(image, question):
-    image.save('sample_img.png')
 
     # Extracting features from the image
-    dummy_img, boxes, tokenized_words = create_features(image_path='sample_img.png',
+    image.save("sample.png")
+    img, boxes, tokenized_words = create_features("sample.png",
                                                         tokenizer=tokenizer,
                                                         target_size=target_size,
                                                         max_seq_length=max_seq_len,
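Note: create_features is defined in the repo's dataset module; given the pytesseract import above, it presumably runs OCR on the saved image and returns the resized image, word bounding boxes, and tokenized words. A rough, hypothetical sketch of just that OCR step (not the repo's actual implementation; it assumes sample.png exists and the Tesseract binary is installed):

import pytesseract
from PIL import Image

image = Image.open("sample.png")
# image_to_data returns recognized words plus their pixel coordinates and sizes
ocr = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

words, boxes = [], []
for text, left, top, width, height in zip(
        ocr["text"], ocr["left"], ocr["top"], ocr["width"], ocr["height"]):
    if text.strip():                                            # drop empty OCR entries
        words.append(text)
        boxes.append([left, top, left + width, top + height])   # x0, y0, x1, y1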
@@ -103,18 +100,20 @@ def answer_question(image, question):
     boxes[:, 5] = torch.clamp(boxes[:, 5], min = 1000, max = 1000)
 
     ## Tensor tokenized words
-    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
-
-    img = transforms.ToTensor()(image)
+    tokenized_words = torch.as_tensor(tokenized_words, dtype=torch.int32)
+    img = np.array(img)
+    img = torchvision.transforms.ToTensor()(img)
     question = convert_ques_to_token(question = question, tokenizer = tokenizer)
 
     ## Expanding the dimension for inference
-    img = img.unsqueeze(0)
     boxes = boxes.unsqueeze(0)
     tokenized_words = tokenized_words.unsqueeze(0)
     question = question.unsqueeze(0)
 
     img = vit_feat_extract(img, return_tensors = 'pt')['pixel_values']
+    if int(len(img.shape)) == 3:
+        img = img.unsqueeze(0)
+
     encoding = {'img': img, 'boxes': boxes, 'tokenized_words': tokenized_words, 'question': question}
 
     with torch.no_grad():
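Note: the image path above converts the PIL upload to a tensor, hands it to the ViT feature extractor, and keeps a guard in case the returned pixel_values come back without a batch dimension. A minimal, self-contained sketch of that path (ViTFeatureExtractor.from_pretrained is used here as the standard constructor; the blank test image is a stand-in for the upload):

import numpy as np
import torchvision
from PIL import Image
from transformers import ViTFeatureExtractor

vit_feat_extract = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

image = Image.new("RGB", (640, 480), "white")               # stand-in for the uploaded image
img = torchvision.transforms.ToTensor()(np.array(image))    # (3, H, W) float tensor in [0, 1]

# The extractor resizes and normalizes, returning a batched tensor.
pixel_values = vit_feat_extract(img, return_tensors="pt")["pixel_values"]
if pixel_values.dim() == 3:                                  # same guard as in app.py
    pixel_values = pixel_values.unsqueeze(0)
print(pixel_values.shape)                                    # torch.Size([1, 3, 224, 224])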
@@ -126,7 +125,7 @@ def answer_question(image, question):
     mask = torch.clamp(preds, min = 0, max = 1)
     last_non_zero_argument = (mask != 0).nonzero()[1][-1]
 
-    predicted_ans = convert_token_to_ques(individual_ans_pred[:last_non_zero_argument], tokenizer)
+    predicted_ans = convert_token_to_ques(preds[:last_non_zero_argument], tokenizer)
     return predicted_ans
 
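Note: the decoding step trims trailing padding from the predicted ids before convert_token_to_ques turns them back into text. A small, self-contained illustration of that trimming idea with made-up ids (0 is assumed to be the pad id; this mirrors the intent of the mask/nonzero lines above, not the repo's helper):

import torch

preds = torch.tensor([[101, 7, 42, 13, 0, 0, 0]])       # (batch=1, seq_len) predicted token ids
mask = torch.clamp(preds, min=0, max=1)                  # 1 at real ids, 0 at padding
last_non_zero = (mask != 0).nonzero()[:, 1][-1]          # column index of the last non-pad token
trimmed = preds[0, : last_non_zero + 1]
print(trimmed.tolist())                                  # [101, 7, 42, 13]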
 
@@ -134,6 +133,7 @@ def answer_question(image, question):
 title = "Interactive demo: laTr (Layout Aware Transformer) for VQA"
 description = "Gradio Demo for LaTr (Layout Aware Transformer),trained on TextVQA Dataset. To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.12494' target='_blank'>LaTr: Layout-aware transformer for scene-text VQA,a novel multimodal architecture for Scene Text Visual Question Answering (STVQA)</a> | <a href='https://github.com/uakarsh/latr' target='_blank'>Github Repo</a></p>"
+examples = [['remote.png', "what number is the button near the top left?"]]
 
 interface = gr.Interface(fn=answer_question,
                          inputs=[image, question],
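For reference, a minimal, self-contained sketch of the Gradio wiring the demo uses, with a stub standing in for answer_question (the gr.Image / gr.Textbox component names follow the current Gradio API and are assumptions; the examples entry assumes remote.png exists next to the script):

import gradio as gr

def answer_question_stub(image, question):
    # Placeholder: the real app runs LaTr inference here.
    return "placeholder answer"

demo = gr.Interface(
    fn=answer_question_stub,
    inputs=[gr.Image(type="pil", label="Image"), gr.Textbox(label="Question")],
    outputs="text",
    examples=[["remote.png", "what number is the button near the top left?"]],
    title="Interactive demo: LaTr (Layout Aware Transformer) for VQA",
)

if __name__ == "__main__":
    demo.launch()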