Spaces:

sanjaykamath
/

Safeworld_Captioning_Spaces

Runtime error

File size: 17,875 Bytes

7fc7f3d

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "15468c81",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "--2022-02-15 18:26:17--  https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg/1920px-Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg\n",
      "Resolving upload.wikimedia.org (upload.wikimedia.org)... 91.198.174.208\n",
      "Connecting to upload.wikimedia.org (upload.wikimedia.org)|91.198.174.208|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 1388211 (1.3M) [image/jpeg]\n",
      "Saving to: ‘starry.jpg’\n",
      "\n",
      "     0K .......... .......... .......... .......... ..........  3%  776K 2s\n",
      "    50K .......... .......... .......... .......... ..........  7%  877K 2s\n",
      "   100K .......... .......... .......... .......... .......... 11% 2.93M 1s\n",
      "   150K .......... .......... .......... .......... .......... 14% 2.28M 1s\n",
      "   200K .......... .......... .......... .......... .......... 18% 4.04M 1s\n",
      "   250K .......... .......... .......... .......... .......... 22% 5.46M 1s\n",
      "   300K .......... .......... .......... .......... .......... 25% 6.40M 1s\n",
      "   350K .......... .......... .......... .......... .......... 29% 2.41M 0s\n",
      "   400K .......... .......... .......... .......... .......... 33% 3.18M 0s\n",
      "   450K .......... .......... .......... .......... .......... 36% 3.03M 0s\n",
      "   500K .......... .......... .......... .......... .......... 40% 8.30M 0s\n",
      "   550K .......... .......... .......... .......... .......... 44% 3.31M 0s\n",
      "   600K .......... .......... .......... .......... .......... 47% 3.10M 0s\n",
      "   650K .......... .......... .......... .......... .......... 51% 12.3M 0s\n",
      "   700K .......... .......... .......... .......... .......... 55% 4.20M 0s\n",
      "   750K .......... .......... .......... .......... .......... 59% 1.93M 0s\n",
      "   800K .......... .......... .......... .......... .......... 62% 6.28M 0s\n",
      "   850K .......... .......... .......... .......... .......... 66% 3.09M 0s\n",
      "   900K .......... .......... .......... .......... .......... 70% 22.7M 0s\n",
      "   950K .......... .......... .......... .......... .......... 73% 4.43M 0s\n",
      "  1000K .......... .......... .......... .......... .......... 77% 4.16M 0s\n",
      "  1050K .......... .......... .......... .......... .......... 81% 2.29M 0s\n",
      "  1100K .......... .......... .......... .......... .......... 84% 1.81M 0s\n",
      "  1150K .......... .......... .......... .......... .......... 88% 6.20M 0s\n",
      "  1200K .......... .......... .......... .......... .......... 92% 2.03M 0s\n",
      "  1250K .......... .......... .......... .......... .......... 95% 23.5M 0s\n",
      "  1300K .......... .......... .......... .......... .......... 99% 5.04M 0s\n",
      "  1350K .....                                                 100% 9.95M=0.5s\n",
      "\n",
      "2022-02-15 18:26:17 (2.89 MB/s) - ‘starry.jpg’ saved [1388211/1388211]\n",
      "\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "02b7655f0b2b404b952b7c152a3a1661",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0.00/262k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using cache found in /Users/sanjaykamath/.cache/torch/hub/ashkamath_mdetr_main\n",
      "Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']\n",
      "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth\n",
      "load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth\n",
      "Running on local URL:  http://127.0.0.1:7862/\n",
      "Running on public URL: https://13389.gradio.app\n",
      "\n",
      "This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"900\"\n",
       "            height=\"500\"\n",
       "            src=\"https://13389.gradio.app\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "            \n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x7fce90855f40>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(<fastapi.applications.FastAPI at 0x7fcfa3376fd0>,\n",
       " 'http://127.0.0.1:7862/',\n",
       " 'https://13389.gradio.app')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-02-15 18:27:19.011924: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "# Download the example painting to starry.jpg with wget (the demo's example row points at this file).\n",
    "starry_night_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg/1920px-Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg'\n",
    "os.system('wget ' + starry_night_url + ' -O starry.jpg')\n",
    "\n",
    "from PIL import Image\n",
    "import requests\n",
    "import torch\n",
    "from torchvision import transforms\n",
    "from torchvision.transforms.functional import InterpolationMode\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "\n",
    "\n",
    "\n",
    "    \n",
    "#MDETR Code    \n",
    "import torchvision.transforms as T\n",
    "import matplotlib.pyplot as plt\n",
    "from collections import defaultdict\n",
    "import torch.nn.functional as F\n",
    "import numpy as np\n",
    "from skimage.measure import find_contours\n",
    "\n",
    "from matplotlib import patches,  lines\n",
    "from matplotlib.patches import Polygon\n",
    "import gradio as gr\n",
    "\n",
    "# Download a sample image and store it locally as elephant.jpg.\n",
    "torch.hub.download_url_to_file('https://cdn.pixabay.com/photo/2014/03/04/15/10/elephants-279505_1280.jpg', 'elephant.jpg')\n",
    "\n",
    "\n",
    "# Load the pretrained MDETR EfficientNet-B5 entry point from the ashkamath/mdetr hub repo,\n",
    "# together with its postprocessor, then keep the model on CPU in eval mode.\n",
    "model2, postprocessor = torch.hub.load('ashkamath/mdetr:main', 'mdetr_efficientnetB5', pretrained=True, return_postprocessor=True)\n",
    "model2 = model2.cpu()\n",
    "model2.eval()\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# Turns autograd off globally for everything that runs after this line.\n",
    "torch.set_grad_enabled(False);\n",
    "# standard PyTorch mean-std input image normalization\n",
    "# NOTE(review): the name `transform` is assigned again further down in this cell\n",
    "# (BLIP section), so code that looks it up at call time sees the later value.\n",
    "transform = T.Compose([\n",
    "    T.Resize(800),\n",
    "    T.ToTensor(),\n",
    "    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n",
    "])\n",
    "\n",
    "# for output bounding box post-processing\n",
    "def box_cxcywh_to_xyxy(x):\n",
    "    \"\"\"Convert boxes from (center_x, center_y, width, height) to (x_min, y_min, x_max, y_max).\n",
    "\n",
    "    `x` is unbound along dimension 1 into its four components, and the four\n",
    "    corner components are stacked back along dimension 1.\n",
    "    \"\"\"\n",
    "    center_x, center_y, width, height = x.unbind(1)\n",
    "    half_width = 0.5 * width\n",
    "    half_height = 0.5 * height\n",
    "    corners = (\n",
    "        center_x - half_width,\n",
    "        center_y - half_height,\n",
    "        center_x + half_width,\n",
    "        center_y + half_height,\n",
    "    )\n",
    "    return torch.stack(corners, dim=1)\n",
    "\n",
    "def rescale_bboxes(out_bbox, size):\n",
    "    \"\"\"Turn center-format boxes into corner format and scale them by the image size.\n",
    "\n",
    "    `size` is unpacked as (width, height); x coordinates are multiplied by the\n",
    "    width and y coordinates by the height.\n",
    "    \"\"\"\n",
    "    width, height = size\n",
    "    corner_boxes = box_cxcywh_to_xyxy(out_bbox)\n",
    "    scale = torch.tensor([width, height, width, height], dtype=torch.float32)\n",
    "    return corner_boxes * scale\n",
    "# colors for visualization\n",
    "# Six RGB triples with components between 0 and 1; apply_mask multiplies each\n",
    "# component by 255 when blending a mask into an image.\n",
    "COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],\n",
    "          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]\n",
    "\n",
    "def apply_mask(image, mask, color, alpha=0.5):\n",
    "    \"\"\"Blend `color` into `image` wherever `mask` equals 1 and return the image.\n",
    "\n",
    "    The first three channels of `image` are modified in place. Masked pixels\n",
    "    become `pixel * (1 - alpha) + alpha * color[channel] * 255`; all other\n",
    "    pixels are left as they were.\n",
    "    \"\"\"\n",
    "    selected = mask == 1\n",
    "    for channel in range(3):\n",
    "        plane = image[:, :, channel]\n",
    "        blended = plane * (1 - alpha) + alpha * color[channel] * 255\n",
    "        image[:, :, channel] = np.where(selected, blended, plane)\n",
    "    return image\n",
    "\n",
    "def plot_results(pil_img, scores, boxes, labels, masks=None):\n",
    "    \"\"\"Draw detections over an image and save the rendering to 'foo.png'.\n",
    "\n",
    "    Args:\n",
    "        pil_img: image to annotate; converted to an array with np.array().\n",
    "        scores: one confidence value per detection, shown with two decimals.\n",
    "        boxes: object whose .tolist() yields (xmin, ymin, xmax, ymax) rows.\n",
    "        labels: one caption string per detection.\n",
    "        masks: optional per-detection masks; when None only boxes are drawn.\n",
    "\n",
    "    Returns:\n",
    "        The path of the saved figure, always 'foo.png'.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: if scores, boxes, labels and masks differ in length.\n",
    "    \"\"\"\n",
    "    if masks is None:\n",
    "        masks = [None for _ in range(len(scores))]\n",
    "    assert len(scores) == len(boxes) == len(labels) == len(masks)\n",
    "\n",
    "    np_image = np.array(pil_img)\n",
    "    # Repeat the palette so zip() below does not run out of colors early.\n",
    "    colors = COLORS * 100\n",
    "\n",
    "    # pyplot keeps every figure alive until it is closed explicitly. This function\n",
    "    # runs once per request, so hold on to the figure and close it when done.\n",
    "    fig = plt.figure(figsize=(16, 10))\n",
    "    try:\n",
    "        ax = fig.gca()\n",
    "        for s, (xmin, ymin, xmax, ymax), l, mask, c in zip(scores, boxes.tolist(), labels, masks, colors):\n",
    "            ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,\n",
    "                                       fill=False, color=c, linewidth=3))\n",
    "            text = f'{l}: {s:0.2f}'\n",
    "            ax.text(xmin, ymin, text, fontsize=15, bbox=dict(facecolor='white', alpha=0.8))\n",
    "\n",
    "            if mask is None:\n",
    "                continue\n",
    "            np_image = apply_mask(np_image, mask, c)\n",
    "\n",
    "            # Pad by one pixel on every side so contours touching the border are closed.\n",
    "            padded_mask = np.zeros((mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)\n",
    "            padded_mask[1:-1, 1:-1] = mask\n",
    "            for verts in find_contours(padded_mask, 0.5):\n",
    "                # Subtract the padding and flip (y, x) to (x, y)\n",
    "                verts = np.fliplr(verts) - 1\n",
    "                ax.add_patch(Polygon(verts, facecolor='none', edgecolor=c))\n",
    "\n",
    "        ax.imshow(np_image)\n",
    "        ax.axis('off')\n",
    "        fig.savefig('foo.png', bbox_inches='tight')\n",
    "    finally:\n",
    "        plt.close(fig)\n",
    "    return 'foo.png'\n",
    "\n",
    "\n",
    "def add_res(results, ax, color='green'):\n",
    "    \"\"\"Draw the boxes of a results dict on `ax`, each captioned 'label: score'.\n",
    "\n",
    "    Args:\n",
    "        results: mapping with 'boxes', 'labels' and 'scores' entries that are\n",
    "            iterated together; each box is indexed as (xmin, ymin, xmax, ymax).\n",
    "        ax: axes object providing add_patch() and text().\n",
    "        color: unused; kept so existing callers keep working.\n",
    "\n",
    "    Each caption is also printed to stdout.\n",
    "    \"\"\"\n",
    "    bboxes = results['boxes']\n",
    "    labels = results['labels']\n",
    "    scores = results['scores']\n",
    "\n",
    "    colors = ['purple', 'yellow', 'red', 'green', 'orange', 'pink']\n",
    "    # CLASSES is not defined anywhere in the visible notebook. Look it up lazily so\n",
    "    # non-string labels fall back to str(label) instead of raising NameError.\n",
    "    class_names = globals().get('CLASSES')\n",
    "\n",
    "    for i, (b, ll, ss) in enumerate(zip(bboxes, labels, scores)):\n",
    "        # Cycle through the palette so more than six boxes do not raise IndexError.\n",
    "        box_color = colors[i % len(colors)]\n",
    "        ax.add_patch(plt.Rectangle((b[0], b[1]), b[2] - b[0], b[3] - b[1], fill=False, color=box_color, linewidth=3))\n",
    "        if isinstance(ll, str):\n",
    "            cls_name = ll\n",
    "        elif class_names is not None:\n",
    "            cls_name = class_names[ll]\n",
    "        else:\n",
    "            cls_name = str(ll)\n",
    "        text = f'{cls_name}: {ss:.2f}'\n",
    "        print(text)\n",
    "        ax.text(b[0], b[1], text, fontsize=15, bbox=dict(facecolor='white', alpha=0.8))\n",
    "\n",
    "\n",
    "# MDETR preprocessing under its own name. The module-level `transform` is rebound\n",
    "# by the BLIP section later in this cell (384x384 resize, different normalisation\n",
    "# constants), so looking up `transform` at call time would feed BLIP-style input\n",
    "# to the MDETR model.\n",
    "mdetr_transform = T.Compose([\n",
    "    T.Resize(800),\n",
    "    T.ToTensor(),\n",
    "    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n",
    "])\n",
    "\n",
    "\n",
    "def plot_inference(im, caption, approaches):\n",
    "    \"\"\"Ground `caption` in `im` with MDETR, plot the boxes, then run PPE detection.\n",
    "\n",
    "    Args:\n",
    "        im: PIL image; its .size is used to rescale the predicted boxes.\n",
    "        caption: text whose spans are matched against the predicted boxes.\n",
    "        approaches: one of 'Worker Helmet Separately', 'Worker Helmet Vest'\n",
    "            or 'Workers only'.\n",
    "\n",
    "    Returns:\n",
    "        Whatever sepia_call returns for the prefixed caption, the input image,\n",
    "        the path written by plot_results and the numeric approach id.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if `approaches` is not one of the names listed above.\n",
    "    \"\"\"\n",
    "    choices = {'Worker Helmet Separately': 1, 'Worker Helmet Vest': 2, 'Workers only': 3}\n",
    "    # Resolve the approach first so an unknown name fails before the model calls.\n",
    "    approach_id = choices[approaches]\n",
    "\n",
    "    # mean-std normalize the input image (batch-size: 1)\n",
    "    img = mdetr_transform(im).unsqueeze(0).cpu()\n",
    "\n",
    "    # propagate through the model\n",
    "    memory_cache = model2(img, [caption], encode_and_save=True)\n",
    "    outputs = model2(img, [caption], encode_and_save=False, memory_cache=memory_cache)\n",
    "\n",
    "    # keep only predictions with 0.7+ confidence\n",
    "    probas = 1 - outputs['pred_logits'].softmax(-1)[0, :, -1].cpu()\n",
    "    keep = (probas > 0.7).cpu()\n",
    "\n",
    "    # convert boxes from [0; 1] to image scales\n",
    "    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'].cpu()[0, keep], im.size)\n",
    "\n",
    "    # Extract the text spans predicted by each box\n",
    "    positive_tokens = (outputs['pred_logits'].cpu()[0, keep].softmax(-1) > 0.1).nonzero().tolist()\n",
    "    predicted_spans = defaultdict(str)\n",
    "    for item, pos in positive_tokens:\n",
    "        # NOTE(review): 255 presumably excludes the trailing slot that `probas`\n",
    "        # above reads with index -1 - confirm against the MDETR model config.\n",
    "        if pos < 255:\n",
    "            span = memory_cache['tokenized'].token_to_chars(0, pos)\n",
    "            predicted_spans[item] += ' ' + caption[span.start:span.end]\n",
    "\n",
    "    # One label per kept box, in box order. Boxes without any positive token get\n",
    "    # an empty label, so labels stay aligned with the boxes and plot_results'\n",
    "    # length assertion holds.\n",
    "    labels = [predicted_spans[k] for k in range(len(bboxes_scaled))]\n",
    "    caption = 'Caption: ' + caption\n",
    "    mdetr_plot = plot_results(im, probas[keep], bboxes_scaled, labels)\n",
    "    return sepia_call(caption, im, mdetr_plot, approach_id)\n",
    "  \n",
    "\n",
    "\n",
    "    \n",
    "#BLIP Code\n",
    "\n",
    "\n",
    "from modelsn.blip import blip_decoder\n",
    "\n",
    "# Preprocessing for the BLIP captioning model: bicubic resize to\n",
    "# image_size x image_size, tensor conversion, per-channel normalisation.\n",
    "# NOTE(review): this assignment rebinds `transform`, replacing the T.Compose\n",
    "# pipeline assigned to the same name earlier in this cell.\n",
    "image_size = 384\n",
    "transform = transforms.Compose([\n",
    "    transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),\n",
    "    transforms.ToTensor(),\n",
    "    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))\n",
    "    ]) \n",
    "\n",
    "model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth'\n",
    "    \n",
    "# Build the captioning model from the checkpoint URL, switch it to eval mode and\n",
    "# move it to `device`.\n",
    "# NOTE(review): the literal 384 repeats image_size above; keep the two in sync.\n",
    "model = blip_decoder(pretrained=model_url, image_size=384, vit='base')\n",
    "model.eval()\n",
    "model = model.to(device)\n",
    "\n",
    "\n",
    "from modelsn.blip_vqa import blip_vqa\n",
    "\n",
    "# Preprocessing for the BLIP VQA model: bicubic resize to\n",
    "# image_size_vq x image_size_vq, tensor conversion, per-channel normalisation.\n",
    "# NOTE(review): neither transform_vq nor model_vq is referenced by the functions\n",
    "# visible in this cell.\n",
    "image_size_vq = 480\n",
    "transform_vq = transforms.Compose([\n",
    "    transforms.Resize((image_size_vq,image_size_vq),interpolation=InterpolationMode.BICUBIC),\n",
    "    transforms.ToTensor(),\n",
    "    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))\n",
    "    ]) \n",
    "\n",
    "model_url_vq = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth'\n",
    "    \n",
    "# Build the VQA model from the checkpoint URL, switch it to eval mode and move it\n",
    "# to `device`.\n",
    "# NOTE(review): the literal 480 repeats image_size_vq above; keep the two in sync.\n",
    "model_vq = blip_vqa(pretrained=model_url_vq, image_size=480, vit='base')\n",
    "model_vq.eval()\n",
    "model_vq = model_vq.to(device)\n",
    "\n",
    "\n",
    "\n",
    "def inference(raw_image, approaches, question):\n",
    "    \"\"\"Caption `raw_image` with the BLIP model and hand the caption to plot_inference.\n",
    "\n",
    "    `question` is accepted to match the third UI input but is not used here.\n",
    "    Returns whatever plot_inference returns for the first generated caption.\n",
    "    \"\"\"\n",
    "    batch = transform(raw_image).unsqueeze(0)\n",
    "    batch = batch.to(device)\n",
    "    with torch.no_grad():\n",
    "        captions = model.generate(batch, sample=False, num_beams=3, max_length=20, min_length=5)\n",
    "    first_caption = captions[0]\n",
    "    return plot_inference(raw_image, first_caption, approaches)\n",
    "\n",
    "   \n",
    "\n",
    "    \n",
    "#PPE Detection code\n",
    "import numpy as np\n",
    "import run_code\n",
    "import gradio as gr\n",
    "  \n",
    "\n",
    "def sepia_call(caption, Input_Image, MDETR_im, Approach):\n",
    "    \"\"\"Run the PPE detector on the input image and bundle all demo outputs.\n",
    "\n",
    "    The image is converted with np.asarray() and passed to run_code.run()\n",
    "    together with `Approach`. The result's 'img' and 'text' entries are returned\n",
    "    after the caption and the MDETR image, as a 4-tuple.\n",
    "    \"\"\"\n",
    "    frame = np.asarray(Input_Image)\n",
    "    detection = run_code.run(frame, Approach)\n",
    "    return (caption, MDETR_im, detection['img'], detection['text'])\n",
    "\n",
    "\n",
    "# UI wiring: an image, an approach selector and a free-text box are passed to\n",
    "# inference(raw_image, approaches, question); its four return values fill `outputs`.\n",
    "approach_names = ['Worker Helmet Separately', 'Worker Helmet Vest', 'Workers only']\n",
    "inputs = [gr.inputs.Image(type='pil'), gr.inputs.Radio(choices=approach_names, type='value', default='Worker Helmet Vest', label='Model'), 'textbox']\n",
    "outputs = [gr.outputs.Textbox(label='Output'), 'image', 'image', gr.outputs.Textbox(label='Output')]\n",
    "\n",
    "\n",
    "title = \"BLIP + MDETR + PPE Detection\"\n",
    "\n",
    "description = \"Gradio demo for BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation by Salesforce Research. To use it, simply upload your image, or click one of the examples to load them. Read more at the links below.\"\n",
    "\n",
    "article = \"<p style='text-align: center'><a href='https://arxiv.org/abs/2201.12086' target='_blank'>BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation</a> | <a href='https://github.com/salesforce/BLIP' target='_blank'>Github Repo</a></p>\"\n",
    "\n",
    "# The second value of an example row is handed to plot_inference, which looks it\n",
    "# up among the approach names; a value outside approach_names (such as\n",
    "# 'Image Captioning') raises KeyError when the example is run.\n",
    "examples = [['starry.jpg', 'Worker Helmet Vest', 'None']]\n",
    "\n",
    "gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch(share=True,enable_queue=True,cache_examples=False)"
   ]
  },
  {
   "cell_type": "raw",
   "id": "b2729aa9",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}