{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\student\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From C:\\Users\\student\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n", "\n" ] } ], "source": [ "from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer\n", "import torch\n", "from PIL import Image\n", " " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "config.json: 100%|██████████| 4.61k/4.61k [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Predicted caption: two girls playing in the water on a beach\n" ] } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "image_path = './3747543364_bf5b548527.jpg' \n", "sample_prediction = predict_step([image_path])\n", "\n", "image = Image.open(image_path)\n", "plt.imshow(image)\n", "plt.axis('off')\n", "plt.show()\n", "\n", "print(f\"Predicted caption: {sample_prediction[0]}\")\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['two girls playing in the water on a beach']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predict_step(['./3747543364_bf5b548527.jpg']) " ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Predictions with beam size 2: ['a man riding a horse on top of a beach']\n", "Predictions with beam size 4: ['a man riding a horse on top of a beach']\n", "Predictions with beam size 6: ['a man riding a horse on top of a beach']\n", "Predictions with beam size 8: ['a man riding a horse on top of a beach']\n" ] } ], "source": [ "# Experiment with different beam sizes\n", "beam_sizes = [2, 4, 6, 8]\n", "for beam_size in beam_sizes:\n", " gen_kwargs[\"num_beams\"] = beam_size\n", " \n", " predictions = predict_step(['./sample2.jpg']) # Replace with your image path\n", " print(f\"Predictions with beam size {beam_size}: {predictions}\")\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "VisionEncoderDecoderConfig {\n", " \"_name_or_path\": \"nlpconnect/vit-gpt2-image-captioning\",\n", " \"architectures\": [\n", " \"VisionEncoderDecoderModel\"\n", " ],\n", " \"bos_token_id\": 50256,\n", " \"decoder\": {\n", " \"_name_or_path\": \"\",\n", " \"activation_function\": \"gelu_new\",\n", " \"add_cross_attention\": true,\n", " \"architectures\": [\n", " \"GPT2LMHeadModel\"\n", " ],\n", " \"attn_pdrop\": 0.1,\n", " \"bad_words_ids\": null,\n", " \"begin_suppress_tokens\": null,\n", " \"bos_token_id\": 50256,\n", " \"chunk_size_feed_forward\": 0,\n", " 
\"cross_attention_hidden_size\": null,\n", " \"decoder_start_token_id\": 50256,\n", " \"diversity_penalty\": 0.0,\n", " \"do_sample\": false,\n", " \"early_stopping\": false,\n", " \"embd_pdrop\": 0.1,\n", " \"encoder_no_repeat_ngram_size\": 0,\n", " \"eos_token_id\": 50256,\n", " \"exponential_decay_length_penalty\": null,\n", " \"finetuning_task\": null,\n", " \"forced_bos_token_id\": null,\n", " \"forced_eos_token_id\": null,\n", " \"id2label\": {\n", " \"0\": \"LABEL_0\",\n", " \"1\": \"LABEL_1\"\n", " },\n", " \"initializer_range\": 0.02,\n", " \"is_decoder\": true,\n", " \"is_encoder_decoder\": false,\n", " \"label2id\": {\n", " \"LABEL_0\": 0,\n", " \"LABEL_1\": 1\n", " },\n", " \"layer_norm_epsilon\": 1e-05,\n", " \"length_penalty\": 1.0,\n", " \"max_length\": 20,\n", " \"min_length\": 0,\n", " \"model_type\": \"gpt2\",\n", " \"n_ctx\": 1024,\n", " \"n_embd\": 768,\n", " \"n_head\": 12,\n", " \"n_inner\": null,\n", " \"n_layer\": 12,\n", " \"n_positions\": 1024,\n", " \"no_repeat_ngram_size\": 0,\n", " \"num_beam_groups\": 1,\n", " \"num_beams\": 1,\n", " \"num_return_sequences\": 1,\n", " \"output_attentions\": false,\n", " \"output_hidden_states\": false,\n", " \"output_scores\": false,\n", " \"pad_token_id\": 50256,\n", " \"prefix\": null,\n", " \"problem_type\": null,\n", " \"pruned_heads\": {},\n", " \"remove_invalid_values\": false,\n", " \"reorder_and_upcast_attn\": false,\n", " \"repetition_penalty\": 1.0,\n", " \"resid_pdrop\": 0.1,\n", " \"return_dict\": true,\n", " \"return_dict_in_generate\": false,\n", " \"scale_attn_by_inverse_layer_idx\": false,\n", " \"scale_attn_weights\": true,\n", " \"sep_token_id\": null,\n", " \"summary_activation\": null,\n", " \"summary_first_dropout\": 0.1,\n", " \"summary_proj_to_labels\": true,\n", " \"summary_type\": \"cls_index\",\n", " \"summary_use_proj\": true,\n", " \"suppress_tokens\": null,\n", " \"task_specific_params\": {\n", " \"text-generation\": {\n", " \"do_sample\": true,\n", " \"max_length\": 50\n", " }\n", " },\n", " \"temperature\": 1.0,\n", " \"tf_legacy_loss\": false,\n", " \"tie_encoder_decoder\": false,\n", " \"tie_word_embeddings\": true,\n", " \"tokenizer_class\": null,\n", " \"top_k\": 50,\n", " \"top_p\": 1.0,\n", " \"torch_dtype\": null,\n", " \"torchscript\": false,\n", " \"typical_p\": 1.0,\n", " \"use_bfloat16\": false,\n", " \"use_cache\": true,\n", " \"vocab_size\": 50257\n", " },\n", " \"decoder_start_token_id\": 50256,\n", " \"encoder\": {\n", " \"_name_or_path\": \"\",\n", " \"add_cross_attention\": false,\n", " \"architectures\": [\n", " \"ViTModel\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.0,\n", " \"bad_words_ids\": null,\n", " \"begin_suppress_tokens\": null,\n", " \"bos_token_id\": null,\n", " \"chunk_size_feed_forward\": 0,\n", " \"cross_attention_hidden_size\": null,\n", " \"decoder_start_token_id\": null,\n", " \"diversity_penalty\": 0.0,\n", " \"do_sample\": false,\n", " \"early_stopping\": false,\n", " \"encoder_no_repeat_ngram_size\": 0,\n", " \"encoder_stride\": 16,\n", " \"eos_token_id\": null,\n", " \"exponential_decay_length_penalty\": null,\n", " \"finetuning_task\": null,\n", " \"forced_bos_token_id\": null,\n", " \"forced_eos_token_id\": null,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.0,\n", " \"hidden_size\": 768,\n", " \"id2label\": {\n", " \"0\": \"LABEL_0\",\n", " \"1\": \"LABEL_1\"\n", " },\n", " \"image_size\": 224,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"is_decoder\": false,\n", " \"is_encoder_decoder\": 
false,\n", " \"label2id\": {\n", " \"LABEL_0\": 0,\n", " \"LABEL_1\": 1\n", " },\n", " \"layer_norm_eps\": 1e-12,\n", " \"length_penalty\": 1.0,\n", " \"max_length\": 20,\n", " \"min_length\": 0,\n", " \"model_type\": \"vit\",\n", " \"no_repeat_ngram_size\": 0,\n", " \"num_attention_heads\": 12,\n", " \"num_beam_groups\": 1,\n", " \"num_beams\": 1,\n", " \"num_channels\": 3,\n", " \"num_hidden_layers\": 12,\n", " \"num_return_sequences\": 1,\n", " \"output_attentions\": false,\n", " \"output_hidden_states\": false,\n", " \"output_scores\": false,\n", " \"pad_token_id\": null,\n", " \"patch_size\": 16,\n", " \"prefix\": null,\n", " \"problem_type\": null,\n", " \"pruned_heads\": {},\n", " \"qkv_bias\": true,\n", " \"remove_invalid_values\": false,\n", " \"repetition_penalty\": 1.0,\n", " \"return_dict\": true,\n", " \"return_dict_in_generate\": false,\n", " \"sep_token_id\": null,\n", " \"suppress_tokens\": null,\n", " \"task_specific_params\": null,\n", " \"temperature\": 1.0,\n", " \"tf_legacy_loss\": false,\n", " \"tie_encoder_decoder\": false,\n", " \"tie_word_embeddings\": true,\n", " \"tokenizer_class\": null,\n", " \"top_k\": 50,\n", " \"top_p\": 1.0,\n", " \"torch_dtype\": null,\n", " \"torchscript\": false,\n", " \"typical_p\": 1.0,\n", " \"use_bfloat16\": false\n", " },\n", " \"eos_token_id\": 50256,\n", " \"is_encoder_decoder\": true,\n", " \"model_type\": \"vision-encoder-decoder\",\n", " \"pad_token_id\": 50256,\n", " \"tie_word_embeddings\": false,\n", " \"torch_dtype\": \"float32\",\n", " \"transformers_version\": \"4.35.2\"\n", "}\n", "\n" ] } ], "source": [ "# Display basic information about the loaded model\n", "print(model.config)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Image: ./img1.jpg\n", "Predicted Caption: a man and woman standing in front of a table full of food\n", "==================================================\n", "Image: ./img2.jpg\n", "Predicted Caption: a person sitting on a bench next to a body of water\n", "==================================================\n", "Image: ./img3.jpg\n", "Predicted Caption: a dog running in the grass with a frisbee in its mouth\n", "==================================================\n" ] } ], "source": [ "# Predict captions for multiple images and display the results\n", "image_paths = ['./img1.jpg', './img2.jpg', './img3.jpg'] # Replace with your image paths\n", "predictions = predict_step(image_paths)\n", "\n", "for idx, image_path in enumerate(image_paths):\n", " print(f\"Image: {image_path}\")\n", " print(f\"Predicted Caption: {predictions[idx]}\")\n", " print(\"=\" * 50)\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Inference Time for Single Image: 12.298733234405518 seconds\n" ] } ], "source": [ "import time\n", "\n", "# Measure model inference time for a single image\n", "start_time = time.time()\n", "image_path = './img2.jpg' # Replace with your image path\n", "_ = predict_step([image_path])\n", "inference_time = time.time() - start_time\n", "print(f\"Inference Time for Single Image: {inference_time} seconds\")\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 
'pruned_heads': {}, 'tie_word_embeddings': False, 'is_encoder_decoder': True, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['VisionEncoderDecoderModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 50256, 'pad_token_id': 50256, 'eos_token_id': 50256, 'sep_token_id': None, 'decoder_start_token_id': 50256, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'nlpconnect/vit-gpt2-image-captioning', 'transformers_version': '4.35.2', 'decoder': {'vocab_size': 50257, 'n_positions': 1024, 'n_embd': 768, 'n_layer': 12, 'n_head': 12, 'n_inner': None, 'activation_function': 'gelu_new', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'summary_type': 'cls_index', 'summary_use_proj': True, 'summary_activation': None, 'summary_first_dropout': 0.1, 'summary_proj_to_labels': True, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': True, 'cross_attention_hidden_size': None, 'add_cross_attention': True, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GPT2LMHeadModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'pad_token_id': 50256, 'sep_token_id': None, 'decoder_start_token_id': 50256, 'task_specific_params': {'text-generation': {'do_sample': True, 'max_length': 50}}, 'problem_type': None, '_name_or_path': '', 'n_ctx': 1024, 'model_type': 'gpt2'}, 'encoder': {'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': 
True, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['ViTModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '', 'hidden_size': 768, 'num_hidden_layers': 12, 'num_attention_heads': 12, 'intermediate_size': 3072, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.0, 'attention_probs_dropout_prob': 0.0, 'initializer_range': 0.02, 'layer_norm_eps': 1e-12, 'image_size': 224, 'patch_size': 16, 'num_channels': 3, 'qkv_bias': True, 'encoder_stride': 16, 'model_type': 'vit'}, 'model_type': 'vision-encoder-decoder'}\n" ] } ], "source": [ "# Display detailed configuration information of the loaded model\n", "print(model.config.to_dict())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Apply manual image preprocessing before passing an image to the model\n", "from torchvision import transforms\n", "\n", "# Define image transformations (these use the standard ImageNet mean/std; the checkpoint's own\n", "# feature extractor applies its own resizing and normalization, so prefer it for captioning)\n", "image_transforms = transforms.Compose([\n", "    transforms.Resize((256, 256)),\n", "    transforms.CenterCrop(224),\n", "    transforms.ToTensor(),\n", "    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", "])\n", "\n", "# Process a single image using the defined transformations\n", "image = Image.open('image_path.jpg').convert('RGB') # Replace with your image path\n", "image = image_transforms(image).unsqueeze(0) # Add a batch dimension so the tensor is model-ready\n" ] },
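{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Alternative preprocessing (illustrative sketch): reuse the model's own feature extractor so the\n", "# resizing and normalization match what the checkpoint expects, then caption the image directly.\n", "# Assumes `model`, `feature_extractor`, `tokenizer`, `device` and `gen_kwargs` from the earlier cells.\n", "image = Image.open('image_path.jpg').convert('RGB') # Replace with your image path\n", "pixel_values = feature_extractor(images=[image], return_tensors='pt').pixel_values.to(device)\n", "output_ids = model.generate(pixel_values, **gen_kwargs)\n", "caption = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()\n", "print(caption)\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }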