diff --git "a/final.ipynb" "b/final.ipynb" new file mode 100644--- /dev/null +++ "b/final.ipynb" @@ -0,0 +1,589 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\student\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From C:\\Users\\student\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n", + "\n" + ] + } + ], + "source": [ + "from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer\n", + "import torch\n", + "from PIL import Image\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "config.json: 100%|██████████| 4.61k/4.61k [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicted caption: two girls playing in the water on a beach\n" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "image_path = './3747543364_bf5b548527.jpg' \n", + "sample_prediction = predict_step([image_path])\n", + "\n", + "image = Image.open(image_path)\n", + "plt.imshow(image)\n", + "plt.axis('off')\n", + "plt.show()\n", + "\n", + "print(f\"Predicted caption: {sample_prediction[0]}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['two girls playing in the water on a beach']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict_step(['./3747543364_bf5b548527.jpg']) " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predictions with beam size 2: ['a man riding a horse on top of a beach']\n", + "Predictions with beam size 4: ['a man riding a horse on top of a beach']\n", + "Predictions with beam size 6: ['a man riding a horse on top of a beach']\n", + "Predictions with beam size 8: ['a man riding a horse on top of a beach']\n" + ] + } + ], + "source": [ + "# Experiment with different beam sizes\n", + "beam_sizes = [2, 4, 6, 8]\n", + "for beam_size in beam_sizes:\n", + " gen_kwargs[\"num_beams\"] = beam_size\n", + " \n", + " predictions = predict_step(['./sample2.jpg']) # Replace with your image path\n", + " print(f\"Predictions with beam size {beam_size}: {predictions}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "VisionEncoderDecoderConfig {\n", + " \"_name_or_path\": \"nlpconnect/vit-gpt2-image-captioning\",\n", + " \"architectures\": [\n", + " \"VisionEncoderDecoderModel\"\n", + " ],\n", + " \"bos_token_id\": 50256,\n", + " 
\"decoder\": {\n", + " \"_name_or_path\": \"\",\n", + " \"activation_function\": \"gelu_new\",\n", + " \"add_cross_attention\": true,\n", + " \"architectures\": [\n", + " \"GPT2LMHeadModel\"\n", + " ],\n", + " \"attn_pdrop\": 0.1,\n", + " \"bad_words_ids\": null,\n", + " \"begin_suppress_tokens\": null,\n", + " \"bos_token_id\": 50256,\n", + " \"chunk_size_feed_forward\": 0,\n", + " \"cross_attention_hidden_size\": null,\n", + " \"decoder_start_token_id\": 50256,\n", + " \"diversity_penalty\": 0.0,\n", + " \"do_sample\": false,\n", + " \"early_stopping\": false,\n", + " \"embd_pdrop\": 0.1,\n", + " \"encoder_no_repeat_ngram_size\": 0,\n", + " \"eos_token_id\": 50256,\n", + " \"exponential_decay_length_penalty\": null,\n", + " \"finetuning_task\": null,\n", + " \"forced_bos_token_id\": null,\n", + " \"forced_eos_token_id\": null,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"initializer_range\": 0.02,\n", + " \"is_decoder\": true,\n", + " \"is_encoder_decoder\": false,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_epsilon\": 1e-05,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"gpt2\",\n", + " \"n_ctx\": 1024,\n", + " \"n_embd\": 768,\n", + " \"n_head\": 12,\n", + " \"n_inner\": null,\n", + " \"n_layer\": 12,\n", + " \"n_positions\": 1024,\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_beam_groups\": 1,\n", + " \"num_beams\": 1,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_scores\": false,\n", + " \"pad_token_id\": 50256,\n", + " \"prefix\": null,\n", + " \"problem_type\": null,\n", + " \"pruned_heads\": {},\n", + " \"remove_invalid_values\": false,\n", + " \"reorder_and_upcast_attn\": false,\n", + " \"repetition_penalty\": 1.0,\n", + " \"resid_pdrop\": 0.1,\n", + " \"return_dict\": true,\n", + " \"return_dict_in_generate\": false,\n", + " \"scale_attn_by_inverse_layer_idx\": false,\n", + " \"scale_attn_weights\": true,\n", + " \"sep_token_id\": null,\n", + " \"summary_activation\": null,\n", + " \"summary_first_dropout\": 0.1,\n", + " \"summary_proj_to_labels\": true,\n", + " \"summary_type\": \"cls_index\",\n", + " \"summary_use_proj\": true,\n", + " \"suppress_tokens\": null,\n", + " \"task_specific_params\": {\n", + " \"text-generation\": {\n", + " \"do_sample\": true,\n", + " \"max_length\": 50\n", + " }\n", + " },\n", + " \"temperature\": 1.0,\n", + " \"tf_legacy_loss\": false,\n", + " \"tie_encoder_decoder\": false,\n", + " \"tie_word_embeddings\": true,\n", + " \"tokenizer_class\": null,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torch_dtype\": null,\n", + " \"torchscript\": false,\n", + " \"typical_p\": 1.0,\n", + " \"use_bfloat16\": false,\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 50257\n", + " },\n", + " \"decoder_start_token_id\": 50256,\n", + " \"encoder\": {\n", + " \"_name_or_path\": \"\",\n", + " \"add_cross_attention\": false,\n", + " \"architectures\": [\n", + " \"ViTModel\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.0,\n", + " \"bad_words_ids\": null,\n", + " \"begin_suppress_tokens\": null,\n", + " \"bos_token_id\": null,\n", + " \"chunk_size_feed_forward\": 0,\n", + " \"cross_attention_hidden_size\": null,\n", + " \"decoder_start_token_id\": null,\n", + " \"diversity_penalty\": 0.0,\n", + " \"do_sample\": false,\n", + " \"early_stopping\": false,\n", + " 
\"encoder_no_repeat_ngram_size\": 0,\n", + " \"encoder_stride\": 16,\n", + " \"eos_token_id\": null,\n", + " \"exponential_decay_length_penalty\": null,\n", + " \"finetuning_task\": null,\n", + " \"forced_bos_token_id\": null,\n", + " \"forced_eos_token_id\": null,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.0,\n", + " \"hidden_size\": 768,\n", + " \"id2label\": {\n", + " \"0\": \"LABEL_0\",\n", + " \"1\": \"LABEL_1\"\n", + " },\n", + " \"image_size\": 224,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"is_decoder\": false,\n", + " \"is_encoder_decoder\": false,\n", + " \"label2id\": {\n", + " \"LABEL_0\": 0,\n", + " \"LABEL_1\": 1\n", + " },\n", + " \"layer_norm_eps\": 1e-12,\n", + " \"length_penalty\": 1.0,\n", + " \"max_length\": 20,\n", + " \"min_length\": 0,\n", + " \"model_type\": \"vit\",\n", + " \"no_repeat_ngram_size\": 0,\n", + " \"num_attention_heads\": 12,\n", + " \"num_beam_groups\": 1,\n", + " \"num_beams\": 1,\n", + " \"num_channels\": 3,\n", + " \"num_hidden_layers\": 12,\n", + " \"num_return_sequences\": 1,\n", + " \"output_attentions\": false,\n", + " \"output_hidden_states\": false,\n", + " \"output_scores\": false,\n", + " \"pad_token_id\": null,\n", + " \"patch_size\": 16,\n", + " \"prefix\": null,\n", + " \"problem_type\": null,\n", + " \"pruned_heads\": {},\n", + " \"qkv_bias\": true,\n", + " \"remove_invalid_values\": false,\n", + " \"repetition_penalty\": 1.0,\n", + " \"return_dict\": true,\n", + " \"return_dict_in_generate\": false,\n", + " \"sep_token_id\": null,\n", + " \"suppress_tokens\": null,\n", + " \"task_specific_params\": null,\n", + " \"temperature\": 1.0,\n", + " \"tf_legacy_loss\": false,\n", + " \"tie_encoder_decoder\": false,\n", + " \"tie_word_embeddings\": true,\n", + " \"tokenizer_class\": null,\n", + " \"top_k\": 50,\n", + " \"top_p\": 1.0,\n", + " \"torch_dtype\": null,\n", + " \"torchscript\": false,\n", + " \"typical_p\": 1.0,\n", + " \"use_bfloat16\": false\n", + " },\n", + " \"eos_token_id\": 50256,\n", + " \"is_encoder_decoder\": true,\n", + " \"model_type\": \"vision-encoder-decoder\",\n", + " \"pad_token_id\": 50256,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.35.2\"\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "# Display basic information about the loaded model\n", + "print(model.config)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Image: ./img1.jpg\n", + "Predicted Caption: a man and woman standing in front of a table full of food\n", + "==================================================\n", + "Image: ./img2.jpg\n", + "Predicted Caption: a person sitting on a bench next to a body of water\n", + "==================================================\n", + "Image: ./img3.jpg\n", + "Predicted Caption: a dog running in the grass with a frisbee in its mouth\n", + "==================================================\n" + ] + } + ], + "source": [ + "# Predict captions for multiple images and display the results\n", + "image_paths = ['./img1.jpg', './img2.jpg', './img3.jpg'] # Replace with your image paths\n", + "predictions = predict_step(image_paths)\n", + "\n", + "for idx, image_path in enumerate(image_paths):\n", + " print(f\"Image: {image_path}\")\n", + " print(f\"Predicted Caption: {predictions[idx]}\")\n", + " print(\"=\" * 50)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 
15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Inference Time for Single Image: 12.298733234405518 seconds\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "# Measure model inference time for a single image\n", + "start_time = time.time()\n", + "image_path = './img2.jpg' # Replace with your image path\n", + "_ = predict_step([image_path])\n", + "inference_time = time.time() - start_time\n", + "print(f\"Inference Time for Single Image: {inference_time} seconds\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float32', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'is_encoder_decoder': True, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['VisionEncoderDecoderModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 50256, 'pad_token_id': 50256, 'eos_token_id': 50256, 'sep_token_id': None, 'decoder_start_token_id': 50256, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'nlpconnect/vit-gpt2-image-captioning', 'transformers_version': '4.35.2', 'decoder': {'vocab_size': 50257, 'n_positions': 1024, 'n_embd': 768, 'n_layer': 12, 'n_head': 12, 'n_inner': None, 'activation_function': 'gelu_new', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'summary_type': 'cls_index', 'summary_use_proj': True, 'summary_activation': None, 'summary_first_dropout': 0.1, 'summary_proj_to_labels': True, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': True, 'cross_attention_hidden_size': None, 'add_cross_attention': True, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 
'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['GPT2LMHeadModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'pad_token_id': 50256, 'sep_token_id': None, 'decoder_start_token_id': 50256, 'task_specific_params': {'text-generation': {'do_sample': True, 'max_length': 50}}, 'problem_type': None, '_name_or_path': '', 'n_ctx': 1024, 'model_type': 'gpt2'}, 'encoder': {'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['ViTModel'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': None, 'eos_token_id': None, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': '', 'hidden_size': 768, 'num_hidden_layers': 12, 'num_attention_heads': 12, 'intermediate_size': 3072, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.0, 'attention_probs_dropout_prob': 0.0, 'initializer_range': 0.02, 'layer_norm_eps': 1e-12, 'image_size': 224, 'patch_size': 16, 'num_channels': 3, 'qkv_bias': True, 'encoder_stride': 16, 'model_type': 'vit'}, 'model_type': 'vision-encoder-decoder'}\n" + ] + } + ], + "source": [ + "# Display detailed configuration information of the loaded model\n", + "print(model.config.to_dict())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Apply image preprocessing before passing them to the model\n", + "from torchvision import transforms\n", + "\n", + "# Define image transformations\n", + "image_transforms = transforms.Compose([\n", + " transforms.Resize((256, 256)),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + "])\n", + "\n", + "# Process image using defined transformations\n", + "image = Image.open('image_path.jpg') # Replace with your image path\n", + "image = image_transforms(image)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}