Spaces:

aryan083
/

Image-Caption

Sleeping

App Files Files Community

aryan083 committed on Mar 20

Commit

3bfd95f

verified ·

1 Parent(s): 1f5aaa2

Upload 7 files

Browse files

Files changed (7) hide show

.gitignore +4 -0
README.md +15 -12
app.py +45 -154
final.ipynb +0 -0
gradio_ui.py +15 -0
predict_caption.py +30 -0
requirements.txt +90 -6

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+/vit-gpt2-image-captioning
+/venv
+**/__pycache__
+.qodo

README.md CHANGED Viewed

@@ -1,12 +1,15 @@
----
-title: Image Caption
-emoji: 🖼
-colorFrom: purple
-colorTo: red
-sdk: gradio
-sdk_version: 5.0.1
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# Image Captioning
+Generate a caption/description for your image, simply and straightforwardly, using the [Transformers](https://huggingface.co/docs/transformers/index) library.
+### How to use
+* Create a Python 3.9 virtual environment
+* Install all the packages from the `requirements.txt` file
+* Download the model and supporting files from [Huggingface](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning/tree/main)
+### HuggingFace Model
+* https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
+### Run
+Run `python gradio_ui.py` to start the app.

app.py CHANGED Viewed

@@ -1,154 +1,45 @@
-import gradio as gr
-import numpy as np
-import random
-# import spaces #[uncomment to use ZeroGPU]
-from diffusers import DiffusionPipeline
-import torch
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/sdxl-turbo"  # Replace to the model you would like to use
-if torch.cuda.is_available():
-    torch_dtype = torch.float16
-else:
-    torch_dtype = torch.float32
-pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
-pipe = pipe.to(device)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
-# @spaces.GPU #[uncomment to use ZeroGPU]
-def infer(
-    prompt,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    progress=gr.Progress(track_tqdm=True),
-):
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    image = pipe(
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        width=width,
-        height=height,
-        generator=generator,
-    ).images[0]
-    return image, seed
-examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
-]
-css = """
-#col-container {
-    margin: 0 auto;
-    max-width: 640px;
-}
-"""
-with gr.Blocks(css=css) as demo:
-    with gr.Column(elem_id="col-container"):
-        gr.Markdown(" # Text-to-Image Gradio Template")
-        with gr.Row():
-            prompt = gr.Text(
-                label="Prompt",
-                show_label=False,
-                max_lines=1,
-                placeholder="Enter your prompt",
-                container=False,
-            )
-            run_button = gr.Button("Run", scale=0, variant="primary")
-        result = gr.Image(label="Result", show_label=False)
-        with gr.Accordion("Advanced Settings", open=False):
-            negative_prompt = gr.Text(
-                label="Negative prompt",
-                max_lines=1,
-                placeholder="Enter a negative prompt",
-                visible=False,
-            )
-            seed = gr.Slider(
-                label="Seed",
-                minimum=0,
-                maximum=MAX_SEED,
-                step=1,
-                value=0,
-            )
-            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-            with gr.Row():
-                width = gr.Slider(
-                    label="Width",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-                height = gr.Slider(
-                    label="Height",
-                    minimum=256,
-                    maximum=MAX_IMAGE_SIZE,
-                    step=32,
-                    value=1024,  # Replace with defaults that work for your model
-                )
-            with gr.Row():
-                guidance_scale = gr.Slider(
-                    label="Guidance scale",
-                    minimum=0.0,
-                    maximum=10.0,
-                    step=0.1,
-                    value=0.0,  # Replace with defaults that work for your model
-                )
-                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
-                    minimum=1,
-                    maximum=50,
-                    step=1,
-                    value=2,  # Replace with defaults that work for your model
-                )
-        gr.Examples(examples=examples, inputs=[prompt])
-    gr.on(
-        triggers=[run_button.click, prompt.submit],
-        fn=infer,
-        inputs=[
-            prompt,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-        ],
-        outputs=[result, seed],
-    )
-if __name__ == "__main__":
-    demo.launch()

+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+import torch
+from PIL import Image
+import gradio as gr
+model_name = "aryan083/vit-gpt2-image-captioning"
+model = VisionEncoderDecoderModel.from_pretrained(model_name)
+feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model.to(device)
+def predict_caption(image):
+    if image is None:
+        return None
+    images = []
+    images.append(image)
+    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
+    pixel_values = pixel_values.to(device)
+    output_ids = model.generate(
+        pixel_values,
+        do_sample=True,
+        max_length=16,
+        num_beams=4,
+        temperature=0.7
+    )
+    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+    return preds[0].strip()
+# Create Gradio interface
+iface = gr.Interface(
+    fn=predict_caption,
+    inputs=gr.Image(type="pil"),
+    outputs=gr.Textbox(label="Generated Caption"),
+    title="Image Captioning",
+    description="Upload an image and get its description generated using ViT-GPT2",
+    # examples=[["assets/example1.jpg"]]  # Add example images if you have any
+)
+iface.launch()

final.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

gradio_ui.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import gradio as gr
+from predict_caption import predict_step
+with gr.Blocks() as demo:
+    image = gr.Image(type='pil', label='Image')
+    label = gr.Text(label='Generated Caption')
+    image.upload(
+        predict_step,
+        [image],
+        [label]
+    )
+if __name__ == '__main__':
+    demo.launch()

predict_caption.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
+import torch
+from PIL import Image
+# Load model and tokenizer from the Hugging Face repository
+model_name = "aryan083/vit-gpt2-image-captioning"
+model = VisionEncoderDecoderModel.from_pretrained(model_name)
+feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+model.to(device)
+max_length = 16
+num_beams = 4
+gen_kwargs = {'max_length': max_length, 'num_beams': num_beams}
+def predict_step(image_path):
+    image = Image.open(image_path)
+    pixel_values = feature_extractor(images=image, return_tensors='pt').pixel_values
+    pixel_values = pixel_values.to(device)
+    output_ids = model.generate(pixel_values, **gen_kwargs)
+    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+    preds = [pred.strip() for pred in preds]
+    return preds[0]
+# Example usage with your image file
+image_path = 'jon-parry-C8eSYwQkwHw-unsplash.jpg'
+print(predict_step(image_path=image_path))

requirements.txt CHANGED Viewed

@@ -1,6 +1,90 @@
-accelerate
-diffusers
-invisible_watermark
-torch
-transformers
-xformers

+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+anyio==3.6.2
+async-timeout==4.0.2
+attrs==23.1.0
+autopep8==2.0.2
+certifi==2022.12.7
+charset-normalizer==3.1.0
+click==8.1.3
+cmake==3.26.3
+contourpy==1.0.7
+cycler==0.11.0
+entrypoints==0.4
+fastapi==0.95.1
+ffmpy==0.3.0
+filelock==3.12.0
+fonttools==4.39.3
+frozenlist==1.3.3
+fsspec==2023.4.0
+gradio==3.28.0
+gradio_client==0.1.4
+h11==0.14.0
+httpcore==0.17.0
+httpx==0.24.0
+huggingface-hub==0.14.1
+idna==3.4
+importlib-resources==5.12.0
+Jinja2==3.1.2
+jsonschema==4.17.3
+kiwisolver==1.4.4
+linkify-it-py==2.0.0
+lit==16.0.2
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.4
+networkx==3.1
+numpy==1.24.3
+nvidia-cublas-cu11==11.10.3.66
+nvidia-cuda-cupti-cu11==11.7.101
+nvidia-cuda-nvrtc-cu11==11.7.99
+nvidia-cuda-runtime-cu11==11.7.99
+nvidia-cudnn-cu11==8.5.0.96
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.2.10.91
+nvidia-cusolver-cu11==11.4.0.1
+nvidia-cusparse-cu11==11.7.4.91
+nvidia-nccl-cu11==2.14.3
+nvidia-nvtx-cu11==11.7.91
+orjson==3.8.11
+packaging==23.1
+pandas==2.0.1
+Pillow==9.5.0
+pycodestyle==2.10.0
+pydantic==1.10.7
+pydub==0.25.1
+pyparsing==3.0.9
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-multipart==0.0.6
+pytz==2023.3
+PyYAML==6.0
+regex==2023.3.23
+requests==2.29.0
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+starlette==0.26.1
+sympy==1.11.1
+tokenizers==0.13.3
+tomli==2.0.1
+toolz==0.12.0
+torch==2.0.0
+torchvision==0.15.1
+tqdm==4.65.0
+transformers==4.28.1
+triton==2.0.0
+typing_extensions==4.5.0
+tzdata==2023.3
+uc-micro-py==1.0.1
+urllib3==1.26.15
+uvicorn==0.22.0
+websockets==11.0.2
+yarl==1.9.2
+zipp==3.15.0