Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- .gitignore +4 -0
- README.md +15 -12
- app.py +45 -154
- final.ipynb +0 -0
- gradio_ui.py +15 -0
- predict_caption.py +30 -0
- requirements.txt +90 -6
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/vit-gpt2-image-captioning
|
2 |
+
/venv
|
3 |
+
**/__pycache__
|
4 |
+
.qodo
|
README.md
CHANGED
@@ -1,12 +1,15 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
# Image Captioning
|
4 |
+
Generate a caption or description for your image, simply and straightforwardly, using the [Transformers](https://huggingface.co/docs/transformers/index) library.
|
5 |
+
|
6 |
+
### How to use
|
7 |
+
* Create a Python 3.9 virtual environment
|
8 |
+
* Install all the packages from the `requirements.txt` file
|
9 |
+
* Download the model and supporting files from [Huggingface](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning/tree/main)
|
10 |
+
|
11 |
+
### HuggingFace Model
|
12 |
+
* https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
|
13 |
+
|
14 |
+
### Run
|
15 |
+
Run the app with `python gradio_ui.py`.
|
app.py
CHANGED
@@ -1,154 +1,45 @@
|
|
1 |
-
import
|
2 |
-
import
|
3 |
-
import
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
image
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
width=width,
|
47 |
-
height=height,
|
48 |
-
generator=generator,
|
49 |
-
).images[0]
|
50 |
-
|
51 |
-
return image, seed
|
52 |
-
|
53 |
-
|
54 |
-
examples = [
|
55 |
-
"Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
|
56 |
-
"An astronaut riding a green horse",
|
57 |
-
"A delicious ceviche cheesecake slice",
|
58 |
-
]
|
59 |
-
|
60 |
-
css = """
|
61 |
-
#col-container {
|
62 |
-
margin: 0 auto;
|
63 |
-
max-width: 640px;
|
64 |
-
}
|
65 |
-
"""
|
66 |
-
|
67 |
-
with gr.Blocks(css=css) as demo:
|
68 |
-
with gr.Column(elem_id="col-container"):
|
69 |
-
gr.Markdown(" # Text-to-Image Gradio Template")
|
70 |
-
|
71 |
-
with gr.Row():
|
72 |
-
prompt = gr.Text(
|
73 |
-
label="Prompt",
|
74 |
-
show_label=False,
|
75 |
-
max_lines=1,
|
76 |
-
placeholder="Enter your prompt",
|
77 |
-
container=False,
|
78 |
-
)
|
79 |
-
|
80 |
-
run_button = gr.Button("Run", scale=0, variant="primary")
|
81 |
-
|
82 |
-
result = gr.Image(label="Result", show_label=False)
|
83 |
-
|
84 |
-
with gr.Accordion("Advanced Settings", open=False):
|
85 |
-
negative_prompt = gr.Text(
|
86 |
-
label="Negative prompt",
|
87 |
-
max_lines=1,
|
88 |
-
placeholder="Enter a negative prompt",
|
89 |
-
visible=False,
|
90 |
-
)
|
91 |
-
|
92 |
-
seed = gr.Slider(
|
93 |
-
label="Seed",
|
94 |
-
minimum=0,
|
95 |
-
maximum=MAX_SEED,
|
96 |
-
step=1,
|
97 |
-
value=0,
|
98 |
-
)
|
99 |
-
|
100 |
-
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
|
101 |
-
|
102 |
-
with gr.Row():
|
103 |
-
width = gr.Slider(
|
104 |
-
label="Width",
|
105 |
-
minimum=256,
|
106 |
-
maximum=MAX_IMAGE_SIZE,
|
107 |
-
step=32,
|
108 |
-
value=1024, # Replace with defaults that work for your model
|
109 |
-
)
|
110 |
-
|
111 |
-
height = gr.Slider(
|
112 |
-
label="Height",
|
113 |
-
minimum=256,
|
114 |
-
maximum=MAX_IMAGE_SIZE,
|
115 |
-
step=32,
|
116 |
-
value=1024, # Replace with defaults that work for your model
|
117 |
-
)
|
118 |
-
|
119 |
-
with gr.Row():
|
120 |
-
guidance_scale = gr.Slider(
|
121 |
-
label="Guidance scale",
|
122 |
-
minimum=0.0,
|
123 |
-
maximum=10.0,
|
124 |
-
step=0.1,
|
125 |
-
value=0.0, # Replace with defaults that work for your model
|
126 |
-
)
|
127 |
-
|
128 |
-
num_inference_steps = gr.Slider(
|
129 |
-
label="Number of inference steps",
|
130 |
-
minimum=1,
|
131 |
-
maximum=50,
|
132 |
-
step=1,
|
133 |
-
value=2, # Replace with defaults that work for your model
|
134 |
-
)
|
135 |
-
|
136 |
-
gr.Examples(examples=examples, inputs=[prompt])
|
137 |
-
gr.on(
|
138 |
-
triggers=[run_button.click, prompt.submit],
|
139 |
-
fn=infer,
|
140 |
-
inputs=[
|
141 |
-
prompt,
|
142 |
-
negative_prompt,
|
143 |
-
seed,
|
144 |
-
randomize_seed,
|
145 |
-
width,
|
146 |
-
height,
|
147 |
-
guidance_scale,
|
148 |
-
num_inference_steps,
|
149 |
-
],
|
150 |
-
outputs=[result, seed],
|
151 |
-
)
|
152 |
-
|
153 |
-
if __name__ == "__main__":
|
154 |
-
demo.launch()
|
|
|
1 |
+
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
|
2 |
+
import torch
|
3 |
+
from PIL import Image
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
# Repository id from which the captioning checkpoint, the image processor
# and the tokenizer are all loaded.
model_name = "aryan083/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(model_name)
# Use ViTImageProcessor, the class this file actually imports. The previous
# reference to ViTFeatureExtractor was never imported and raised NameError
# as soon as the script started.
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
|
13 |
+
|
14 |
+
def predict_caption(image):
    """Generate a caption for a single image.

    Args:
        image: The image to describe. The Gradio input that feeds this
            function is declared with ``type="pil"``; ``None`` is passed
            through untouched.

    Returns:
        The first decoded caption with surrounding whitespace stripped, or
        ``None`` when ``image`` is ``None``.
    """
    if image is None:
        return None

    # NOTE(review): the ViT processor is presumed to expect 3-channel input.
    # Normalise other PIL modes (e.g. RGBA, L) so the function also works
    # when called outside the Gradio interface -- confirm against the model.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    # The processor takes a batch; wrap the single image in a list.
    pixel_values = feature_extractor(images=[image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # do_sample=True means repeated calls on the same image may differ.
    output_ids = model.generate(
        pixel_values,
        do_sample=True,
        max_length=16,
        num_beams=4,
        temperature=0.7,
    )

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()
|
34 |
+
|
35 |
+
# Assemble the Gradio UI: one image input, one text output, wired to
# predict_caption.
_image_input = gr.Image(type="pil")
_caption_output = gr.Textbox(label="Generated Caption")

iface = gr.Interface(
    fn=predict_caption,
    inputs=_image_input,
    outputs=_caption_output,
    title="Image Captioning",
    description="Upload an image and get its description generated using ViT-GPT2",
    # examples=[["assets/example1.jpg"]]  # Add example images if you have any
)

# Start serving the interface as soon as the script is executed.
iface.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
final.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
gradio_ui.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
from predict_caption import predict_step
|
4 |
+
|
5 |
+
# Minimal UI: uploading an image runs predict_step and shows the caption.
with gr.Blocks() as demo:
    # predict_step() opens its argument with PIL's Image.open(), so hand it
    # the uploaded file's path rather than an already-decoded PIL image
    # (which Image.open() cannot read).
    image = gr.Image(type='filepath', label='Image')
    label = gr.Text(label='Generated Caption')
    image.upload(
        predict_step,
        [image],
        [label]
    )

if __name__ == '__main__':
    demo.launch()
|
predict_caption.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
|
2 |
+
import torch
|
3 |
+
from PIL import Image
|
4 |
+
|
5 |
+
# Load the captioning checkpoint together with its image pre-processor and
# tokenizer from the Hugging Face repository named below.
model_name = "aryan083/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prefer CUDA when it is present; otherwise inference runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Generation settings passed to model.generate() by predict_step().
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)
|
17 |
+
|
18 |
+
def predict_step(image_path):
    """Generate a caption for one image.

    Args:
        image_path: Anything ``PIL.Image.open`` accepts (a path or a binary
            file object). An already-decoded ``PIL.Image.Image`` is accepted
            as well, so callers that hold an image in memory do not have to
            write it to disk first.

    Returns:
        The first decoded caption with surrounding whitespace stripped.
    """
    if isinstance(image_path, Image.Image):
        image = image_path.convert('RGB')
    else:
        # convert() returns a fully loaded image, so the file handle opened
        # by Image.open() can be closed as soon as the block exits.
        with Image.open(image_path) as opened:
            image = opened.convert('RGB')
    # NOTE(review): conversion to RGB presumes the ViT pre-processor expects
    # 3-channel input (greyscale/RGBA files would otherwise reach it as-is)
    # -- confirm against the model card.

    pixel_values = feature_extractor(images=image, return_tensors='pt').pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()
|
27 |
+
|
28 |
+
# Example usage with your image file. Guarded so that importing this module
# for predict_step does not run inference on the sample image (or fail when
# that file is absent).
if __name__ == '__main__':
    image_path = 'jon-parry-C8eSYwQkwHw-unsplash.jpg'
    print(predict_step(image_path=image_path))
|
requirements.txt
CHANGED
@@ -1,6 +1,90 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.1.0
|
2 |
+
aiohttp==3.8.4
|
3 |
+
aiosignal==1.3.1
|
4 |
+
altair==4.2.2
|
5 |
+
anyio==3.6.2
|
6 |
+
async-timeout==4.0.2
|
7 |
+
attrs==23.1.0
|
8 |
+
autopep8==2.0.2
|
9 |
+
certifi==2022.12.7
|
10 |
+
charset-normalizer==3.1.0
|
11 |
+
click==8.1.3
|
12 |
+
cmake==3.26.3
|
13 |
+
contourpy==1.0.7
|
14 |
+
cycler==0.11.0
|
15 |
+
entrypoints==0.4
|
16 |
+
fastapi==0.95.1
|
17 |
+
ffmpy==0.3.0
|
18 |
+
filelock==3.12.0
|
19 |
+
fonttools==4.39.3
|
20 |
+
frozenlist==1.3.3
|
21 |
+
fsspec==2023.4.0
|
22 |
+
gradio==3.28.0
|
23 |
+
gradio_client==0.1.4
|
24 |
+
h11==0.14.0
|
25 |
+
httpcore==0.17.0
|
26 |
+
httpx==0.24.0
|
27 |
+
huggingface-hub==0.14.1
|
28 |
+
idna==3.4
|
29 |
+
importlib-resources==5.12.0
|
30 |
+
Jinja2==3.1.2
|
31 |
+
jsonschema==4.17.3
|
32 |
+
kiwisolver==1.4.4
|
33 |
+
linkify-it-py==2.0.0
|
34 |
+
lit==16.0.2
|
35 |
+
markdown-it-py==2.2.0
|
36 |
+
MarkupSafe==2.1.2
|
37 |
+
matplotlib==3.7.1
|
38 |
+
mdit-py-plugins==0.3.3
|
39 |
+
mdurl==0.1.2
|
40 |
+
mpmath==1.3.0
|
41 |
+
multidict==6.0.4
|
42 |
+
networkx==3.1
|
43 |
+
numpy==1.24.3
|
44 |
+
nvidia-cublas-cu11==11.10.3.66
|
45 |
+
nvidia-cuda-cupti-cu11==11.7.101
|
46 |
+
nvidia-cuda-nvrtc-cu11==11.7.99
|
47 |
+
nvidia-cuda-runtime-cu11==11.7.99
|
48 |
+
nvidia-cudnn-cu11==8.5.0.96
|
49 |
+
nvidia-cufft-cu11==10.9.0.58
|
50 |
+
nvidia-curand-cu11==10.2.10.91
|
51 |
+
nvidia-cusolver-cu11==11.4.0.1
|
52 |
+
nvidia-cusparse-cu11==11.7.4.91
|
53 |
+
nvidia-nccl-cu11==2.14.3
|
54 |
+
nvidia-nvtx-cu11==11.7.91
|
55 |
+
orjson==3.8.11
|
56 |
+
packaging==23.1
|
57 |
+
pandas==2.0.1
|
58 |
+
Pillow==9.5.0
|
59 |
+
pycodestyle==2.10.0
|
60 |
+
pydantic==1.10.7
|
61 |
+
pydub==0.25.1
|
62 |
+
pyparsing==3.0.9
|
63 |
+
pyrsistent==0.19.3
|
64 |
+
python-dateutil==2.8.2
|
65 |
+
python-multipart==0.0.6
|
66 |
+
pytz==2023.3
|
67 |
+
PyYAML==6.0
|
68 |
+
regex==2023.3.23
|
69 |
+
requests==2.29.0
|
70 |
+
semantic-version==2.10.0
|
71 |
+
six==1.16.0
|
72 |
+
sniffio==1.3.0
|
73 |
+
starlette==0.26.1
|
74 |
+
sympy==1.11.1
|
75 |
+
tokenizers==0.13.3
|
76 |
+
tomli==2.0.1
|
77 |
+
toolz==0.12.0
|
78 |
+
torch==2.0.0
|
79 |
+
torchvision==0.15.1
|
80 |
+
tqdm==4.65.0
|
81 |
+
transformers==4.28.1
|
82 |
+
triton==2.0.0
|
83 |
+
typing_extensions==4.5.0
|
84 |
+
tzdata==2023.3
|
85 |
+
uc-micro-py==1.0.1
|
86 |
+
urllib3==1.26.15
|
87 |
+
uvicorn==0.22.0
|
88 |
+
websockets==11.0.2
|
89 |
+
yarl==1.9.2
|
90 |
+
zipp==3.15.0
|