Upload 4 files
- README.md +63 -13
- app.py +162 -0
- model_utils.py +94 -0
- requirements.txt +8 -0
README.md
CHANGED
@@ -1,13 +1,63 @@
# Multi-Modal AI Demo

This project demonstrates multi-modal AI capabilities using Hugging Face pretrained models. The application provides the following features:

1. **Image Captioning**: Generate descriptive captions for images
2. **Visual Question Answering**: Answer questions about the content of images
3. **Sentiment Analysis**: Analyze the sentiment of text inputs

## Requirements

- Python 3.8+
- Dependencies listed in `requirements.txt`

## Installation

1. Clone this repository
2. Install dependencies and set up the application:
```
python run.py
```
Then select option 5 to perform the full setup (install requirements, fix dependencies, and download sample images).

## Known Issues and Solutions

If you encounter errors related to package compatibility (Pydantic, FastAPI, or Gradio), use:
```
python fix_dependencies.py
```
This will install compatible versions of all dependencies so that the application runs correctly.

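Equivalently, you can apply the pins by hand with pip; the versions below are the ones listed in this repository's `requirements.txt`:
```
pip install "gradio==3.50.2" "pydantic==1.10.8" "fastapi==0.103.2"
```
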
## Usage

Run the web interface:
```
python app.py
```

Then open your browser and navigate to the URL shown in the terminal (typically http://127.0.0.1:7860).

## Deploying to Hugging Face Spaces

To deploy this project to Hugging Face Spaces:

1. Create a new Space on [Hugging Face Spaces](https://huggingface.co/spaces)
2. Choose the "Gradio" SDK
3. Set up a GitHub repository with these files:
   - `app.py`
   - `model_utils.py`
   - `requirements.txt`
   - `README.md`
4. Push to the repository connected to your Space
5. Hugging Face will automatically deploy your application

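Note that a Gradio Space reads its configuration from a YAML block at the top of `README.md`. A minimal example is sketched below; the title comes from this project, the `sdk_version` matches the pin in `requirements.txt`, and the emoji and colors are illustrative placeholders:
```
---
title: Multi-Modal AI Demo
emoji: 🖼️
colorFrom: blue
colorTo: green
sdk: gradio
sdk_version: 3.50.2
app_file: app.py
pinned: false
---
```
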
## Models Used

This demo uses the following pretrained models from Hugging Face:
- Image Captioning: `nlpconnect/vit-gpt2-image-captioning`
- Visual Question Answering: `nlpconnect/vit-gpt2-image-captioning` (simplified)
- Sentiment Analysis: `distilbert-base-uncased-finetuned-sst-2-english`

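To sanity-check a model outside this app, the `transformers` pipeline API can load it directly. A minimal sketch for the sentiment model (the input text is arbitrary):
```
from transformers import pipeline

classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
# prints a list like [{'label': 'POSITIVE', 'score': ...}]
print(classifier("This demo is great!"))
```
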
## License

MIT
app.py
ADDED
@@ -0,0 +1,162 @@
import os
import gradio as gr
import matplotlib.pyplot as plt
from model_utils import (
    analyze_sentiment,
    answer_question,
    generate_caption,
    load_image_captioning_model,
    load_sentiment_model,
    load_vqa_model,
)

# Load models at startup
print("Loading models...")
image_caption_model, image_caption_processor, image_caption_tokenizer = (
    load_image_captioning_model()
)
vqa_model, vqa_processor, vqa_tokenizer = load_vqa_model()
sentiment_model, sentiment_tokenizer = load_sentiment_model()
print("Models loaded successfully!")


def image_caption_fn(image):
    # Save image temporarily
    temp_path = "temp_image.jpg"
    image.save(temp_path)

    # Generate caption
    caption = generate_caption(
        temp_path, image_caption_model, image_caption_processor, image_caption_tokenizer
    )

    # Clean up
    if os.path.exists(temp_path):
        os.remove(temp_path)

    return caption


def vqa_fn(image, question):
    # Save image temporarily
    temp_path = "temp_image.jpg"
    image.save(temp_path)

    # Answer question
    answer = answer_question(
        temp_path, question, vqa_model, vqa_processor, vqa_tokenizer
    )

    # Clean up
    if os.path.exists(temp_path):
        os.remove(temp_path)

    return answer


def sentiment_fn(text):
    sentiment, confidence = analyze_sentiment(
        text, sentiment_model, sentiment_tokenizer
    )
    confidence_percentage = f"{confidence:.2%}"

    # Create a simple bar chart for visualization
    labels = ["Negative", "Positive"]
    values = (
        [1 - confidence, confidence]
        if sentiment == "positive"
        else [confidence, 1 - confidence]
    )

    fig, ax = plt.subplots(figsize=(6, 3))
    bars = ax.bar(labels, values, color=["#FF6B6B", "#4ECDC4"])
    ax.set_ylim(0, 1)
    ax.set_title("Sentiment Analysis")

    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            height + 0.02,
            f"{height:.2f}",
            ha="center",
            va="bottom",
        )

    return f"Sentiment: {sentiment.upper()} (Confidence: {confidence_percentage})", fig


# Create the Gradio interface
with gr.Blocks(title="Multi-Modal AI Demo") as demo:
    gr.Markdown("# Multi-Modal AI Demo")
    gr.Markdown(
        "This application demonstrates multi-modal AI capabilities using Hugging Face models."
    )

    with gr.Tab("Image Captioning"):
        gr.Markdown("## Image Captioning")
        gr.Markdown("Upload an image to generate a descriptive caption.")

        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Input Image")
                caption_button = gr.Button("Generate Caption")

            with gr.Column():
                caption_output = gr.Textbox(
                    label="Generated Caption", interactive=False
                )

        caption_button.click(
            fn=image_caption_fn, inputs=[image_input], outputs=[caption_output]
        )

    with gr.Tab("Visual Question Answering"):
        gr.Markdown("## Visual Question Answering")
        gr.Markdown("Upload an image and ask a question about it.")

        with gr.Row():
            with gr.Column():
                vqa_image_input = gr.Image(type="pil", label="Input Image")
                vqa_question_input = gr.Textbox(label="Your Question")
                vqa_button = gr.Button("Get Answer")

            with gr.Column():
                vqa_output = gr.Textbox(label="Answer", interactive=False)

        vqa_button.click(
            fn=vqa_fn,
            inputs=[vqa_image_input, vqa_question_input],
            outputs=[vqa_output],
        )

    with gr.Tab("Sentiment Analysis"):
        gr.Markdown("## Sentiment Analysis")
        gr.Markdown("Enter some text to analyze its sentiment.")

        with gr.Row():
            with gr.Column():
                sentiment_input = gr.Textbox(label="Input Text")
                sentiment_button = gr.Button("Analyze Sentiment")

            with gr.Column():
                sentiment_output = gr.Textbox(label="Result", interactive=False)
                sentiment_plot = gr.Plot(label="Sentiment Distribution")

        sentiment_button.click(
            fn=sentiment_fn,
            inputs=[sentiment_input],
            outputs=[sentiment_output, sentiment_plot],
        )

    gr.Markdown("### About")
    gr.Markdown("""
    This demo uses the following pretrained models from Hugging Face:
    - Image Captioning: `nlpconnect/vit-gpt2-image-captioning`
    - Visual Question Answering: `nlpconnect/vit-gpt2-image-captioning` (simplified)
    - Sentiment Analysis: `distilbert-base-uncased-finetuned-sst-2-english`
    """)

# Launch the demo
if __name__ == "__main__":
    demo.launch(share=True)

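Because the handlers above are plain functions, they can be exercised without launching the Gradio UI. A minimal sketch, assuming the models load successfully and that a local test image exists at the hypothetical path `sample.jpg`:
```
from PIL import Image

import app  # importing app.py loads the models at import time

# Text-only path: no image required
result_text, _fig = app.sentiment_fn("The interface is easy to use.")
print(result_text)

# Image paths need a local test image (hypothetical filename)
image = Image.open("sample.jpg")
print(app.image_caption_fn(image))
print(app.vqa_fn(image, "What is in the picture?"))
```
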
model_utils.py
ADDED
@@ -0,0 +1,94 @@
import torch
from PIL import Image
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load image captioning model
def load_image_captioning_model():
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

    return model, feature_extractor, tokenizer


# Generate caption for an image
def generate_caption(image_path, model, feature_extractor, tokenizer):
    max_length = 16
    num_beams = 4

    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(
        device
    )

    with torch.no_grad():
        output_ids = model.generate(
            pixel_values, max_length=max_length, num_beams=num_beams
        )

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()


# Load visual question answering model
def load_vqa_model():
    # For simplicity, we'll use the same image captioning model
    # In a real application, you would use a dedicated VQA model
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

    return model, feature_extractor, tokenizer


# Answer a question about an image
def answer_question(image_path, question, model, feature_extractor, tokenizer):
    # This is a simplified version - in a real app, you'd use a proper VQA model
    # Here we just generate a caption and append it to a template
    caption = generate_caption(image_path, model, feature_extractor, tokenizer)
    return f"Based on the image which shows {caption}, I would say: {caption}"


# Load sentiment analysis model
def load_sentiment_model():
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return model, tokenizer


# Analyze sentiment of text
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
        device
    )

    with torch.no_grad():
        outputs = model(**inputs)

    scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    scores = scores.cpu().numpy()[0]

    # DistilBERT-SST2 has 2 labels: negative (0) and positive (1)
    sentiment = "positive" if scores[1] > scores[0] else "negative"
    confidence = float(max(scores))

    return sentiment, confidence

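The helpers in `model_utils.py` can also be used on their own. A minimal sketch that captions a single local image (the filename `sample.jpg` is a hypothetical placeholder):
```
from model_utils import generate_caption, load_image_captioning_model

model, feature_extractor, tokenizer = load_image_captioning_model()
print(generate_caption("sample.jpg", model, feature_extractor, tokenizer))
```
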
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
torchvision
transformers>=4.25.0
pillow>=9.0.0
gradio==3.50.2
matplotlib
pydantic==1.10.8
fastapi==0.103.2