Amarthya7 committed
Commit ae1d6c7 · verified · 1 Parent(s): 003dc11

Upload 4 files
Files changed (4)
  1. README.md +63 -13
  2. app.py +162 -0
  3. model_utils.py +94 -0
  4. requirements.txt +8 -0
README.md CHANGED
@@ -1,13 +1,63 @@
- ---
- title: Multi Model Ai Demo
- emoji: 😻
- colorFrom: yellow
- colorTo: green
- sdk: gradio
- sdk_version: 5.20.1
- app_file: app.py
- pinned: false
- short_description: Multimodal AI demo
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Multi-Modal AI Demo
+
+ This project demonstrates multi-modal AI capabilities using Hugging Face pretrained models. The application provides the following features:
+
+ 1. **Image Captioning**: Generate descriptive captions for images
+ 2. **Visual Question Answering**: Answer questions about the content of images
+ 3. **Sentiment Analysis**: Analyze the sentiment of text inputs
+
+ ## Requirements
+
+ - Python 3.8+
+ - Dependencies listed in `requirements.txt`
+
+ ## Installation
+
+ 1. Clone this repository
+ 2. Install dependencies and set up the application:
+ ```
+ python run.py
+ ```
+ Then select option 5 to perform a full setup (install requirements, fix dependencies, and download sample images).
+
+ ## Known Issues and Solutions
+
+ If you encounter package-compatibility errors (from Pydantic, FastAPI, or Gradio), run:
+ ```
+ python fix_dependencies.py
+ ```
+ This installs compatible versions of all dependencies so the application runs correctly.
+
+ ## Usage
+
+ Run the web interface:
+ ```
+ python app.py
+ ```
+
+ Then open your browser and navigate to the URL shown in the terminal (typically http://127.0.0.1:7860).
+
+ ## Deploying to Hugging Face Spaces
+
+ To deploy this project to Hugging Face Spaces (a programmatic alternative is sketched after these steps):
+
+ 1. Create a new Space on [Hugging Face Spaces](https://huggingface.co/spaces)
+ 2. Choose the "Gradio" SDK
+ 3. Set up a GitHub repository with these files:
+    - `app.py`
+    - `model_utils.py`
+    - `requirements.txt`
+    - `README.md`
+ 4. Push to the repository connected to your Space
+ 5. Hugging Face will automatically deploy your application
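The steps above go through the web UI and a linked GitHub repository. As a minimal sketch (not part of this commit), the same four files could also be pushed with `huggingface_hub`; the Space id below is a placeholder, and the Space from step 1 is assumed to exist already.

```python
# Hedged sketch: programmatic upload of this demo to an existing Space.
# Assumes you are authenticated (huggingface-cli login or HF_TOKEN).
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path=".",                              # directory with app.py, model_utils.py, etc.
    repo_id="your-username/multi-model-ai-demo",  # placeholder Space id
    repo_type="space",
)
```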
+
+ ## Models Used
+
+ This demo uses the following pretrained models from Hugging Face (a usage sketch follows the list):
+ - Image Captioning: `nlpconnect/vit-gpt2-image-captioning`
+ - Visual Question Answering: `nlpconnect/vit-gpt2-image-captioning` (simplified)
+ - Sentiment Analysis: `distilbert-base-uncased-finetuned-sst-2-english`
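For reference, the listed checkpoints can also be exercised directly through the `transformers` pipeline API, separately from the app code. This is a minimal sketch; the example inputs are placeholders.

```python
# Hedged sketch: querying the listed checkpoints with transformers pipelines.
from transformers import pipeline

captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

print(captioner("sample.jpg"))           # placeholder image path
print(classifier("I love this demo!"))   # e.g. [{'label': 'POSITIVE', 'score': ...}]
```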
+
+ ## License
+
+ MIT
app.py ADDED
@@ -0,0 +1,162 @@
+ import os
+ import gradio as gr
+ import matplotlib.pyplot as plt
+ from model_utils import (
+     analyze_sentiment,
+     answer_question,
+     generate_caption,
+     load_image_captioning_model,
+     load_sentiment_model,
+     load_vqa_model,
+ )
+
+ # Load models at startup
+ print("Loading models...")
+ image_caption_model, image_caption_processor, image_caption_tokenizer = (
+     load_image_captioning_model()
+ )
+ vqa_model, vqa_processor, vqa_tokenizer = load_vqa_model()
+ sentiment_model, sentiment_tokenizer = load_sentiment_model()
+ print("Models loaded successfully!")
+
+
+ def image_caption_fn(image):
+     # Save image temporarily
+     temp_path = "temp_image.jpg"
+     image.save(temp_path)
+
+     # Generate caption
+     caption = generate_caption(
+         temp_path, image_caption_model, image_caption_processor, image_caption_tokenizer
+     )
+
+     # Clean up
+     if os.path.exists(temp_path):
+         os.remove(temp_path)
+
+     return caption
+
+
+ def vqa_fn(image, question):
+     # Save image temporarily
+     temp_path = "temp_image.jpg"
+     image.save(temp_path)
+
+     # Answer question
+     answer = answer_question(
+         temp_path, question, vqa_model, vqa_processor, vqa_tokenizer
+     )
+
+     # Clean up
+     if os.path.exists(temp_path):
+         os.remove(temp_path)
+
+     return answer
+
+
+ def sentiment_fn(text):
+     sentiment, confidence = analyze_sentiment(
+         text, sentiment_model, sentiment_tokenizer
+     )
+     confidence_percentage = f"{confidence:.2%}"
+
+     # Create a simple bar chart for visualization
+     labels = ["Negative", "Positive"]
+     values = (
+         [1 - confidence, confidence]
+         if sentiment == "positive"
+         else [confidence, 1 - confidence]
+     )
+
+     fig, ax = plt.subplots(figsize=(6, 3))
+     bars = ax.bar(labels, values, color=["#FF6B6B", "#4ECDC4"])
+     ax.set_ylim(0, 1)
+     ax.set_title("Sentiment Analysis")
+
+     for bar in bars:
+         height = bar.get_height()
+         ax.text(
+             bar.get_x() + bar.get_width() / 2.0,
+             height + 0.02,
+             f"{height:.2f}",
+             ha="center",
+             va="bottom",
+         )
+
+     return f"Sentiment: {sentiment.upper()} (Confidence: {confidence_percentage})", fig
+
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Multi-Modal AI Demo") as demo:
+     gr.Markdown("# Multi-Modal AI Demo")
+     gr.Markdown(
+         "This application demonstrates multi-modal AI capabilities using Hugging Face models."
+     )
+
+     with gr.Tab("Image Captioning"):
+         gr.Markdown("## Image Captioning")
+         gr.Markdown("Upload an image to generate a descriptive caption.")
+
+         with gr.Row():
+             with gr.Column():
+                 image_input = gr.Image(type="pil", label="Input Image")
+                 caption_button = gr.Button("Generate Caption")
+
+             with gr.Column():
+                 caption_output = gr.Textbox(
+                     label="Generated Caption", interactive=False
+                 )
+
+         caption_button.click(
+             fn=image_caption_fn, inputs=[image_input], outputs=[caption_output]
+         )
+
+     with gr.Tab("Visual Question Answering"):
+         gr.Markdown("## Visual Question Answering")
+         gr.Markdown("Upload an image and ask a question about it.")
+
+         with gr.Row():
+             with gr.Column():
+                 vqa_image_input = gr.Image(type="pil", label="Input Image")
+                 vqa_question_input = gr.Textbox(label="Your Question")
+                 vqa_button = gr.Button("Get Answer")
+
+             with gr.Column():
+                 vqa_output = gr.Textbox(label="Answer", interactive=False)
+
+         vqa_button.click(
+             fn=vqa_fn,
+             inputs=[vqa_image_input, vqa_question_input],
+             outputs=[vqa_output],
+         )
+
+     with gr.Tab("Sentiment Analysis"):
+         gr.Markdown("## Sentiment Analysis")
+         gr.Markdown("Enter some text to analyze its sentiment.")
+
+         with gr.Row():
+             with gr.Column():
+                 sentiment_input = gr.Textbox(label="Input Text")
+                 sentiment_button = gr.Button("Analyze Sentiment")
+
+             with gr.Column():
+                 sentiment_output = gr.Textbox(label="Result", interactive=False)
+                 sentiment_plot = gr.Plot(label="Sentiment Distribution")
+
+         sentiment_button.click(
+             fn=sentiment_fn,
+             inputs=[sentiment_input],
+             outputs=[sentiment_output, sentiment_plot],
+         )
+
+     gr.Markdown("### About")
+     gr.Markdown("""
+     This demo uses the following pretrained models from Hugging Face:
+     - Image Captioning: `nlpconnect/vit-gpt2-image-captioning`
+     - Visual Question Answering: `nlpconnect/vit-gpt2-image-captioning` (simplified)
+     - Sentiment Analysis: `distilbert-base-uncased-finetuned-sst-2-english`
+     """)
+
+ # Launch the demo
+ if __name__ == "__main__":
+     demo.launch(share=True)
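For a quick smoke test of the same helpers without launching the Gradio UI, the functions imported at the top of `app.py` can be called directly. This is a minimal sketch under the assumption that a local image file (here the placeholder `sample.jpg`) exists.

```python
# Hedged sketch: calling the model_utils helpers directly, bypassing the Gradio interface.
from model_utils import (
    analyze_sentiment,
    generate_caption,
    load_image_captioning_model,
    load_sentiment_model,
)

model, processor, tokenizer = load_image_captioning_model()
print(generate_caption("sample.jpg", model, processor, tokenizer))  # placeholder image path

s_model, s_tokenizer = load_sentiment_model()
print(analyze_sentiment("This demo works great!", s_model, s_tokenizer))  # e.g. ('positive', 0.99)
```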
model_utils.py ADDED
@@ -0,0 +1,94 @@
+ import torch
+ from PIL import Image
+ from transformers import (
+     AutoModelForSequenceClassification,
+     AutoTokenizer,
+     VisionEncoderDecoderModel,
+     ViTImageProcessor,
+ )
+
+ # Device configuration
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ # Load image captioning model
+ def load_image_captioning_model():
+     model = VisionEncoderDecoderModel.from_pretrained(
+         "nlpconnect/vit-gpt2-image-captioning"
+     ).to(device)
+     feature_extractor = ViTImageProcessor.from_pretrained(
+         "nlpconnect/vit-gpt2-image-captioning"
+     )
+     tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
+     return model, feature_extractor, tokenizer
+
+
+ # Generate caption for an image
+ def generate_caption(image_path, model, feature_extractor, tokenizer):
+     max_length = 16
+     num_beams = 4
+
+     image = Image.open(image_path).convert("RGB")
+     pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(
+         device
+     )
+
+     with torch.no_grad():
+         output_ids = model.generate(
+             pixel_values, max_length=max_length, num_beams=num_beams
+         )
+
+     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+     return preds[0].strip()
+
+
+ # Load visual question answering model
+ def load_vqa_model():
+     # For simplicity, we'll use the same image captioning model
+     # In a real application, you would use a dedicated VQA model
+     model = VisionEncoderDecoderModel.from_pretrained(
+         "nlpconnect/vit-gpt2-image-captioning"
+     ).to(device)
+     feature_extractor = ViTImageProcessor.from_pretrained(
+         "nlpconnect/vit-gpt2-image-captioning"
+     )
+     tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
+     return model, feature_extractor, tokenizer
+
+
+ # Answer a question about an image
+ def answer_question(image_path, question, model, feature_extractor, tokenizer):
+     # This is a simplified version - in a real app, you'd use a proper VQA model
+     # Here we just generate a caption and append it to a template
+     caption = generate_caption(image_path, model, feature_extractor, tokenizer)
+     return f"Based on the image which shows {caption}, I would say: {caption}"
+
+
+ # Load sentiment analysis model
+ def load_sentiment_model():
+     model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+     model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     return model, tokenizer
+
+
+ # Analyze sentiment of text
+ def analyze_sentiment(text, model, tokenizer):
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
+         device
+     )
+
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     scores = torch.nn.functional.softmax(outputs.logits, dim=1)
+     scores = scores.cpu().numpy()[0]
+
+     # DistilBERT-SST2 has 2 labels: negative (0) and positive (1)
+     sentiment = "positive" if scores[1] > scores[0] else "negative"
+     confidence = float(max(scores))
+
+     return sentiment, confidence
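The comments in `load_vqa_model` and `answer_question` note that a real application would use a dedicated VQA model instead of reusing the captioning checkpoint. As a hedged illustration of that swap (not part of this commit), a ViLT checkpoint could be dropped in via the `transformers` visual-question-answering pipeline; the image path and question below are placeholders.

```python
# Hedged sketch: replacing the simplified captioning-based VQA with a dedicated VQA model.
from transformers import pipeline

vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")

result = vqa(image="sample.jpg", question="What is in the picture?")  # placeholder inputs
print(result[0]["answer"], result[0]["score"])
```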
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ torchvision
+ transformers>=4.25.0
+ pillow>=9.0.0
+ gradio==3.50.2
+ matplotlib
+ pydantic==1.10.8
+ fastapi==0.103.2