Upload 4 files
- README.md +63 -13
- app.py +162 -0
- model_utils.py +94 -0
- requirements.txt +8 -0
README.md
CHANGED
@@ -1,13 +1,63 @@
# Multi-Modal AI Demo

This project demonstrates multi-modal AI capabilities using Hugging Face pretrained models. The application provides the following features:

1. **Image Captioning**: Generate descriptive captions for images
2. **Visual Question Answering**: Answer questions about the content of images
3. **Sentiment Analysis**: Analyze the sentiment of text inputs

## Requirements

- Python 3.8+
- Dependencies listed in `requirements.txt`

## Installation

1. Clone this repository
2. Install dependencies and set up the application:
```
python run.py
```
Then select option 5 to perform the full setup (install requirements, fix dependencies, and download sample images).

## Known Issues and Solutions

If you encounter errors related to package compatibility (Pydantic, FastAPI, or Gradio), use:
```
python fix_dependencies.py
```
This will install compatible versions of all dependencies so that the application runs correctly.

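Equivalently, you can apply the pins by hand with pip; the versions below are the ones listed in this repository's `requirements.txt`:
```
pip install "gradio==3.50.2" "pydantic==1.10.8" "fastapi==0.103.2"
```
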
## Usage

Run the web interface:
```
python app.py
```

Then open your browser and navigate to the URL shown in the terminal (typically http://127.0.0.1:7860).

## Deploying to Hugging Face Spaces

To deploy this project to Hugging Face Spaces:

1. Create a new Space on [Hugging Face Spaces](https://huggingface.co/spaces)
2. Choose the "Gradio" SDK
3. Set up a GitHub repository with these files:
   - `app.py`
   - `model_utils.py`
   - `requirements.txt`
   - `README.md`
4. Push to the repository connected to your Space
5. Hugging Face will automatically deploy your application

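Note that a Gradio Space reads its configuration from a YAML block at the top of `README.md`. A minimal example is sketched below; the title comes from this project, the `sdk_version` matches the pin in `requirements.txt`, and the emoji and colors are illustrative placeholders:
```
---
title: Multi-Modal AI Demo
emoji: 🖼️
colorFrom: blue
colorTo: green
sdk: gradio
sdk_version: 3.50.2
app_file: app.py
pinned: false
---
```
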
## Models Used

This demo uses the following pretrained models from Hugging Face:
- Image Captioning: `nlpconnect/vit-gpt2-image-captioning`
- Visual Question Answering: `nlpconnect/vit-gpt2-image-captioning` (simplified)
- Sentiment Analysis: `distilbert-base-uncased-finetuned-sst-2-english`

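To sanity-check a model outside this app, the `transformers` pipeline API can load it directly. A minimal sketch for the sentiment model (the input text is arbitrary):
```
from transformers import pipeline

classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
# prints a list like [{'label': 'POSITIVE', 'score': ...}]
print(classifier("This demo is great!"))
```
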
## License

MIT
app.py
ADDED
@@ -0,0 +1,162 @@
import os
import gradio as gr
import matplotlib.pyplot as plt
from model_utils import (
    analyze_sentiment,
    answer_question,
    generate_caption,
    load_image_captioning_model,
    load_sentiment_model,
    load_vqa_model,
)

# Load models at startup
print("Loading models...")
image_caption_model, image_caption_processor, image_caption_tokenizer = (
    load_image_captioning_model()
)
vqa_model, vqa_processor, vqa_tokenizer = load_vqa_model()
sentiment_model, sentiment_tokenizer = load_sentiment_model()
print("Models loaded successfully!")


def image_caption_fn(image):
    # Save image temporarily
    temp_path = "temp_image.jpg"
    image.save(temp_path)

    # Generate caption
    caption = generate_caption(
        temp_path, image_caption_model, image_caption_processor, image_caption_tokenizer
    )

    # Clean up
    if os.path.exists(temp_path):
        os.remove(temp_path)

    return caption


def vqa_fn(image, question):
    # Save image temporarily
    temp_path = "temp_image.jpg"
    image.save(temp_path)

    # Answer question
    answer = answer_question(
        temp_path, question, vqa_model, vqa_processor, vqa_tokenizer
    )

    # Clean up
    if os.path.exists(temp_path):
        os.remove(temp_path)

    return answer


def sentiment_fn(text):
    sentiment, confidence = analyze_sentiment(
        text, sentiment_model, sentiment_tokenizer
    )
    confidence_percentage = f"{confidence:.2%}"

    # Create a simple bar chart for visualization
    labels = ["Negative", "Positive"]
    values = (
        [1 - confidence, confidence]
        if sentiment == "positive"
        else [confidence, 1 - confidence]
    )

    fig, ax = plt.subplots(figsize=(6, 3))
    bars = ax.bar(labels, values, color=["#FF6B6B", "#4ECDC4"])
    ax.set_ylim(0, 1)
    ax.set_title("Sentiment Analysis")

    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            height + 0.02,
            f"{height:.2f}",
            ha="center",
            va="bottom",
        )

    return f"Sentiment: {sentiment.upper()} (Confidence: {confidence_percentage})", fig


# Create the Gradio interface
with gr.Blocks(title="Multi-Modal AI Demo") as demo:
    gr.Markdown("# Multi-Modal AI Demo")
    gr.Markdown(
        "This application demonstrates multi-modal AI capabilities using Hugging Face models."
    )

    with gr.Tab("Image Captioning"):
        gr.Markdown("## Image Captioning")
        gr.Markdown("Upload an image to generate a descriptive caption.")

        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Input Image")
                caption_button = gr.Button("Generate Caption")

            with gr.Column():
                caption_output = gr.Textbox(
                    label="Generated Caption", interactive=False
                )

        caption_button.click(
            fn=image_caption_fn, inputs=[image_input], outputs=[caption_output]
        )

    with gr.Tab("Visual Question Answering"):
        gr.Markdown("## Visual Question Answering")
        gr.Markdown("Upload an image and ask a question about it.")

        with gr.Row():
            with gr.Column():
                vqa_image_input = gr.Image(type="pil", label="Input Image")
                vqa_question_input = gr.Textbox(label="Your Question")
                vqa_button = gr.Button("Get Answer")

            with gr.Column():
                vqa_output = gr.Textbox(label="Answer", interactive=False)

        vqa_button.click(
            fn=vqa_fn,
            inputs=[vqa_image_input, vqa_question_input],
            outputs=[vqa_output],
        )

    with gr.Tab("Sentiment Analysis"):
        gr.Markdown("## Sentiment Analysis")
        gr.Markdown("Enter some text to analyze its sentiment.")

        with gr.Row():
            with gr.Column():
                sentiment_input = gr.Textbox(label="Input Text")
                sentiment_button = gr.Button("Analyze Sentiment")

            with gr.Column():
                sentiment_output = gr.Textbox(label="Result", interactive=False)
                sentiment_plot = gr.Plot(label="Sentiment Distribution")

        sentiment_button.click(
            fn=sentiment_fn,
            inputs=[sentiment_input],
            outputs=[sentiment_output, sentiment_plot],
        )

    gr.Markdown("### About")
    gr.Markdown("""
    This demo uses the following pretrained models from Hugging Face:
    - Image Captioning: `nlpconnect/vit-gpt2-image-captioning`
    - Visual Question Answering: `nlpconnect/vit-gpt2-image-captioning` (simplified)
    - Sentiment Analysis: `distilbert-base-uncased-finetuned-sst-2-english`
    """)

# Launch the demo
if __name__ == "__main__":
    demo.launch(share=True)

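Because the handlers above are plain functions, they can be exercised without launching the Gradio UI. A minimal sketch, assuming the models load successfully and that a local test image exists at the hypothetical path `sample.jpg`:
```
from PIL import Image

import app  # importing app.py loads the models at import time

# Text-only path: no image required
result_text, _fig = app.sentiment_fn("The interface is easy to use.")
print(result_text)

# Image paths need a local test image (hypothetical filename)
image = Image.open("sample.jpg")
print(app.image_caption_fn(image))
print(app.vqa_fn(image, "What is in the picture?"))
```
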
model_utils.py
ADDED
@@ -0,0 +1,94 @@
import torch
from PIL import Image
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load image captioning model
def load_image_captioning_model():
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

    return model, feature_extractor, tokenizer


# Generate caption for an image
def generate_caption(image_path, model, feature_extractor, tokenizer):
    max_length = 16
    num_beams = 4

    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(
        device
    )

    with torch.no_grad():
        output_ids = model.generate(
            pixel_values, max_length=max_length, num_beams=num_beams
        )

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()


# Load visual question answering model
def load_vqa_model():
    # For simplicity, we'll use the same image captioning model
    # In a real application, you would use a dedicated VQA model
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

    return model, feature_extractor, tokenizer


# Answer a question about an image
def answer_question(image_path, question, model, feature_extractor, tokenizer):
    # This is a simplified version - in a real app, you'd use a proper VQA model
    # Here we just generate a caption and append it to a template
    caption = generate_caption(image_path, model, feature_extractor, tokenizer)
    return f"Based on the image which shows {caption}, I would say: {caption}"


# Load sentiment analysis model
def load_sentiment_model():
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return model, tokenizer


# Analyze sentiment of text
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
        device
    )

    with torch.no_grad():
        outputs = model(**inputs)

    scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    scores = scores.cpu().numpy()[0]

    # DistilBERT-SST2 has 2 labels: negative (0) and positive (1)
    sentiment = "positive" if scores[1] > scores[0] else "negative"
    confidence = float(max(scores))

    return sentiment, confidence

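The helpers in `model_utils.py` can also be used on their own. A minimal sketch that captions a single local image (the filename `sample.jpg` is a hypothetical placeholder):
```
from model_utils import generate_caption, load_image_captioning_model

model, feature_extractor, tokenizer = load_image_captioning_model()
print(generate_caption("sample.jpg", model, feature_extractor, tokenizer))
```
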
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
torchvision
transformers>=4.25.0
pillow>=9.0.0
gradio==3.50.2
matplotlib
pydantic==1.10.8
fastapi==0.103.2