Amarthya7 commited on
Commit
0757241
·
verified ·
1 Parent(s): 0ca949f

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +94 -13
  2. __init__.py +9 -0
  3. app.py +189 -0
  4. requirements.txt +10 -0
  5. run.py +90 -0
README.md CHANGED
@@ -1,13 +1,94 @@
1
- ---
2
- title: Image Question Answering System
3
- emoji: 🏃
4
- colorFrom: indigo
5
- colorTo: yellow
6
- sdk: streamlit
7
- sdk_version: 1.43.1
8
- app_file: app.py
9
- pinned: false
10
- short_description: multi-modal AI application
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Visual Question Answering (VQA) System
3
+ emoji: 🏞️
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.20.1
8
+ app_file: run.py
9
+ pinned: false
10
+ ---
11
+ # Visual Question Answering (VQA) System
12
+
13
+ A multi-modal AI application that allows users to upload images and ask questions about them. This project uses pre-trained models from Hugging Face to analyze images and answer natural language questions.
14
+
15
+ ## Features
16
+
17
+ - Upload images in common formats (jpg, png, etc.)
18
+ - Ask questions about image content in natural language
19
+ - Get AI-generated answers based on image content
20
+ - User-friendly Streamlit interface
21
+ - Support for various types of questions (objects, attributes, counting, etc.)
22
+
23
+ ## Technical Stack
24
+
25
+ - **Python**: Main programming language
26
+ - **PyTorch & Transformers**: Deep learning frameworks for running the models
27
+ - **Streamlit**: Interactive web application framework
28
+ - **HuggingFace Models**: Pre-trained visual question answering models
29
+ - **PIL**: Image processing
30
+
31
+ ## Setup Instructions
32
+
33
+ 1. Clone this repository:
34
+ ```
35
+ git clone https://github.com/your-username/visual-question-answering.git
36
+ cd visual-question-answering
37
+ ```
38
+
39
+ 2. Create a virtual environment (recommended):
40
+ ```
41
+ python -m venv venv
42
+ # On Windows
43
+ venv\Scripts\activate
44
+ # On macOS/Linux
45
+ source venv/bin/activate
46
+ ```
47
+
48
+ 3. Install dependencies:
49
+ ```
50
+ pip install -r requirements.txt
51
+ ```
52
+
53
+ 4. Run the application:
54
+ ```
55
+ python run.py
56
+ ```
57
+
58
+ Or directly with Streamlit:
59
+ ```
60
+ streamlit run app.py
61
+ ```
62
+
63
+ 5. Open a web browser and go to `http://localhost:8501`
64
+
65
+ ## Usage
66
+
67
+ 1. Upload an image using the file upload area
68
+ 2. Type your question about the image in the text field
69
+ 3. Select a model from the sidebar (BLIP or ViLT)
70
+ 4. Click "Get Answer" to get an AI-generated response
71
+ 5. View the answer displayed on the right side of the screen
72
+
73
+ ## Models Used
74
+
75
+ This application uses the following pre-trained models from Hugging Face:
76
+ - **BLIP**: For general visual question answering with free-form answers
77
+ - **ViLT**: For detailed understanding of image content and yes/no questions
78
+
79
+ ## Project Structure
80
+
81
+ - `app.py`: Main Streamlit application
82
+ - `models/`: Contains model handling code
83
+ - `utils/`: Utility functions for image processing and more
84
+ - `static/`: Static files including uploaded images
85
+ - `run.py`: Script to run the application
86
+
87
+ ## License
88
+
89
+ This project is licensed under the MIT License - see the LICENSE file for details.
90
+
91
+ ## Acknowledgments
92
+
93
+ - Hugging Face for their excellent pre-trained models
94
+ - The open-source community for various libraries used in this project
__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Visual Question Answering - Multi-Modal AI Application
3
+
4
+ A Python application for answering questions about images using
5
+ pre-trained Hugging Face models for multi-modal understanding.
6
+ """
7
+
8
+ __version__ = "0.1.0"
9
+ __author__ = "Multi-Modal AI Project"
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Visual Question Answering Streamlit Application
3
+ """
4
+
5
+ import logging
6
+ import os
7
+ import sys
8
+ import time
9
+ from datetime import datetime
10
+
11
+ import streamlit as st
12
+ from PIL import Image
13
+
14
+ # Configure path to include parent directory
15
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
16
+
17
+ # Configure logging
18
+ log_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logs")
19
+ os.makedirs(log_dir, exist_ok=True)
20
+ log_file = os.path.join(
21
+ log_dir, f"vqa_app_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
22
+ )
23
+
24
+ logging.basicConfig(
25
+ level=logging.INFO,
26
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
27
+ handlers=[logging.FileHandler(log_file), logging.StreamHandler()],
28
+ )
29
+ logger = logging.getLogger("vqa_app")
30
+
31
+ # Import modules
32
+ from models import VQAInference
33
+ from utils.image_utils import resize_image
34
+
35
+ # Global variables
36
+ MODEL_OPTIONS = {"BLIP": "blip", "ViLT": "vilt"}
37
+
38
+ # Setup directories
39
+ uploads_dir = os.path.join(
40
+ os.path.dirname(os.path.abspath(__file__)), "static", "uploads"
41
+ )
42
+ os.makedirs(uploads_dir, exist_ok=True)
43
+
44
+ # Configure page
45
+ st.set_page_config(
46
+ page_title="Visual Question Answering",
47
+ page_icon="🔍",
48
+ layout="wide",
49
+ initial_sidebar_state="expanded",
50
+ )
51
+
52
+
53
+ @st.cache_resource
54
+ def load_model(model_name):
55
+ """Load the VQA model with caching for better performance"""
56
+ try:
57
+ logger.info(f"Loading model: {model_name}")
58
+ return VQAInference(model_name=model_name)
59
+ except Exception as e:
60
+ logger.error(f"Error loading model: {str(e)}")
61
+ st.error(f"Failed to load model: {str(e)}")
62
+ return None
63
+
64
+
65
+ def process_image_and_question(image_file, question, model_name):
66
+ """Process the uploaded image and question to generate an answer"""
67
+ start_time = time.time()
68
+
69
+ try:
70
+ # Load image
71
+ image = Image.open(image_file).convert("RGB")
72
+ logger.info(f"Image loaded, size: {image.size}")
73
+
74
+ # Resize image
75
+ image = resize_image(image)
76
+ logger.info(f"Image resized to: {image.size}")
77
+
78
+ # Load model
79
+ model = load_model(model_name)
80
+ if model is None:
81
+ return None
82
+
83
+ # Generate answer
84
+ logger.info(f"Generating answer for question: '{question}'")
85
+ answer = model.predict(image, question)
86
+ logger.info(f"Answer generated: '{answer}'")
87
+
88
+ # Calculate processing time
89
+ processing_time = time.time() - start_time
90
+
91
+ return {"answer": answer, "processing_time": f"{processing_time:.2f} seconds"}
92
+ except Exception as e:
93
+ logger.error(f"Error processing request: {str(e)}", exc_info=True)
94
+ return None
95
+
96
+
97
+ def main():
98
+ """Main function for Streamlit app"""
99
+ # Header
100
+ st.title("Visual Question Answering")
101
+ st.markdown("Upload an image, ask a question, and get AI-powered answers")
102
+
103
+ # Sidebar for model selection
104
+ st.sidebar.title("Model Options")
105
+ selected_model_name = st.sidebar.radio(
106
+ "Choose a model:", options=list(MODEL_OPTIONS.keys()), index=0
107
+ )
108
+ model_name = MODEL_OPTIONS[selected_model_name]
109
+
110
+ st.sidebar.markdown("---")
111
+ st.sidebar.markdown("## About the Models")
112
+ st.sidebar.markdown("**BLIP**: General purpose VQA with free-form answers")
113
+ st.sidebar.markdown("**ViLT**: Better for yes/no questions and specific categories")
114
+
115
+ # Main content - two columns
116
+ col1, col2 = st.columns([1, 1])
117
+
118
+ with col1:
119
+ st.markdown("### Upload & Ask")
120
+ uploaded_file = st.file_uploader(
121
+ "Upload an image:", type=["jpg", "jpeg", "png", "bmp", "gif"]
122
+ )
123
+
124
+ question = st.text_input(
125
+ "Your question about the image:", placeholder="E.g., What is in this image?"
126
+ )
127
+
128
+ submit_button = st.button(
129
+ "Get Answer", type="primary", use_container_width=True
130
+ )
131
+
132
+ # Preview uploaded image
133
+ if uploaded_file is not None:
134
+ st.markdown("### Image Preview")
135
+ st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
136
+
137
+ with col2:
138
+ st.markdown("### AI Answer")
139
+
140
+ # Process when submit button is clicked
141
+ if submit_button and uploaded_file is not None and question:
142
+ with st.spinner("Generating answer..."):
143
+ result = process_image_and_question(uploaded_file, question, model_name)
144
+
145
+ if result:
146
+ st.success("Answer generated successfully!")
147
+
148
+ # Display results
149
+ st.markdown("#### Question:")
150
+ st.write(question)
151
+
152
+ st.markdown("#### Answer:")
153
+ st.markdown(
154
+ f"<div style='background-color: #f0f2f6; padding: 20px; border-radius: 5px;'>{result['answer']}</div>",
155
+ unsafe_allow_html=True,
156
+ )
157
+
158
+ st.markdown("#### Processing Time:")
159
+ st.text(result["processing_time"])
160
+ else:
161
+ st.error(
162
+ "Failed to generate an answer. Please check the image and question, and try again."
163
+ )
164
+
165
+ elif not uploaded_file and submit_button:
166
+ st.warning("Please upload an image first.")
167
+ elif not question and submit_button:
168
+ st.warning("Please enter a question about the image.")
169
+ else:
170
+ st.info("AI answers will appear here after you submit your question")
171
+
172
+ # Information about the application
173
+ st.markdown("---")
174
+ st.markdown("### About Visual Question Answering")
175
+ st.markdown("""
176
+ This application uses multi-modal AI, combining computer vision and natural language processing
177
+ to answer questions about images. Here are some examples of questions you can ask:
178
+
179
+ - **Objects**: "What objects are in this image?"
180
+ - **Counting**: "How many people are in this image?"
181
+ - **Colors**: "What color is the car?"
182
+ - **Actions**: "What is the person doing?"
183
+ - **Spatial relations**: "What is to the left of the chair?"
184
+ - **Attributes**: "Is the cat sleeping?"
185
+ """)
186
+
187
+
188
+ if __name__ == "__main__":
189
+ main()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ torchvision>=0.15.0
3
+ transformers>=4.30.0
4
+ Pillow>=9.0.0
5
+ timm>=0.9.0
6
+ numpy>=1.24.0
7
+ tqdm>=4.65.0
8
+ streamlit>=1.34.0
9
+ watchdog>=3.0.0
10
+ python-dotenv>=1.0.0
run.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Visual Question Answering Application - Run Script for Streamlit
3
+ """
4
+
5
+ import os
6
+ import subprocess
7
+ import sys
8
+
9
+ # Configure minimal environment settings
10
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Suppress TensorFlow logging
11
+
12
+
13
def check_requirements_installed():
    """Return True when every core dependency is importable, else False.

    On failure, prints which package is missing and how to install the
    requirements; never raises.
    """
    try:
        # Imported only to verify availability; names are intentionally unused.
        import streamlit  # noqa: F401
        import torch  # noqa: F401
        import transformers  # noqa: F401
        from PIL import Image  # noqa: F401
    except ImportError as e:
        print(f"Error: Required package not installed - {e}")
        print("Please install requirements using: pip install -r requirements.txt")
        return False
    return True
26
+
27
+
28
def ensure_directories():
    """Create the static/uploads and logs directories beside this script.

    Idempotent: existing directories are left untouched.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))

    # Uploaded images are stored here.
    uploads_dir = os.path.join(base_dir, "static", "uploads")
    os.makedirs(uploads_dir, exist_ok=True)
    print(f"Uploads directory: {uploads_dir}")

    # Application log files are written here.
    os.makedirs(os.path.join(base_dir, "logs"), exist_ok=True)
41
+
42
+
43
def main():
    """Validate the environment and launch the Streamlit VQA application.

    Exits with status 1 when required packages are missing or app.py cannot
    be found; otherwise blocks until the Streamlit server stops.
    """
    print("Visual Question Answering - Multi-Modal AI Application with Streamlit")

    # Fail fast with a helpful message rather than a traceback later.
    if not check_requirements_installed():
        sys.exit(1)

    # Ensure upload/log directories exist before the app starts writing.
    ensure_directories()

    # Default the model selection; app.py reads VQA_MODEL at startup.
    os.environ["VQA_MODEL"] = os.environ.get(
        "VQA_MODEL", "blip"
    )  # Default to 'blip' model

    # Locate the Streamlit entry point next to this script.
    app_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "app.py")
    if not os.path.exists(app_path):
        print(f"Error: Streamlit app not found at {app_path}")
        sys.exit(1)

    # Print startup information
    port = int(os.environ.get("PORT", 8501))  # Streamlit default port is 8501
    print(f"Starting VQA application on http://localhost:{port}")
    print(f"Using VQA model: {os.environ.get('VQA_MODEL', 'blip')}")
    print("Press Ctrl+C to exit")

    # Launch via the current interpreter ("python -m streamlit") so the app
    # runs in this exact environment even when the `streamlit` console
    # script is not on PATH (a common failure on Windows / bare venvs).
    cmd = [
        sys.executable,
        "-m",
        "streamlit",
        "run",
        app_path,
        "--server.port",
        str(port),
        "--server.address",
        "0.0.0.0",
    ]
    try:
        subprocess.run(cmd)
    except KeyboardInterrupt:
        print("\nShutting down the application...")
    except Exception as e:
        print(f"Error launching Streamlit: {e}")


if __name__ == "__main__":
    main()