Update app.py
app.py CHANGED
@@ -12,7 +12,7 @@ def upgrade_packages():
     except Exception as e:
         print("Error upgrading packages:", e)

-# Uncomment the next line
+# Uncomment the next line if you want to force an upgrade locally (not recommended on Spaces)
 # upgrade_packages()

 import torch
@@ -40,7 +40,7 @@ text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
-#
+# This layer projects CLIP's 512-dimensional image embeddings to GPT-2's 768-dimensional space.
 image_projection = torch.nn.Linear(512, 768)

 # ------------------------------
@@ -50,11 +50,11 @@ image_projection = torch.nn.Linear(512, 768)
 def multimodal_inference(text_input, image_input, audio_input):
     """
     Processes text, image, and audio inputs:
-      - Text
-      -
-      - Audio
+      - Text: used directly.
+      - Image: processed via CLIP and projected (here, we append a placeholder tag).
+      - Audio: transcribed using Whisper.

-    The final prompt is
+    The final prompt is fed to the text model (GPT-2) to generate a response.
     """
     prompt = ""

@@ -71,7 +71,7 @@ def multimodal_inference(text_input, image_input, audio_input):
         # Normalize and project image features
         image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
         projected_image = image_projection(image_features)
-        #
+        # For demo purposes, we append a placeholder tag.
         prompt += " [IMAGE_EMBEDDING]"
     except Exception as e:
         print("Error processing image:", e)
@@ -113,7 +113,7 @@ iface = gr.Interface(
     inputs=[
         gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text Input"),
         gr.Image(type="pil", label="Image Input (Optional)"),
-        gr.Audio(
+        gr.Audio(type="filepath", label="Audio Input (Optional)")
     ],
     outputs="text",
     title="Multi-Modal LLM Demo",
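
Note on the projection comment added at line 43: the diff only shows image_projection = torch.nn.Linear(512, 768) and the normalization at line 72, not how the CLIP features are produced. A minimal sketch of how that image path might fit together, assuming the Space loads CLIP through transformers (the checkpoint name, processor, and project_image helper below are illustrative, not part of the diff):

# Sketch of the image branch implied by the diff: encode an image with CLIP,
# L2-normalize the 512-d features, and project them into GPT-2's 768-d space.
# The CLIP checkpoint and processor are assumptions; only the Linear(512, 768)
# layer and the normalization line appear in the diff.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")       # 512-d image embeddings
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image_projection = torch.nn.Linear(512, 768)                                 # as in the diff

def project_image(image: Image.Image) -> torch.Tensor:
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)             # shape (1, 512)
    # Same normalization as the diff's line 72
    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
    return image_projection(image_features)                                  # shape (1, 768)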
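
The updated docstring says audio is transcribed using Whisper, and the Gradio input now passes the audio as a file path; the transcription call itself is outside the shown hunks. A sketch using the transformers speech-recognition pipeline (one common way to run Whisper, chosen here as an assumption):

# Hypothetical sketch of the audio branch: transcribe the uploaded file with Whisper
# and append the transcript to the prompt. The pipeline/model choice is an assumption;
# the diff only establishes gr.Audio(type="filepath", ...) and "transcribed using Whisper".
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

def transcribe_audio(audio_path: str) -> str:
    try:
        result = asr(audio_path)          # expects a file path; decoding requires ffmpeg
        return result["text"].strip()
    except Exception as e:
        print("Error processing audio:", e)
        return ""

# Inside multimodal_inference, the transcript would then extend the prompt, e.g.:
# prompt += " " + transcribe_audio(audio_input)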
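
The docstring also states that the final prompt is fed to the text model (GPT-2) to generate a response. The generation step is not in the shown hunks; assuming the text_model from the hunk header at line 40 plus a matching GPT-2 tokenizer, it might look like this:

# Sketch of the final generation step. Only AutoModelForCausalLM.from_pretrained("gpt2")
# appears in the diff; the tokenizer and generate_response helper are illustrative.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
text_model = AutoModelForCausalLM.from_pretrained("gpt2")    # as in the hunk header at line 40

def generate_response(prompt: str, max_new_tokens: int = 100) -> str:
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = text_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,    # GPT-2 has no pad token by default
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)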
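
Finally, the last hunk completes the gr.Audio component in the gr.Interface definition. For context, a self-contained version of that interface wiring might look as follows; fn=multimodal_inference, the placeholder body, and the launch call are assumptions, while the three input components, outputs="text", and the title match the diff:

# Sketch of the Gradio interface around the inputs shown in the last hunk.
import gradio as gr

def multimodal_inference(text_input, image_input, audio_input):
    # Placeholder body; the real function builds a prompt from the three inputs
    # and generates a response with GPT-2 (see the earlier hunks).
    return f"text={bool(text_input)}, image={image_input is not None}, audio={audio_input is not None}"

iface = gr.Interface(
    fn=multimodal_inference,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text Input"),
        gr.Image(type="pil", label="Image Input (Optional)"),
        gr.Audio(type="filepath", label="Audio Input (Optional)"),
    ],
    outputs="text",
    title="Multi-Modal LLM Demo",
)

if __name__ == "__main__":
    iface.launch()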