Update app.py
app.py CHANGED
@@ -12,7 +12,7 @@ def upgrade_packages():
     except Exception as e:
         print("Error upgrading packages:", e)

-# Uncomment the next line
+# Uncomment the next line if you want to force an upgrade locally (not recommended on Spaces)
 # upgrade_packages()

 import torch
@@ -40,7 +40,7 @@ text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
-#
+# This layer projects CLIP's 512-dimensional image embeddings to GPT-2's 768-dimensional space.
 image_projection = torch.nn.Linear(512, 768)

 # ------------------------------
@@ -50,11 +50,11 @@ image_projection = torch.nn.Linear(512, 768)
 def multimodal_inference(text_input, image_input, audio_input):
     """
     Processes text, image, and audio inputs:
-      - Text
-      -
-      - Audio
+      - Text: used directly.
+      - Image: processed via CLIP and projected (here, we append a placeholder tag).
+      - Audio: transcribed using Whisper.

-    The final prompt is
+    The final prompt is fed to the text model (GPT-2) to generate a response.
     """
     prompt = ""

@@ -71,7 +71,7 @@ def multimodal_inference(text_input, image_input, audio_input):
         # Normalize and project image features
         image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
         projected_image = image_projection(image_features)
-        #
+        # For demo purposes, we append a placeholder tag.
         prompt += " [IMAGE_EMBEDDING]"
     except Exception as e:
         print("Error processing image:", e)
@@ -113,7 +113,7 @@ iface = gr.Interface(
     inputs=[
         gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text Input"),
         gr.Image(type="pil", label="Image Input (Optional)"),
-        gr.Audio(
+        gr.Audio(type="filepath", label="Audio Input (Optional)")
     ],
     outputs="text",
     title="Multi-Modal LLM Demo",
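
Note on the projection comment added at line 43: the diff only shows image_projection = torch.nn.Linear(512, 768) and the normalization at line 72, not how the CLIP features are produced. A minimal sketch of how that image path might fit together, assuming the Space loads CLIP through transformers (the checkpoint name, processor, and project_image helper below are illustrative, not part of the diff):

# Sketch of the image branch implied by the diff: encode an image with CLIP,
# L2-normalize the 512-d features, and project them into GPT-2's 768-d space.
# The CLIP checkpoint and processor are assumptions; only the Linear(512, 768)
# layer and the normalization line appear in the diff.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")       # 512-d image embeddings
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image_projection = torch.nn.Linear(512, 768)                                 # as in the diff

def project_image(image: Image.Image) -> torch.Tensor:
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)             # shape (1, 512)
    # Same normalization as the diff's line 72
    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
    return image_projection(image_features)                                  # shape (1, 768)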
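
The updated docstring says audio is transcribed using Whisper, and the Gradio input now passes the audio as a file path; the transcription call itself is outside the shown hunks. A sketch using the transformers speech-recognition pipeline (one common way to run Whisper, chosen here as an assumption):

# Hypothetical sketch of the audio branch: transcribe the uploaded file with Whisper
# and append the transcript to the prompt. The pipeline/model choice is an assumption;
# the diff only establishes gr.Audio(type="filepath", ...) and "transcribed using Whisper".
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

def transcribe_audio(audio_path: str) -> str:
    try:
        result = asr(audio_path)          # expects a file path; decoding requires ffmpeg
        return result["text"].strip()
    except Exception as e:
        print("Error processing audio:", e)
        return ""

# Inside multimodal_inference, the transcript would then extend the prompt, e.g.:
# prompt += " " + transcribe_audio(audio_input)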
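
The docstring also states that the final prompt is fed to the text model (GPT-2) to generate a response. The generation step is not in the shown hunks; assuming the text_model from the hunk header at line 40 plus a matching GPT-2 tokenizer, it might look like this:

# Sketch of the final generation step. Only AutoModelForCausalLM.from_pretrained("gpt2")
# appears in the diff; the tokenizer and generate_response helper are illustrative.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
text_model = AutoModelForCausalLM.from_pretrained("gpt2")    # as in the hunk header at line 40

def generate_response(prompt: str, max_new_tokens: int = 100) -> str:
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = text_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,    # GPT-2 has no pad token by default
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)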
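
Finally, the last hunk completes the gr.Audio component in the gr.Interface definition. For context, a self-contained version of that interface wiring might look as follows; fn=multimodal_inference, the placeholder body, and the launch call are assumptions, while the three input components, outputs="text", and the title match the diff:

# Sketch of the Gradio interface around the inputs shown in the last hunk.
import gradio as gr

def multimodal_inference(text_input, image_input, audio_input):
    # Placeholder body; the real function builds a prompt from the three inputs
    # and generates a response with GPT-2 (see the earlier hunks).
    return f"text={bool(text_input)}, image={image_input is not None}, audio={audio_input is not None}"

iface = gr.Interface(
    fn=multimodal_inference,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text Input"),
        gr.Image(type="pil", label="Image Input (Optional)"),
        gr.Audio(type="filepath", label="Audio Input (Optional)"),
    ],
    outputs="text",
    title="Multi-Modal LLM Demo",
)

if __name__ == "__main__":
    iface.launch()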