Spaces:

atharvasc27112001
/

Capstone_Project

Sleeping

App Files Community

atharvasc27112001 committed on Apr 6

Commit

d6de1a9

verified ·

1 Parent(s): f94e228

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -13

app.py CHANGED Viewed

@@ -1,3 +1,21 @@
 import torch
 from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
@@ -22,8 +40,6 @@ text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # ------------------------------
 # Define Projection Layers
 # ------------------------------
-# Here we create a simple linear layer to project CLIP's image embeddings (512 dims)
-# to GPT-2's embedding dimension (768 dims). In a full project, this layer would be fine-tuned.
 print("Initializing image projection layer...")
 image_projection = torch.nn.Linear(512, 768)
@@ -33,13 +49,11 @@ image_projection = torch.nn.Linear(512, 768)
 def multimodal_inference(text_input, image_input, audio_input):
     """
-    Processes three modalities:
-      - Text: used directly.
-      - Image: processed via CLIP and projected.
-      - Audio: transcribed using Whisper.
-    The function fuses the outputs by concatenating their textual representations,
-    and then feeds the final prompt to the text model for generation.
     """
     prompt = ""
@@ -57,8 +71,7 @@ def multimodal_inference(text_input, image_input, audio_input):
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
             # Project image embedding into GPT-2's embedding space
             projected_image = image_projection(image_features)
-            # For demo purposes, we simply append a placeholder tag.
-            # In a full system, you would integrate these embeddings into your model.
             prompt += " [IMAGE_EMBEDDING]"
         except Exception as e:
             print("Error processing image:", e)
@@ -67,7 +80,6 @@ def multimodal_inference(text_input, image_input, audio_input):
     # Process audio input if provided
     if audio_input is not None:
         try:
-            # Gradio provides a filepath for the audio file.
             audio, sr = sf.read(audio_input)
         except Exception as e:
             print("Error reading audio file:", e)
@@ -82,7 +94,6 @@ def multimodal_inference(text_input, image_input, audio_input):
             print("Error during audio transcription:", e)
             prompt += " [AUDIO_ERROR]"
-    # Debug: Print the final prompt for verification
     print("Final fused prompt:", prompt)
     # Generate text response using the text model

+import sys
+import subprocess
+def upgrade_packages():  # Best-effort pip upgrade of transformers and accelerate; takes no arguments and returns None.
+    try:
+        print("Upgrading transformers and accelerate...")
+        subprocess.check_call([  # check_call raises CalledProcessError if pip exits with a non-zero status
+            sys.executable, "-m", "pip", "install", "--upgrade",  # runs pip as a module of the interpreter executing this script
+            "transformers>=4.31.0", "accelerate>=0.20.0"  # passed as argv list items (no shell), so ">=" is not a redirect
+        ])
+        print("Upgrade complete.")
+    except Exception as e:  # any Exception (including a failed pip run) is printed and swallowed so startup continues
+        print("Error upgrading packages:", e)
+# Attempt to upgrade the packages
+upgrade_packages()
+# Now import the libraries
 import torch
 from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
 # ------------------------------
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
 image_projection = torch.nn.Linear(512, 768)
 def multimodal_inference(text_input, image_input, audio_input):
     """
+    Processes text, image, and audio inputs.
+      - Text is added directly.
+      - The image is processed via CLIP, its embedding is projected, and a placeholder tag is appended.
+      - Audio is transcribed using Whisper and appended.
+    The final prompt is sent to the text model for generation.
     """
     prompt = ""
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
             # Project image embedding into GPT-2's embedding space
             projected_image = image_projection(image_features)
+            # For demo purposes, we append a placeholder tag.
             prompt += " [IMAGE_EMBEDDING]"
         except Exception as e:
             print("Error processing image:", e)
     # Process audio input if provided
     if audio_input is not None:
         try:
             audio, sr = sf.read(audio_input)
         except Exception as e:
             print("Error reading audio file:", e)
             print("Error during audio transcription:", e)
             prompt += " [AUDIO_ERROR]"
     print("Final fused prompt:", prompt)
     # Generate text response using the text model