Spaces:

atharvasc27112001
/

Capstone_Project

Sleeping

App Files Files Community

atharvasc27112001 committed on Apr 6

Commit

f52b61a

verified ·

1 Parent(s): d6de1a9

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -12

app.py CHANGED Viewed

@@ -12,10 +12,9 @@ def upgrade_packages():
     except Exception as e:
         print("Error upgrading packages:", e)
-# Attempt to upgrade the packages
-upgrade_packages()
-# Now import the libraries
 import torch
 from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
@@ -41,6 +40,7 @@ text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
 image_projection = torch.nn.Linear(512, 768)
 # ------------------------------
@@ -49,10 +49,11 @@ image_projection = torch.nn.Linear(512, 768)
 def multimodal_inference(text_input, image_input, audio_input):
     """
-    Processes text, image, and audio inputs.
       - Text is added directly.
-      - The image is processed via CLIP, its embedding is projected, and a placeholder tag is appended.
-      - Audio is transcribed using Whisper and appended.
     The final prompt is sent to the text model for generation.
     """
     prompt = ""
@@ -67,11 +68,10 @@ def multimodal_inference(text_input, image_input, audio_input):
             clip_inputs = clip_processor(images=image_input, return_tensors="pt")
             with torch.no_grad():
                 image_features = clip_model.get_image_features(**clip_inputs)
-            # Normalize image features
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
-            # Project image embedding into GPT-2's embedding space
             projected_image = image_projection(image_features)
-            # For demo purposes, we append a placeholder tag.
             prompt += " [IMAGE_EMBEDDING]"
         except Exception as e:
             print("Error processing image:", e)
@@ -111,9 +111,9 @@ def multimodal_inference(text_input, image_input, audio_input):
 iface = gr.Interface(
     fn=multimodal_inference,
     inputs=[
-        gr.inputs.Textbox(lines=5, placeholder="Enter your text here...", label="Text Input"),
-        gr.inputs.Image(type="pil", label="Image Input (Optional)"),
-        gr.inputs.Audio(source="upload", type="filepath", label="Audio Input (Optional)")
     ],
     outputs="text",
     title="Multi-Modal LLM Demo",

     except Exception as e:
         print("Error upgrading packages:", e)
+# Uncomment the next line for local debugging.
+# upgrade_packages()
 import torch
 from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
+# Project CLIP's 512-dim image embeddings to GPT-2's 768-dim embeddings.
 image_projection = torch.nn.Linear(512, 768)
 # ------------------------------
 def multimodal_inference(text_input, image_input, audio_input):
     """
+    Processes text, image, and audio inputs:
       - Text is added directly.
+      - The image is processed via CLIP and projected (here, we append a placeholder).
+      - Audio is transcribed via Whisper.
     The final prompt is sent to the text model for generation.
     """
     prompt = ""
             clip_inputs = clip_processor(images=image_input, return_tensors="pt")
             with torch.no_grad():
                 image_features = clip_model.get_image_features(**clip_inputs)
+            # Normalize and project image features
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
             projected_image = image_projection(image_features)
+            # Append a placeholder tag (in a full system, you would fuse these embeddings)
             prompt += " [IMAGE_EMBEDDING]"
         except Exception as e:
             print("Error processing image:", e)
 iface = gr.Interface(
     fn=multimodal_inference,
     inputs=[
+        gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text Input"),
+        gr.Image(type="pil", label="Image Input (Optional)"),
+        gr.Audio(source="upload", type="filepath", label="Audio Input (Optional)")
     ],
     outputs="text",
     title="Multi-Modal LLM Demo",