atharvasc27112001 committed on
Commit d6de1a9 · verified · 1 Parent(s): f94e228

Update app.py

Files changed (1)
  1. app.py +24 -13
app.py CHANGED
@@ -1,3 +1,21 @@
+import sys
+import subprocess
+
+def upgrade_packages():
+    try:
+        print("Upgrading transformers and accelerate...")
+        subprocess.check_call([
+            sys.executable, "-m", "pip", "install", "--upgrade",
+            "transformers>=4.31.0", "accelerate>=0.20.0"
+        ])
+        print("Upgrade complete.")
+    except Exception as e:
+        print("Error upgrading packages:", e)
+
+# Attempt to upgrade the packages
+upgrade_packages()
+
+# Now import the libraries
 import torch
 from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
@@ -22,8 +40,6 @@ text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # ------------------------------
 # Define Projection Layers
 # ------------------------------
-# Here we create a simple linear layer to project CLIP's image embeddings (512 dims)
-# to GPT-2's embedding dimension (768 dims). In a full project, this layer would be fine-tuned.
 print("Initializing image projection layer...")
 image_projection = torch.nn.Linear(512, 768)
 
@@ -33,13 +49,11 @@ image_projection = torch.nn.Linear(512, 768)
 
 def multimodal_inference(text_input, image_input, audio_input):
     """
-    Processes three modalities:
-    - Text: used directly.
-    - Image: processed via CLIP and projected.
-    - Audio: transcribed using Whisper.
-
-    The function fuses the outputs by concatenating their textual representations,
-    and then feeds the final prompt to the text model for generation.
+    Processes text, image, and audio inputs.
+    - Text is added directly.
+    - The image is processed via CLIP, its embedding is projected, and a placeholder tag is appended.
+    - Audio is transcribed using Whisper and appended.
+    The final prompt is sent to the text model for generation.
     """
     prompt = ""
 
@@ -57,8 +71,7 @@ def multimodal_inference(text_input, image_input, audio_input):
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
             # Project image embedding into GPT-2's embedding space
            projected_image = image_projection(image_features)
-            # For demo purposes, we simply append a placeholder tag.
-            # In a full system, you would integrate these embeddings into your model.
+            # For demo purposes, we append a placeholder tag.
             prompt += " [IMAGE_EMBEDDING]"
         except Exception as e:
             print("Error processing image:", e)
@@ -67,7 +80,6 @@ def multimodal_inference(text_input, image_input, audio_input):
     # Process audio input if provided
     if audio_input is not None:
         try:
-            # Gradio provides a filepath for the audio file.
            audio, sr = sf.read(audio_input)
         except Exception as e:
             print("Error reading audio file:", e)
@@ -82,7 +94,6 @@ def multimodal_inference(text_input, image_input, audio_input):
         print("Error during audio transcription:", e)
         prompt += " [AUDIO_ERROR]"
 
-    # Debug: Print the final prompt for verification
     print("Final fused prompt:", prompt)
 
     # Generate text response using the text model
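
The comment removed in the image hunk noted that a full system would integrate the projected embedding into the model rather than append the "[IMAGE_EMBEDDING]" placeholder. Below is a minimal sketch of one way to do that. The helper name generate_with_image is hypothetical and not part of this commit; it takes the tokenizer and model as arguments and relies on the inputs_embeds argument of generate(), which recent transformers releases (including the >=4.31.0 this commit upgrades to) support for decoder-only models such as GPT-2.

import torch

def generate_with_image(prompt, projected_image, tokenizer, text_model, max_new_tokens=50):
    # Hypothetical helper, not part of the commit: prepend the projected CLIP
    # embedding (1, 768) to GPT-2's token embeddings as a single pseudo-token.
    token_ids = tokenizer(prompt, return_tensors="pt").input_ids
    token_embeds = text_model.get_input_embeddings()(token_ids)   # (1, seq_len, 768)
    image_embed = projected_image.unsqueeze(1)                     # (1, 1, 768)
    fused = torch.cat([image_embed, token_embeds], dim=1)          # (1, seq_len + 1, 768)
    attention_mask = torch.ones(fused.shape[:2], dtype=torch.long)
    output_ids = text_model.generate(
        inputs_embeds=fused,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

As the removed comment also said, the untrained torch.nn.Linear(512, 768) projection would produce an essentially random pseudo-token, so this only becomes meaningful once the projection layer is fine-tuned.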
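For context, the commit does not touch the Gradio wiring further down in app.py. A sketch of an Interface setup consistent with multimodal_inference's signature is shown below; the exact components and labels are assumptions, except that the audio component should pass a filepath, since the function reads it with sf.read().

import gradio as gr

# Assumed wiring, for illustration only: three inputs matching
# multimodal_inference(text_input, image_input, audio_input).
demo = gr.Interface(
    fn=multimodal_inference,
    inputs=[
        gr.Textbox(label="Text prompt"),
        gr.Image(type="pil", label="Image"),
        gr.Audio(type="filepath", label="Audio"),  # sf.read() expects a filepath
    ],
    outputs=gr.Textbox(label="Generated response"),
)

demo.launch()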