atharvasc27112001 committed
Commit 87ac0d2 · verified · 1 Parent(s): f52b61a

Update app.py

Files changed (1): app.py (+8, -8)
app.py CHANGED
@@ -12,7 +12,7 @@ def upgrade_packages():
     except Exception as e:
         print("Error upgrading packages:", e)
 
-# Uncomment the next line for local debugging.
+# Uncomment the next line if you want to force an upgrade locally (not recommended on Spaces)
 # upgrade_packages()
 
 import torch
@@ -40,7 +40,7 @@ text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
-# Project CLIP's 512-dim image embeddings to GPT-2's 768-dim embeddings.
+# This layer projects CLIP's 512-dimensional image embeddings to GPT-2's 768-dimensional space.
 image_projection = torch.nn.Linear(512, 768)
 
 # ------------------------------
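
For context on the updated comment: 512 is the image-embedding width of CLIP ViT-B/32 and 768 is GPT-2's hidden size, so the Linear(512, 768) maps one space onto the other. Below is a minimal standalone sketch of that step; the openai/clip-vit-base-patch32 checkpoint is an assumption (the CLIP model actually loaded by app.py is outside this hunk), and the projection is randomly initialized, just as in the diff.

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Assumed checkpoint: any CLIP variant with 512-dim image embeddings behaves the same here.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Same shape as the layer defined in app.py; untrained, so the output is only illustrative.
image_projection = torch.nn.Linear(512, 768)

image = Image.new("RGB", (224, 224), color="gray")  # placeholder image for the sketch
inputs = clip_processor(images=image, return_tensors="pt")
with torch.no_grad():
    image_features = clip_model.get_image_features(**inputs)                         # [1, 512]
    image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
    projected_image = image_projection(image_features)                               # [1, 768]
print(projected_image.shape)  # torch.Size([1, 768])
```

Since nothing trains this layer against GPT-2's embedding space, the projected vector is not yet meaningful on its own, which is presumably why the prompt keeps the [IMAGE_EMBEDDING] placeholder (see the later hunk).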
@@ -50,11 +50,11 @@ image_projection = torch.nn.Linear(512, 768)
 def multimodal_inference(text_input, image_input, audio_input):
     """
     Processes text, image, and audio inputs:
-    - Text is added directly.
-    - The image is processed via CLIP and projected (here, we append a placeholder).
-    - Audio is transcribed via Whisper.
+    - Text: used directly.
+    - Image: processed via CLIP and projected (here, we append a placeholder tag).
+    - Audio: transcribed using Whisper.
 
-    The final prompt is sent to the text model for generation.
+    The final prompt is fed to the text model (GPT-2) to generate a response.
     """
     prompt = ""
 
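
The reworded docstring describes a simple concatenation scheme for building the prompt. A tiny illustrative sketch of that flow (build_prompt and its inputs are hypothetical names, not code from app.py):

```python
# Illustrative only: mirrors the flow the docstring describes, not the exact app.py code.
def build_prompt(text_input, image_placeholder, audio_transcription):
    prompt = ""
    if text_input:
        prompt += text_input                  # text: used directly
    if image_placeholder:
        prompt += " " + image_placeholder     # image: placeholder tag after CLIP projection
    if audio_transcription:
        prompt += " " + audio_transcription   # audio: Whisper transcript
    return prompt.strip()

print(build_prompt("Describe this scene.", "[IMAGE_EMBEDDING]", "It was recorded at the beach."))
# Describe this scene. [IMAGE_EMBEDDING] It was recorded at the beach.
```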
@@ -71,7 +71,7 @@ def multimodal_inference(text_input, image_input, audio_input):
             # Normalize and project image features
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
             projected_image = image_projection(image_features)
-            # Append a placeholder tag (in a full system, you would fuse these embeddings)
+            # For demo purposes, we append a placeholder tag.
             prompt += " [IMAGE_EMBEDDING]"
         except Exception as e:
             print("Error processing image:", e)
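
The old comment hinted at fusing the projected embedding instead of appending a text placeholder. For reference, a hedged sketch of what such fusion could look like with GPT-2 from transformers, prepending the projected vector to the token embeddings via inputs_embeds; this is not what app.py does, and the random feature is a stand-in for the real projected CLIP output.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
text_model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt_ids = tokenizer("Describe the image:", return_tensors="pt").input_ids
token_embeds = text_model.transformer.wte(prompt_ids)                  # [1, T, 768]

projected_image = torch.randn(1, 768)                                  # stand-in for the projected CLIP feature
fused = torch.cat([projected_image.unsqueeze(1), token_embeds], dim=1) # image "token" goes first

with torch.no_grad():
    logits = text_model(inputs_embeds=fused).logits                    # [1, T+1, vocab_size]
next_id = logits[0, -1].argmax().item()
print(tokenizer.decode([next_id]))
```

Generation would then continue from the fused sequence; a trained projection (and usually some fine-tuning) is needed before the image token influences the output in a meaningful way.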
@@ -113,7 +113,7 @@ iface = gr.Interface(
     inputs=[
         gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text Input"),
         gr.Image(type="pil", label="Image Input (Optional)"),
-        gr.Audio(source="upload", type="filepath", label="Audio Input (Optional)")
+        gr.Audio(type="filepath", label="Audio Input (Optional)")
     ],
     outputs="text",
     title="Multi-Modal LLM Demo",
 
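
On the gr.Audio change: in Gradio 4.x the 3.x source= argument was replaced by a sources= list, so gr.Audio(source="upload", ...) is no longer accepted; dropping it, as this commit does, falls back to the component defaults (upload and microphone). The Space's Gradio version is an assumption here; under it, both of these are valid:

```python
import gradio as gr

# Gradio 4.x: restrict to file upload only (closest to the removed source="upload")
audio_upload_only = gr.Audio(sources=["upload"], type="filepath", label="Audio Input (Optional)")

# As in this commit: omit sources and accept the defaults
audio_default = gr.Audio(type="filepath", label="Audio Input (Optional)")
```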