atharvasc27112001 committed
Commit f52b61a · verified · 1 Parent(s): d6de1a9

Update app.py

Files changed (1)
  1. app.py +12 -12
app.py CHANGED
@@ -12,10 +12,9 @@ def upgrade_packages():
     except Exception as e:
         print("Error upgrading packages:", e)
 
-# Attempt to upgrade the packages
-upgrade_packages()
+# Uncomment the next line for local debugging.
+# upgrade_packages()
 
-# Now import the libraries
 import torch
 from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
@@ -41,6 +40,7 @@ text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
+# Project CLIP's 512-dim image embeddings to GPT-2's 768-dim embeddings.
 image_projection = torch.nn.Linear(512, 768)
 
 # ------------------------------
@@ -49,10 +49,11 @@ image_projection = torch.nn.Linear(512, 768)
 
 def multimodal_inference(text_input, image_input, audio_input):
     """
-    Processes text, image, and audio inputs.
+    Processes text, image, and audio inputs:
       - Text is added directly.
-      - The image is processed via CLIP, its embedding is projected, and a placeholder tag is appended.
-      - Audio is transcribed using Whisper and appended.
+      - The image is processed via CLIP and projected (here, we append a placeholder).
+      - Audio is transcribed via Whisper.
+
     The final prompt is sent to the text model for generation.
     """
     prompt = ""
@@ -67,11 +68,10 @@ def multimodal_inference(text_input, image_input, audio_input):
             clip_inputs = clip_processor(images=image_input, return_tensors="pt")
             with torch.no_grad():
                 image_features = clip_model.get_image_features(**clip_inputs)
-            # Normalize image features
+            # Normalize and project image features
            image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
-            # Project image embedding into GPT-2's embedding space
             projected_image = image_projection(image_features)
-            # For demo purposes, we append a placeholder tag.
+            # Append a placeholder tag (in a full system, you would fuse these embeddings)
             prompt += " [IMAGE_EMBEDDING]"
         except Exception as e:
             print("Error processing image:", e)
@@ -111,9 +111,9 @@ def multimodal_inference(text_input, image_input, audio_input):
 iface = gr.Interface(
     fn=multimodal_inference,
     inputs=[
-        gr.inputs.Textbox(lines=5, placeholder="Enter your text here...", label="Text Input"),
-        gr.inputs.Image(type="pil", label="Image Input (Optional)"),
-        gr.inputs.Audio(source="upload", type="filepath", label="Audio Input (Optional)")
+        gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text Input"),
+        gr.Image(type="pil", label="Image Input (Optional)"),
+        gr.Audio(source="upload", type="filepath", label="Audio Input (Optional)")
     ],
     outputs="text",
     title="Multi-Modal LLM Demo",