atharvasc27112001 committed
Commit 3180216 · verified · 1 Parent(s): 87ac0d2

Update app.py

Files changed (1): app.py (+11, -21)
app.py CHANGED
@@ -1,20 +1,3 @@
-import sys
-import subprocess
-
-def upgrade_packages():
-    try:
-        print("Upgrading transformers and accelerate...")
-        subprocess.check_call([
-            sys.executable, "-m", "pip", "install", "--upgrade",
-            "transformers>=4.31.0", "accelerate>=0.20.0"
-        ])
-        print("Upgrade complete.")
-    except Exception as e:
-        print("Error upgrading packages:", e)
-
-# Uncomment the next line if you want to force an upgrade locally (not recommended on Spaces)
-# upgrade_packages()
-
 import torch
 from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
@@ -40,7 +23,7 @@ text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
-# This layer projects CLIP's 512-dimensional image embeddings to GPT-2's 768-dimensional space.
+# Project CLIP's 512-dimensional image embeddings to GPT-2's 768-dimensional space.
 image_projection = torch.nn.Linear(512, 768)
 
 # ------------------------------
@@ -96,11 +79,18 @@ def multimodal_inference(text_input, image_input, audio_input):
 
     print("Final fused prompt:", prompt)
 
-    # Generate text response using the text model
+    # Generate text response using the text model with advanced decoding parameters
     inputs = tokenizer(prompt, return_tensors="pt")
     with torch.no_grad():
-        outputs = text_model.generate(**inputs, max_length=200)
-        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        generated_ids = text_model.generate(
+            **inputs,
+            max_length=200,
+            temperature=0.7,         # Controls randomness (0=deterministic, 1=more random)
+            top_p=0.9,               # Limits sampling to the top 90% probability mass
+            repetition_penalty=1.2,  # Penalizes repeated phrases
+            do_sample=True           # Enables sampling (instead of greedy decoding)
+        )
+        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
 
     return generated_text
 
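
Note on the first hunk: it removes the runtime pip-upgrade helper, which is discouraged on Spaces. A common alternative (an assumption, not part of this commit) is to pin the same minimum versions in the Space's requirements.txt instead:

transformers>=4.31.0
accelerate>=0.20.0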
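
Note on the second hunk: it only rewords the comment above image_projection; the layer is defined here but applied elsewhere in app.py, outside this diff. A minimal sketch of how a 512-dimensional CLIP image embedding could be passed through the layer, assuming the openai/clip-vit-base-patch32 checkpoint and a hypothetical example.jpg (neither appears in this diff):

# Sketch only: project a CLIP image embedding to GPT-2's 768-dimensional hidden size.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")      # assumed checkpoint
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image_projection = torch.nn.Linear(512, 768)  # same layer shape as in app.py

image = Image.open("example.jpg")  # hypothetical input image
inputs = clip_processor(images=image, return_tensors="pt")
with torch.no_grad():
    image_embeds = clip_model.get_image_features(**inputs)  # shape: (1, 512)
    projected = image_projection(image_embeds)               # shape: (1, 768)
print(projected.shape)  # torch.Size([1, 768]), matching GPT-2's hidden size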
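
Note on the third hunk: it swaps greedy decoding for sampling. A self-contained sketch of the same decoding settings, assuming the "gpt2" checkpoint loaded elsewhere in app.py and a hypothetical fused prompt; the pad_token_id argument is an addition for this sketch, not part of the commit:

# Sketch only: reproduce the commit's sampling-based decoding in isolation.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
text_model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = "Describe the image: a dog playing in the park."  # hypothetical fused prompt
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    generated_ids = text_model.generate(
        **inputs,
        max_length=200,          # total length cap (prompt plus new tokens)
        temperature=0.7,         # values below 1 sharpen the distribution, reducing randomness
        top_p=0.9,               # nucleus sampling: keep the smallest set covering 90% probability
        repetition_penalty=1.2,  # discourage tokens that were already generated
        do_sample=True,          # sample instead of greedy decoding
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; this silences a warning
    )
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))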