Update app.py
app.py CHANGED
@@ -1,20 +1,3 @@
-import sys
-import subprocess
-
-def upgrade_packages():
-    try:
-        print("Upgrading transformers and accelerate...")
-        subprocess.check_call([
-            sys.executable, "-m", "pip", "install", "--upgrade",
-            "transformers>=4.31.0", "accelerate>=0.20.0"
-        ])
-        print("Upgrade complete.")
-    except Exception as e:
-        print("Error upgrading packages:", e)
-
-# Uncomment the next line if you want to force an upgrade locally (not recommended on Spaces)
-# upgrade_packages()
-
 import torch
 from transformers import CLIPProcessor, CLIPModel, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
@@ -40,7 +23,7 @@ text_model = AutoModelForCausalLM.from_pretrained("gpt2")
 # Define Projection Layers
 # ------------------------------
 print("Initializing image projection layer...")
-#
+# Project CLIP's 512-dimensional image embeddings to GPT-2's 768-dimensional space.
 image_projection = torch.nn.Linear(512, 768)
 
 # ------------------------------
@@ -96,11 +79,18 @@ def multimodal_inference(text_input, image_input, audio_input):
 
     print("Final fused prompt:", prompt)
 
-    # Generate text response using the text model
+    # Generate text response using the text model with advanced decoding parameters
     inputs = tokenizer(prompt, return_tensors="pt")
     with torch.no_grad():
-        generated_ids = text_model.generate(**inputs, max_length=200)
-        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
+        generated_ids = text_model.generate(
+            **inputs,
+            max_length=200,
+            temperature=0.7,         # Controls randomness (0=deterministic, 1=more random)
+            top_p=0.9,               # Limits sampling to the top 90% probability mass
+            repetition_penalty=1.2,  # Penalizes repeated phrases
+            do_sample=True           # Enables sampling (instead of greedy decoding)
+        )
+        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
 
     return generated_text
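Removing the runtime upgrade_packages() hook matches standard Spaces practice: dependencies are declared in the Space's requirements.txt and installed at build time, so upgrading packages from inside app.py only slows startup and can desynchronize the environment. A minimal requirements.txt sketch using the same version floors the removed code requested (the Space's actual file is not part of this commit, and the torch/gradio entries are assumptions based on app.py's imports):

transformers>=4.31.0
accelerate>=0.20.0
torch
gradio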
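The new comment on the projection layer pins down the key dimensions: CLIP (ViT-B/32) image embeddings are 512-dimensional, while GPT-2's hidden size is 768, so a learned torch.nn.Linear(512, 768) maps one space into the other. A standalone sketch of that mapping (the tensor names here are illustrative, not from app.py):

import torch

# Project a 512-d CLIP image embedding into GPT-2's 768-d hidden space.
image_projection = torch.nn.Linear(512, 768)

clip_embedding = torch.randn(1, 512)          # stand-in for CLIPModel.get_image_features(...)
projected = image_projection(clip_embedding)  # shape: (1, 768)
print(projected.shape)                        # torch.Size([1, 768])

Note that the layer is randomly initialized here; until it is trained on paired image-text data, the projected vector carries no meaning for GPT-2.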
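One detail worth knowing about the new decoding block: temperature, top_p, and repetition_penalty only take effect because do_sample=True is set; under the default greedy decoding they are ignored (newer transformers versions emit a warning when sampling flags are passed with do_sample=False). A self-contained sketch of the same configuration, with an illustrative prompt and an added pad_token_id to silence the warning GPT-2 otherwise produces because it defines no pad token:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
text_model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Describe the scene in the image:", return_tensors="pt")
with torch.no_grad():
    generated_ids = text_model.generate(
        **inputs,
        max_length=200,          # total length cap, prompt tokens included
        temperature=0.7,         # <1 sharpens the distribution toward likely tokens
        top_p=0.9,               # nucleus sampling: keep the top 90% probability mass
        repetition_penalty=1.2,  # discourages verbatim repetition
        do_sample=True,          # required for temperature/top_p to apply
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token
    )
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))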