# SoundingStreet / app.py
import os
import spaces
import gc
from pathlib import Path
import gradio as gr
import torch
import torchaudio
from config import LOGS_DIR, OUTPUT_DIR
from SoundMapper import SoundMapper
from GenerateAudio import GenerateAudio
from GenerateCaptions import generate_caption
from audio_mixer import compose_audio
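# Pipeline overview (as wired below): generate_caption describes the street-view
# image(s), SoundMapper turns depth maps into per-object depth estimates,
# GenerateAudio synthesizes a clip per view, and compose_audio mixes the
# per-view clips into a single panoramic track.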
# Ensure required directories exist
os.makedirs(LOGS_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Prepare external model dir and download checkpoint if missing
depthfm_ckpt = Path('external_models/depth-fm/checkpoints/depthfm-v1.ckpt')
if not depthfm_ckpt.exists():
    depthfm_ckpt.parent.mkdir(parents=True, exist_ok=True)
    os.system('wget https://ommer-lab.com/files/depthfm/depthfm-v1.ckpt -P external_models/depth-fm/checkpoints/')
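# The DepthFM checkpoint is downloaded once; presumably it is what
# SoundMapper.process_depth_maps() uses to estimate depth for each view image.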
# Clear CUDA cache between runs
def clear_cuda():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
def process_images(
    image_dir: str,
    output_dir: str,
    panoramic: bool,
    view: str,
    model: str,
    location: str,
    audio_duration: int,
    cpu_only: bool
) -> None:
    # Core processing logic; all generated files are written under output_dir
    lat, lon = (part.strip() for part in location.split(","))
    os.makedirs(output_dir, exist_ok=True)
    sound_mapper = SoundMapper()
    audio_generator = GenerateAudio()
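    # Two modes: panoramic generates one clip per available view plus a mixed
    # composition; single-view generates exactly one clip for `view`.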
    if panoramic:
        # Panoramic: generate per-view audio, then a mixed composition
        view_results = generate_caption(lat, lon, view=view, model=model,
                                        cpu_only=cpu_only, panoramic=True)
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = sorted(Path(image_dir).glob("*.jpg"))
        audios = {}
        for vr in view_results:
            cv = vr["view"]
            img_file = Path(image_dir) / f"{cv}.jpg"
            if not img_file.exists():
                continue
            idx = [i for i, p in enumerate(image_paths) if p.name == img_file.name]
            if not idx:
                continue
            depth_map = processed_maps[idx[0]]["normalization"]
            obj_depths = sound_mapper.analyze_object_depths(
                str(img_file), depth_map, lat, lon,
                caption_data=vr, all_objects=False
            )
            if not obj_depths:
                continue
            out_wav = Path(output_dir) / f"sound_{cv}.wav"
            audio, sr = audio_generator.process_and_generate_audio(
                obj_depths, duration=audio_duration
            )
            # torchaudio.save expects a 2D (channels, samples) tensor
            if audio.dim() == 3:
                audio = audio.squeeze(0)
            elif audio.dim() == 1:
                audio = audio.unsqueeze(0)
            torchaudio.save(str(out_wav), audio, sr)
            audios[cv] = str(out_wav)
        # Final panoramic composition: mix all per-view clips with equal weights
        comp = Path(output_dir) / "panoramic_composition.wav"
        compose_audio(list(audios.values()), [1.0] * len(audios), str(comp))
        audios['panorama'] = str(comp)
        clear_cuda()
        return
    # Single-view: generate one audio clip for the selected view
    vr = generate_caption(lat, lon, view=view, model=model,
                          cpu_only=cpu_only, panoramic=False)
    img_file = Path(image_dir) / f"{view}.jpg"
    processed_maps = sound_mapper.process_depth_maps()
    image_paths = sorted(Path(image_dir).glob("*.jpg"))
    idx = [i for i, p in enumerate(image_paths) if p.name == img_file.name]
    if not idx:
        raise FileNotFoundError(f"No image found for view '{view}' in {image_dir}")
    depth_map = processed_maps[idx[0]]["normalization"]
    obj_depths = sound_mapper.analyze_object_depths(
        str(img_file), depth_map, lat, lon,
        caption_data=vr, all_objects=True
    )
    out_wav = Path(output_dir) / f"sound_{view}.wav"
    audio, sr = audio_generator.process_and_generate_audio(obj_depths, duration=audio_duration)
    # torchaudio.save expects a 2D (channels, samples) tensor
    if audio.dim() == 3:
        audio = audio.squeeze(0)
    elif audio.dim() == 1:
        audio = audio.unsqueeze(0)
    torchaudio.save(str(out_wav), audio, sr)
    clear_cuda()
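# Example (illustrative only): running the pipeline outside the UI, assuming
# LOGS_DIR already contains the front/back/left/right .jpg view images.
# process_images(
#     image_dir=LOGS_DIR,
#     output_dir=OUTPUT_DIR,
#     panoramic=True,
#     view="front",
#     model="intern_2_5-4B",
#     location="52.3436723,4.8529625",
#     audio_duration=10,
#     cpu_only=False,
# )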
# Gradio UI
demo = gr.Blocks(title="Panoramic Audio Generator")
with demo:
    gr.Markdown("""
# Panoramic Audio Generator
Displays each view with its audio side by side.
""")
    with gr.Row():
        panoramic = gr.Checkbox(label="Panoramic (multi-view)", value=False)
        view = gr.Dropdown(["front", "back", "left", "right"], value="front", label="View")
        location = gr.Textbox(value="52.3436723,4.8529625", label="Location (lat,lon)")
        model = gr.Textbox(value="intern_2_5-4B", label="Vision-Language Model")
        audio_duration = gr.Slider(1, 60, value=10, step=1, label="Audio Duration (sec)")
        cpu_only = gr.Checkbox(label="CPU Only", value=False)
    btn = gr.Button("Generate")
    # Output layout: two rows of two view/audio pairs
    with gr.Row():
        with gr.Column():
            img_front = gr.Image(label="Front View", type="filepath")
            aud_front = gr.Audio(label="Front Audio", type="filepath")
        with gr.Column():
            img_back = gr.Image(label="Back View", type="filepath")
            aud_back = gr.Audio(label="Back Audio", type="filepath")
    with gr.Row():
        with gr.Column():
            img_left = gr.Image(label="Left View", type="filepath")
            aud_left = gr.Audio(label="Left Audio", type="filepath")
        with gr.Column():
            img_right = gr.Image(label="Right View", type="filepath")
            aud_right = gr.Audio(label="Right Audio", type="filepath")
    # Panoramic composition at the bottom
    img_pan = gr.Image(label="Panorama View", type="filepath")
    aud_pan = gr.Audio(label="Panoramic Audio", type="filepath")
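    # run_all below returns None for any view whose image or audio file is
    # missing, so unused output slots simply render empty.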
    # Run the full pipeline, then collect output files for the preview components
    @spaces.GPU
    def run_all(pan, vw, loc, mdl, dur, cpu):
        # Generate image/audio files on disk
        process_images(LOGS_DIR, OUTPUT_DIR, pan, vw, mdl, loc, dur, cpu)
        # Collect whatever files exist for each view
        views = ["front", "back", "left", "right", "panorama"]
        paths = {}
        for v in views:
            img = Path(LOGS_DIR) / f"{v}.jpg"
            audio = Path(OUTPUT_DIR) / ("panoramic_composition.wav" if v == "panorama" else f"sound_{v}.wav")
            paths[v] = {
                'img': str(img) if img.exists() else None,
                'aud': str(audio) if audio.exists() else None
            }
        return (
            paths['front']['img'], paths['front']['aud'],
            paths['back']['img'], paths['back']['aud'],
            paths['left']['img'], paths['left']['aud'],
            paths['right']['img'], paths['right']['aud'],
            paths['panorama']['img'], paths['panorama']['aud']
        )
    btn.click(
        fn=run_all,
        inputs=[panoramic, view, location, model, audio_duration, cpu_only],
        outputs=[
            img_front, aud_front,
            img_back, aud_back,
            img_left, aud_left,
            img_right, aud_right,
            img_pan, aud_pan
        ]
    )
if __name__ == "__main__":
    demo.launch(show_api=False)