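"""Generate audio from street-view imagery.

Pipeline: caption the requested view(s) with a vision-language model,
match each view's image to its processed depth map, derive per-object
depths, synthesize an audio track per view, and (in panoramic mode)
mix the per-view tracks into a single composition.
"""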
import os
import argparse
import gc

import torch
import torchaudio

from config import LOGS_DIR, OUTPUT_DIR
from SoundMapper import SoundMapper
from GenerateAudio import GenerateAudio
from GenerateCaptions import generate_caption
from audio_mixer import compose_audio
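

# torchaudio.save expects a 2-D (channels, frames) tensor; this helper
# consolidates the shape handling that was duplicated before both save
# calls below.
def ensure_2d(audio: torch.Tensor) -> torch.Tensor:
    if audio.dim() == 3:
        audio = audio.squeeze(0)    # drop a leading batch dimension
    elif audio.dim() == 1:
        audio = audio.unsqueeze(0)  # treat a flat waveform as a single channel
    if audio.dim() != 2:
        raise ValueError(f"Could not convert audio tensor of shape {audio.shape} to 2D")
    return audio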


def main():
    parser = argparse.ArgumentParser(description="Generate sound from panoramic images")
    parser.add_argument("--image_dir", type=str, default=LOGS_DIR, help="Directory containing input images")
    parser.add_argument("--output_dir", type=str, default=OUTPUT_DIR, help="Directory for output files")
    parser.add_argument("--audio_duration", type=int, default=10, help="Duration of generated audio in seconds")
    parser.add_argument("--location", type=str, default="52.3436723,4.8529625",
                        help='Location in format "latitude,longitude" (e.g., "40.7128,-74.0060")')
    parser.add_argument("--view", type=str, default="front", choices=["front", "back", "left", "right"],
                        help="Perspective view to analyze")
    parser.add_argument("--model", type=str, default="intern_2_5-4B", help="Vision-language model to use for analysis")
    parser.add_argument("--cpu_only", action="store_true", help="Force CPU usage even if CUDA is available")
    parser.add_argument("--panoramic", action="store_true",
                        help="Process panoramic images instead of a single image")
    args = parser.parse_args()

    lat, lon = (part.strip() for part in args.location.split(","))
    os.makedirs(args.output_dir, exist_ok=True)
    if args.panoramic:
        print("-----------Processing panoramic images-----------")
        # Generate captions for all views at once with panoramic=True
        view_results = generate_caption(lat, lon, view=args.view, model=args.model,
                                        cpu_only=args.cpu_only, panoramic=True)
        if not view_results:
            print("Failed to generate captions for panoramic views")
            return

        sound_mapper = SoundMapper()
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = [os.path.join(args.image_dir, f) for f in os.listdir(args.image_dir) if f.endswith(".jpg")]

        # Create audio generator
        audio_generator = GenerateAudio()
        sound_tracks_dict = {}  # keep track of generated sound tracks and their weights
        # Process each view
        for i, view_result in enumerate(view_results):
            current_view = view_result["view"]
            print(f"Processing {current_view} view ({i + 1}/{len(view_results)})")

            # Find the corresponding image path for this view
            image_path = os.path.join(args.image_dir, f"{current_view}.jpg")
            if not os.path.exists(image_path):
                print(f"Warning: Image file {image_path} not found")
                continue
            image_index = [idx for idx, path in enumerate(image_paths)
                           if os.path.basename(path) == f"{current_view}.jpg"]
            if not image_index:
                print(f"Could not find processed map for {current_view} view")
                continue
            depth_map = processed_maps[image_index[0]]["normalization"]

            object_depths = sound_mapper.analyze_object_depths(
                image_path, depth_map, lat, lon,
                caption_data=view_result,
                all_objects=False
            )
            if not object_depths:
                print(f"No objects detected in the {current_view} view")
                continue
            # Generate audio for this view
            output_path = os.path.join(args.output_dir, f"sound_{current_view}.wav")
            print(f"Generating audio for {current_view} view...")
            audio, sample_rate = audio_generator.process_and_generate_audio(
                object_depths,
                duration=args.audio_duration
            )
            audio = ensure_2d(audio)
            torchaudio.save(output_path, audio, sample_rate)

            # Weight this view's track by its first detected object's weight
            sound_tracks_dict[output_path] = object_depths[0]["weight"]
            print(f"Generated audio saved to: {output_path}")
            print("-" * 50)
        if sound_tracks_dict:
            print("Composing final audio from all views...")
            composition_path = os.path.join(args.output_dir, "panoramic_composition.wav")
            compose_audio(
                list(sound_tracks_dict.keys()),
                list(sound_tracks_dict.values()),
                composition_path
            )
            print(f"Final audio composition saved to: {composition_path}")

        # Release the models, then free any cached GPU memory
        del sound_mapper, audio_generator
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    else:
        print("Processing single image...")
        view_result = generate_caption(lat, lon, view=args.view, model=args.model,
                                       cpu_only=args.cpu_only, panoramic=False)
        if not view_result:
            print("Failed to generate caption for the view")
            return

        image_path = os.path.join(args.image_dir, f"{args.view}.jpg")
        if not os.path.exists(image_path):
            print(f"Error: Image file {image_path} not found")
            return
        print(f"Processing image: {image_path}")

        sound_mapper = SoundMapper()
        processed_maps = sound_mapper.process_depth_maps()
        image_paths = [os.path.join(args.image_dir, f) for f in os.listdir(args.image_dir) if f.endswith(".jpg")]
        image_basename = os.path.basename(image_path)
        image_index = [i for i, path in enumerate(image_paths) if os.path.basename(path) == image_basename]
        if not image_index:
            print(f"Could not find processed map for {image_basename}")
            return
        depth_map = processed_maps[image_index[0]]["normalization"]
print("Detecting objects and their depths...") | |
object_depths = sound_mapper.analyze_object_depths( | |
image_path, depth_map, lat, lon, | |
caption_data=view_result, | |
all_objects=True | |
) | |
if not object_depths: | |
print("No objects detected in the image.") | |
return | |
print(f"Detected {len(object_depths)} objects:") | |
for obj in object_depths: | |
print(f" - {obj['original_label']} (Zone: {obj['zone_description']}, Depth: {obj['mean_depth']:.4f})") | |
print("Generating audio...") | |
audio_generator = GenerateAudio() | |
audio, sample_rate = audio_generator.process_and_generate_audio( | |
object_depths, | |
duration=args.audio_duration | |
) | |
        audio = ensure_2d(audio)
        output_path = os.path.join(args.output_dir, f"sound_{args.view}.wav")
        torchaudio.save(output_path, audio, sample_rate)
        print(f"Generated audio saved to: {output_path}")


if __name__ == "__main__":
    main()

# Usage:
# (For a single image):    python main.py --view front
# (For panoramic images):  python main.py --panoramic
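# Example with explicit options (values are illustrative; all flags are
# defined in the argparse setup above):
# python main.py --panoramic --location "40.7128,-74.0060" --audio_duration 15 --cpu_only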