import random
import sys

import torch
import torchaudio

from config import TANGO_FLUX_DIR

sys.path.append(TANGO_FLUX_DIR)  # make the local TangoFlux checkout importable
from tangoflux import TangoFluxInference
from transformers import T5EncoderModel

class GenerateAudio:
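    """Generate environmental audio with TangoFlux, prompted by detected objects.

    Each entry in `object_depths` is expected to provide an "original_label",
    a "mean_depth", and optionally a pre-built "sound_description" (this schema
    is inferred from how the methods below read the dictionaries).
    """
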
    def __init__(self):
        self.device = "cuda"
        self.model = None
        self.text_encoder = None
        
        # Basic categories for object classification
        self.categories = {
            'vehicle': ['car', 'bus', 'truck', 'motorcycle', 'bicycle', 'train', 'vehicle'],
            'nature': ['tree', 'bird', 'water', 'river', 'lake', 'ocean', 'rain', 'wind', 'forest'],
            'urban': ['traffic', 'building', 'street', 'signal', 'construction'],
            'animal': ['dog', 'cat', 'bird', 'insect', 'frog', 'squirrel'],
            'human': ['person', 'people', 'crowd', 'child', 'footstep', 'voice'],
            'indoor': ['door', 'window', 'chair', 'table', 'fan', 'appliance', 'tv', 'radio']
        }
        
        # Suffix patterns used as a fallback when no category keyword matches
        self.suffixes = {
            'tree': 'nature',
            'bird': 'animal',
            'car': 'vehicle',
            'truck': 'vehicle',
            'signal': 'urban'
        }
    
    def _load_model(self):
        """Lazily load the TangoFlux pipeline and T5 text encoder on first use."""
        if self.model is None:
            self.model = TangoFluxInference(name='declare-lab/TangoFlux')
        if self.text_encoder is None:
            self.text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-large")
        # Ensure the encoder sits on the target device in eval mode.
        self.text_encoder = self.text_encoder.to(self.device).eval()

    def generate_sound(self, prompt, steps=25, duration=10, guidance_scale=4.5, disable_progress=True):
        """Synthesize audio for `prompt`; returns a [channels, samples] tensor."""
        self._load_model()
        with torch.no_grad():
            latents = self.model.model.inference_flow(
                prompt,
                duration=duration,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                disable_progress=disable_progress
            )
            # Decode the latents to a waveform and drop the batch dimension.
            wave = self.model.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]
        # Trim to the requested duration in samples.
        waveform_end = int(duration * self.model.vae.config.sampling_rate)
        wave = wave[:, :waveform_end]
        return wave
    
    def _categorize_object(self, object_name):
        """Categorize an object based on keywords or patterns"""
        object_lower = object_name.lower()
        
        # Check if the object contains any category keywords
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in object_lower:
                    return category
        
        # Check suffix/prefix patterns
        words = object_lower.split()
        for word in words:
            for suffix, category in self.suffixes.items():
                if word.endswith(suffix):
                    return category
        
        return "unknown"
    
    def _describe_object_sound(self, object_name, zone):
        """Generate an appropriate sound description based on object type and distance"""
        category = self._categorize_object(object_name)
        
        # Volume descriptor based on zone
        volume_descriptors = {
            "near": ["prominent", "clear", "loud", "distinct"],
            "medium": ["moderate", "audible", "present"],
            "far": ["subtle", "distant", "faint", "soft"]
        }
        
        # Default to the "medium" descriptors if an unrecognized zone is passed.
        volume = random.choice(volume_descriptors.get(zone, volume_descriptors["medium"]))
        
        # Sound descriptors based on category
        sound_templates = {
            "vehicle": [
                "{volume} engine sounds from the {object}",
                "{volume} mechanical noise of the {object}",
                "the {object} creating {volume} road noise",
                "{volume} sounds of the {object} in motion"
            ],
            "nature": [
                "{volume} rustling of the {object}",
                "the {object} making {volume} natural sounds",
                "{volume} environmental sounds from the {object}",
                "the {object} with {volume} movement in the wind"
            ],
            "urban": [
                "{volume} urban sounds around the {object}",
                "the {object} with {volume} city ambience",
                "{volume} noise from the {object}",
                "the {object} contributing to {volume} street sounds"
            ],
            "animal": [
                "{volume} calls from the {object}",
                "the {object} making {volume} animal sounds",
                "{volume} sounds of the {object}",
                "the {object} with its {volume} presence"
            ],
            "human": [
                "{volume} voices from the {object}",
                "the {object} creating {volume} human sounds",
                "{volume} movement sounds from the {object}",
                "the {object} with {volume} activity"
            ],
            "indoor": [
                "{volume} ambient sounds around the {object}",
                "the {object} making {volume} indoor noises",
                "{volume} mechanical sounds from the {object}",
                "the {object} with its {volume} presence"
            ],
            "unknown": [
                "{volume} sounds from the {object}",
                "the {object} creating {volume} audio",
                "{volume} noises associated with the {object}",
                "the {object} with its {volume} acoustic presence"
            ]
        }
        
        # Select a template for this category
        templates = sound_templates.get(category, sound_templates["unknown"])
        template = random.choice(templates)
        
        # Fill in the template
        description = template.format(volume=volume, object=object_name)
        return description
    
    def create_audio_prompt(self, object_depths):
        """Return the first usable pre-built sound description, or a fallback prompt."""
        if not object_depths:
            return "Environmental ambient sounds."
        
        for obj in object_depths:
            # Skip missing or trivially short descriptions.
            if obj.get("sound_description") and len(obj["sound_description"]) > 5:
                return obj["sound_description"]
        return f"Sounds of {object_depths[0]['original_label']}."

    def process_and_generate_audio(self, object_depths, output_path=None, duration=10, steps=25, guidance_scale=4.5):
        """Build a prompt from depth-annotated objects, generate audio, and optionally save it."""
        self._load_model()
        
        if not object_depths:
            prompt = "Environmental ambient sounds."
        else:
            # Sort objects by depth to prioritize closer objects
            sorted_objects = sorted(object_depths, key=lambda x: x["mean_depth"])
            prompt = self.create_audio_prompt(sorted_objects)
        
        print(f"Generated audio prompt: {prompt}")
        
        wave = self.generate_sound(
            prompt, 
            steps=steps,
            duration=duration,
            guidance_scale=guidance_scale
        )
        
        sample_rate = self.model.vae.config.sampling_rate
        
        if output_path:
            # `wave` is already [channels, samples], the 2-D layout torchaudio.save
            # expects, so no extra batch dimension should be added here.
            torchaudio.save(output_path, wave, sample_rate)
            print(f"Audio saved to: {output_path}")
        
        return wave, sample_rate
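

if __name__ == "__main__":
    # Minimal usage sketch. The entries below are illustrative: the expected
    # keys ("original_label", "mean_depth", "sound_description") follow how
    # this class reads them above, and "output.wav" is just an example path.
    generator = GenerateAudio()
    example_objects = [
        {"original_label": "car", "mean_depth": 2.5,
         "sound_description": "prominent engine sounds from the car"},
        {"original_label": "tree", "mean_depth": 8.0,
         "sound_description": "distant rustling of the tree"},
    ]
    wave, sample_rate = generator.process_and_generate_audio(
        example_objects, output_path="output.wav", duration=10
    )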