import random
import sys

import torch
import torchaudio

from config import TANGO_FLUX_DIR

sys.path.append(TANGO_FLUX_DIR)  # make the local TangoFlux checkout importable
from tangoflux import TangoFluxInference
from transformers import T5EncoderModel

class GenerateAudio:
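    """Generate environmental audio with TangoFlux, prompted by detected objects.

    Each entry in `object_depths` is expected to provide an "original_label",
    a "mean_depth", and optionally a pre-built "sound_description" (this schema
    is inferred from how the methods below read the dictionaries).
    """
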
    def __init__(self):
        self.device = "cuda"
        self.model = None
        self.text_encoder = None
        
        # Basic categories for object classification
        self.categories = {
            'vehicle': ['car', 'bus', 'truck', 'motorcycle', 'bicycle', 'train', 'vehicle'],
            'nature': ['tree', 'bird', 'water', 'river', 'lake', 'ocean', 'rain', 'wind', 'forest'],
            'urban': ['traffic', 'building', 'street', 'signal', 'construction'],
            'animal': ['dog', 'cat', 'bird', 'insect', 'frog', 'squirrel'],
            'human': ['person', 'people', 'crowd', 'child', 'footstep', 'voice'],
            'indoor': ['door', 'window', 'chair', 'table', 'fan', 'appliance', 'tv', 'radio']
        }
        
        # Suffix patterns used as a fallback when no category keyword matches
        self.suffixes = {
            'tree': 'nature',
            'bird': 'animal',
            'car': 'vehicle',
            'truck': 'vehicle',
            'signal': 'urban'
        }
    
    def _load_model(self):
        """Lazily load the TangoFlux pipeline and T5 text encoder on first use."""
        if self.model is None:
            self.model = TangoFluxInference(name='declare-lab/TangoFlux')
        if self.text_encoder is None:
            self.text_encoder = T5EncoderModel.from_pretrained("google/flan-t5-large")
        # Ensure the encoder sits on the target device in eval mode.
        self.text_encoder = self.text_encoder.to(self.device).eval()

    def generate_sound(self, prompt, steps=25, duration=10, guidance_scale=4.5, disable_progress=True):
        """Synthesize audio for `prompt`; returns a [channels, samples] tensor."""
        self._load_model()
        with torch.no_grad():
            latents = self.model.model.inference_flow(
                prompt,
                duration=duration,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                disable_progress=disable_progress
            )
            # Decode the latents to a waveform and drop the batch dimension.
            wave = self.model.vae.decode(latents.transpose(2, 1)).sample.cpu()[0]
        # Trim to the requested duration in samples.
        waveform_end = int(duration * self.model.vae.config.sampling_rate)
        wave = wave[:, :waveform_end]
        return wave
    
    def _categorize_object(self, object_name):
        """Categorize an object based on keywords or patterns"""
        object_lower = object_name.lower()
        
        # Check if the object contains any category keywords
        for category, keywords in self.categories.items():
            for keyword in keywords:
                if keyword in object_lower:
                    return category
        
        # Check suffix/prefix patterns
        words = object_lower.split()
        for word in words:
            for suffix, category in self.suffixes.items():
                if word.endswith(suffix):
                    return category
        
        return "unknown"
    
    def _describe_object_sound(self, object_name, zone):
        """Generate an appropriate sound description based on object type and distance"""
        category = self._categorize_object(object_name)
        
        # Volume descriptor based on zone
        volume_descriptors = {
            "near": ["prominent", "clear", "loud", "distinct"],
            "medium": ["moderate", "audible", "present"],
            "far": ["subtle", "distant", "faint", "soft"]
        }
        
        # Default to the "medium" descriptors if an unrecognized zone is passed.
        volume = random.choice(volume_descriptors.get(zone, volume_descriptors["medium"]))
        
        # Sound descriptors based on category
        sound_templates = {
            "vehicle": [
                "{volume} engine sounds from the {object}",
                "{volume} mechanical noise of the {object}",
                "the {object} creating {volume} road noise",
                "{volume} sounds of the {object} in motion"
            ],
            "nature": [
                "{volume} rustling of the {object}",
                "the {object} making {volume} natural sounds",
                "{volume} environmental sounds from the {object}",
                "the {object} with {volume} movement in the wind"
            ],
            "urban": [
                "{volume} urban sounds around the {object}",
                "the {object} with {volume} city ambience",
                "{volume} noise from the {object}",
                "the {object} contributing to {volume} street sounds"
            ],
            "animal": [
                "{volume} calls from the {object}",
                "the {object} making {volume} animal sounds",
                "{volume} sounds of the {object}",
                "the {object} with its {volume} presence"
            ],
            "human": [
                "{volume} voices from the {object}",
                "the {object} creating {volume} human sounds",
                "{volume} movement sounds from the {object}",
                "the {object} with {volume} activity"
            ],
            "indoor": [
                "{volume} ambient sounds around the {object}",
                "the {object} making {volume} indoor noises",
                "{volume} mechanical sounds from the {object}",
                "the {object} with its {volume} presence"
            ],
            "unknown": [
                "{volume} sounds from the {object}",
                "the {object} creating {volume} audio",
                "{volume} noises associated with the {object}",
                "the {object} with its {volume} acoustic presence"
            ]
        }
        
        # Select a template for this category
        templates = sound_templates.get(category, sound_templates["unknown"])
        template = random.choice(templates)
        
        # Fill in the template
        description = template.format(volume=volume, object=object_name)
        return description
    
    def create_audio_prompt(self, object_depths):
        """Return the first usable pre-built sound description, or a fallback prompt."""
        if not object_depths:
            return "Environmental ambient sounds."
        
        for obj in object_depths:
            # Skip missing or trivially short descriptions.
            if obj.get("sound_description") and len(obj["sound_description"]) > 5:
                return obj["sound_description"]
        return f"Sounds of {object_depths[0]['original_label']}."

    def process_and_generate_audio(self, object_depths, output_path=None, duration=10, steps=25, guidance_scale=4.5):
        """Build a prompt from depth-annotated objects, generate audio, and optionally save it."""
        self._load_model()
        
        if not object_depths:
            prompt = "Environmental ambient sounds."
        else:
            # Sort objects by depth to prioritize closer objects
            sorted_objects = sorted(object_depths, key=lambda x: x["mean_depth"])
            prompt = self.create_audio_prompt(sorted_objects)
        
        print(f"Generated audio prompt: {prompt}")
        
        wave = self.generate_sound(
            prompt, 
            steps=steps,
            duration=duration,
            guidance_scale=guidance_scale
        )
        
        sample_rate = self.model.vae.config.sampling_rate
        
        if output_path:
            # `wave` is already [channels, samples], the 2-D layout torchaudio.save
            # expects, so no extra batch dimension should be added here.
            torchaudio.save(output_path, wave, sample_rate)
            print(f"Audio saved to: {output_path}")
        
        return wave, sample_rate
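

if __name__ == "__main__":
    # Minimal usage sketch. The entries below are illustrative: the expected
    # keys ("original_label", "mean_depth", "sound_description") follow how
    # this class reads them above, and "output.wav" is just an example path.
    generator = GenerateAudio()
    example_objects = [
        {"original_label": "car", "mean_depth": 2.5,
         "sound_description": "prominent engine sounds from the car"},
        {"original_label": "tree", "mean_depth": 8.0,
         "sound_description": "distant rustling of the tree"},
    ]
    wave, sample_rate = generator.process_and_generate_audio(
        example_objects, output_path="output.wav", duration=10
    )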