# Standard library imports
import os
import re
import json
from io import TextIOWrapper
from typing import Annotated, Optional, Tuple, List, Dict
# Related third party imports
import torch
import faster_whisper
from pydub import AudioSegment
from deepmultilingualpunctuation import PunctuationModel
# Local imports
from src.audio.utils import TokenizerUtils
class AudioProcessor:
"""
A class to handle various audio processing tasks, such as conversion,
trimming, merging, and audio transformations.
Parameters
----------
audio_path : str
Path to the audio file to process.
temp_dir : str, optional
Directory for storing temporary files. Defaults to ".temp".
Attributes
----------
audio_path : str
Path to the input audio file.
temp_dir : str
Path to the temporary directory for processed files.
mono_audio_path : Optional[str]
Path to the mono audio file after conversion.
Methods
-------
convert_to_mono()
Converts the audio file to mono.
get_duration()
Gets the duration of the audio file in seconds.
change_format(new_format)
Converts the audio file to a new format.
trim_audio(start_time, end_time)
Trims the audio file to the specified time range.
adjust_volume(change_in_db)
Adjusts the volume of the audio file.
get_channels()
Gets the number of audio channels.
fade_in_out(fade_in_duration, fade_out_duration)
Applies fade-in and fade-out effects to the audio.
merge_audio(other_audio_path)
Merges the current audio with another audio file.
split_audio(chunk_duration)
Splits the audio file into chunks of a specified duration.
create_manifest(manifest_path)
Creates a manifest file containing metadata about the audio.
"""
def __init__(
self,
audio_path: Annotated[str, "Path to the audio file"],
temp_dir: Annotated[str, "Directory for temporary processed files"] = ".temp"
) -> None:
if not isinstance(audio_path, str):
raise TypeError("Expected 'audio_path' to be a string.")
if not isinstance(temp_dir, str):
raise TypeError("Expected 'temp_dir' to be a string.")
self.audio_path = audio_path
self.temp_dir = temp_dir
self.mono_audio_path = None
os.makedirs(temp_dir, exist_ok=True)
def convert_to_mono(self) -> Annotated[str, "Path to the mono audio file"]:
"""
Convert the audio file to mono.
Returns
-------
str
Path to the mono audio file.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> mono_path = processor.convert_to_mono()
>>> isinstance(mono_path, str)
True
"""
sound = AudioSegment.from_file(self.audio_path)
mono_sound = sound.set_channels(1)
self.mono_audio_path = os.path.join(self.temp_dir, "mono_file.wav")
mono_sound.export(self.mono_audio_path, format="wav")
return self.mono_audio_path
def get_duration(self) -> Annotated[float, "Audio duration in seconds"]:
"""
Get the duration of the audio file.
Returns
-------
float
Duration of the audio in seconds.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> duration = processor.get_duration()
>>> isinstance(duration, float)
True
"""
sound = AudioSegment.from_file(self.audio_path)
return len(sound) / 1000.0
def change_format(
self, new_format: Annotated[str, "New audio format"]
) -> Annotated[str, "Path to converted audio file"]:
"""
Convert the audio file to a new format.
Parameters
----------
new_format : str
Desired format for the output audio file.
Returns
-------
str
Path to the converted audio file.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> converted_path = processor.change_format("mp3")
>>> isinstance(converted_path, str)
True
"""
if not isinstance(new_format, str):
raise TypeError("Expected 'new_format' to be a string.")
sound = AudioSegment.from_file(self.audio_path)
output_path = os.path.join(self.temp_dir, f"converted_file.{new_format}")
sound.export(output_path, format=new_format)
return output_path
def trim_audio(
self, start_time: Annotated[float, "Start time in seconds"],
end_time: Annotated[float, "End time in seconds"]
) -> Annotated[str, "Path to trimmed audio file"]:
"""
        Trim the audio file to the specified time range.
Parameters
----------
start_time : float
Start time in seconds.
end_time : float
End time in seconds.
Returns
-------
str
Path to the trimmed audio file.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> trimmed_path = processor.trim_audio(0.0, 10.0)
>>> isinstance(trimmed_path, str)
True
"""
if not isinstance(start_time, (int, float)):
raise TypeError("Expected 'start_time' to be a float or int.")
if not isinstance(end_time, (int, float)):
raise TypeError("Expected 'end_time' to be a float or int.")
sound = AudioSegment.from_file(self.audio_path)
trimmed_audio = sound[start_time * 1000:end_time * 1000]
trimmed_audio_path = os.path.join(self.temp_dir, "trimmed_file.wav")
trimmed_audio.export(trimmed_audio_path, format="wav")
return trimmed_audio_path
def adjust_volume(
self, change_in_db: Annotated[float, "Volume change in dB"]
) -> Annotated[str, "Path to volume-adjusted audio file"]:
"""
Adjust the volume of the audio file.
Parameters
----------
change_in_db : float
Volume change in decibels.
Returns
-------
str
Path to the volume-adjusted audio file.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> adjusted_path = processor.adjust_volume(5.0)
>>> isinstance(adjusted_path, str)
True
"""
if not isinstance(change_in_db, (int, float)):
raise TypeError("Expected 'change_in_db' to be a float or int.")
sound = AudioSegment.from_file(self.audio_path)
adjusted_audio = sound + change_in_db
adjusted_audio_path = os.path.join(self.temp_dir, "adjusted_volume.wav")
adjusted_audio.export(adjusted_audio_path, format="wav")
return adjusted_audio_path
def get_channels(self) -> Annotated[int, "Number of channels"]:
"""
Get the number of audio channels.
Returns
-------
int
Number of audio channels.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> channels = processor.get_channels()
>>> isinstance(channels, int)
True
"""
sound = AudioSegment.from_file(self.audio_path)
return sound.channels
def fade_in_out(
self, fade_in_duration: Annotated[float, "Fade-in duration in seconds"],
fade_out_duration: Annotated[float, "Fade-out duration in seconds"]
) -> Annotated[str, "Path to faded audio file"]:
"""
Apply fade-in and fade-out effects to the audio file.
Parameters
----------
fade_in_duration : float
Duration of the fade-in effect in seconds.
fade_out_duration : float
Duration of the fade-out effect in seconds.
Returns
-------
str
Path to the faded audio file.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> faded_path = processor.fade_in_out(1.0, 2.0)
>>> isinstance(faded_path, str)
True
"""
if not isinstance(fade_in_duration, (int, float)):
raise TypeError("Expected 'fade_in_duration' to be a float or int.")
if not isinstance(fade_out_duration, (int, float)):
raise TypeError("Expected 'fade_out_duration' to be a float or int.")
sound = AudioSegment.from_file(self.audio_path)
faded_audio = sound.fade_in(fade_in_duration * 1000).fade_out(fade_out_duration * 1000)
faded_audio_path = os.path.join(self.temp_dir, "faded_audio.wav")
faded_audio.export(faded_audio_path, format="wav")
return faded_audio_path
def merge_audio(
self, other_audio_path: Annotated[str, "Path to other audio file"]
) -> Annotated[str, "Path to merged audio file"]:
"""
Merge the current audio file with another audio file.
Parameters
----------
other_audio_path : str
Path to the other audio file.
Returns
-------
str
Path to the merged audio file.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> merged_path = processor.merge_audio("other_example.wav")
>>> isinstance(merged_path, str)
True
"""
if not isinstance(other_audio_path, str):
raise TypeError("Expected 'other_audio_path' to be a string.")
sound1 = AudioSegment.from_file(self.audio_path)
sound2 = AudioSegment.from_file(other_audio_path)
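        # pydub's '+' concatenates the two segments back to back; it does not overlay or mix them.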
merged_audio = sound1 + sound2
merged_audio_path = os.path.join(self.temp_dir, "merged_audio.wav")
merged_audio.export(merged_audio_path, format="wav")
return merged_audio_path
def split_audio(
self, chunk_duration: Annotated[float, "Chunk duration in seconds"]
) -> Annotated[List[str], "Paths to audio chunks"]:
"""
Split the audio file into chunks of the specified duration.
Parameters
----------
chunk_duration : float
Duration of each chunk in seconds.
Returns
-------
List[str]
Paths to the generated audio chunks.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> chunks = processor.split_audio(10.0)
>>> isinstance(chunks, list)
True
"""
if not isinstance(chunk_duration, (int, float)):
raise TypeError("Expected 'chunk_duration' to be a float or int.")
sound = AudioSegment.from_file(self.audio_path)
chunk_paths = []
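        # Walk the audio in windows of chunk_duration (converted to milliseconds);
        # each chunk file is named after its starting offset in seconds.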
for i in range(0, len(sound), int(chunk_duration * 1000)):
chunk = sound[i:i + int(chunk_duration * 1000)]
chunk_path = os.path.join(self.temp_dir, f"chunk_{i // 1000}.wav")
chunk.export(chunk_path, format="wav")
chunk_paths.append(chunk_path)
return chunk_paths
def create_manifest(
self,
manifest_path: Annotated[str, "Manifest file path"]
) -> None:
"""
Create a manifest file containing metadata about the audio file.
Parameters
----------
manifest_path : str
Path to the manifest file.
Examples
--------
>>> processor = AudioProcessor("example.wav")
>>> processor.create_manifest("manifest.json")
"""
duration = self.get_duration()
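        # Single manifest entry in the NeMo diarization style; text, rttm and uem
        # are placeholders because they are unknown at inference time.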
manifest_entry = {
"audio_filepath": self.audio_path,
"offset": 0,
"duration": duration,
"label": "infer",
"text": "-",
"rttm_filepath": None,
"uem_filepath": None
}
with open(manifest_path, 'w', encoding='utf-8') as f: # type: TextIOWrapper
json.dump(manifest_entry, f)
class Transcriber:
"""
A class for transcribing audio files using a pre-trained Whisper model.
Parameters
----------
model_name : str, optional
Name of the model to load. Defaults to 'large-v3'.
device : str, optional
Device to use for model inference ('cpu' or 'cuda'). Defaults to 'cpu'.
compute_type : str, optional
Data type for model computation ('int8', 'float16', etc.). Defaults to 'int8'.
Attributes
----------
model : faster_whisper.WhisperModel
Loaded Whisper model for transcription.
device : str
Device used for inference.
Methods
-------
transcribe(audio_path, language=None, suppress_numerals=False)
Transcribes the audio file into text.
"""
def __init__(
self,
model_name: Annotated[str, "Name of the model to load"] = 'large-v3',
device: Annotated[str, "Device to use for model inference"] = 'cpu',
compute_type: Annotated[str, "Data type for model computation, e.g., 'int8' or 'float16'"] = 'int8'
) -> None:
if not isinstance(model_name, str):
raise TypeError("Expected 'model_name' to be of type str")
if not isinstance(device, str):
raise TypeError("Expected 'device' to be of type str")
if not isinstance(compute_type, str):
raise TypeError("Expected 'compute_type' to be of type str")
self.device = device
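        # Load the CTranslate2-backed Whisper model once; compute_type (e.g. 'int8'
        # on CPU, 'float16' on GPU) trades precision for speed and memory.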
self.model = faster_whisper.WhisperModel(
model_name, device=device, compute_type=compute_type
)
def transcribe(
self,
audio_path: Annotated[str, "Path to the audio file to transcribe"],
language: Annotated[Optional[str], "Language code for transcription, e.g., 'en' for English"] = None,
suppress_numerals: Annotated[bool, "Whether to suppress numerals in the transcription"] = False
) -> Annotated[Tuple[str, dict], "Transcription text and additional information"]:
"""
Transcribe an audio file into text.
Parameters
----------
audio_path : str
Path to the audio file.
language : str, optional
Language code for transcription (e.g., 'en' for English).
suppress_numerals : bool, optional
Whether to suppress numerals in the transcription. Defaults to False.
Returns
-------
Tuple[str, dict]
The transcribed text and additional transcription metadata.
Examples
--------
>>> transcriber = Transcriber()
        >>> text, info = transcriber.transcribe("example.wav")
>>> isinstance(text, str)
True
>>> isinstance(info, dict)
True
"""
if not isinstance(audio_path, str):
raise TypeError("Expected 'audio_path' to be of type str")
if language is not None and not isinstance(language, str):
raise TypeError("Expected 'language' to be of type str if provided")
if not isinstance(suppress_numerals, bool):
raise TypeError("Expected 'suppress_numerals' to be of type bool")
audio_waveform = faster_whisper.decode_audio(audio_path)
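        # [-1] keeps faster-whisper's default suppression list; when numerals are
        # suppressed, tokens containing digits or number symbols are collected from
        # the tokenizer so that numbers come out spelled as words.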
suppress_tokens = [-1]
if suppress_numerals:
suppress_tokens = TokenizerUtils.find_numeral_symbol_tokens(
self.model.hf_tokenizer
)
transcript_segments, info = self.model.transcribe(
audio_waveform,
language=language,
suppress_tokens=suppress_tokens,
without_timestamps=True,
vad_filter=True,
log_progress=True,
)
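        # Segments are yielded lazily by faster-whisper; joining their text here
        # materialises the full transcript in one pass.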
transcript = ''.join(segment.text for segment in transcript_segments)
        # TranscriptionInfo may be a NamedTuple (which has no __dict__, so vars()
        # would fail); convert it to a plain dict either way.
        info = info._asdict() if hasattr(info, "_asdict") else vars(info)
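        # Release GPU memory once transcription is done; note that the model is
        # deleted, so this Transcriber instance cannot transcribe again afterwards.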
if self.device == 'cuda':
del self.model
torch.cuda.empty_cache()
print(transcript, info)
return transcript, info
class PunctuationRestorer:
"""
A class for restoring punctuation in transcribed text.
Parameters
----------
language : str, optional
Language for punctuation restoration. Defaults to 'en'.
Attributes
----------
language : str
Language used for punctuation restoration.
punct_model : PunctuationModel
Model for predicting punctuation.
supported_languages : List[str]
List of languages supported by the model.
Methods
-------
restore_punctuation(word_speaker_mapping)
Restores punctuation in the provided text based on word mappings.
"""
def __init__(self, language: Annotated[str, "Language for punctuation restoration"] = 'en') -> None:
self.language = language
self.punct_model = PunctuationModel(model="kredor/punctuate-all")
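        # 'kredor/punctuate-all' is a multilingual punctuation model; the list
        # below mirrors the languages it supports.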
self.supported_languages = [
"en", "fr", "de", "es", "it", "nl", "pt", "bg", "pl", "cs", "sk", "sl",
]
def restore_punctuation(
self, word_speaker_mapping: Annotated[List[Dict], "List of word-speaker mappings"]
) -> Annotated[List[Dict], "Word mappings with restored punctuation"]:
"""
Restore punctuation for transcribed text.
Parameters
----------
word_speaker_mapping : List[Dict]
List of dictionaries containing word and speaker mappings.
Returns
-------
List[Dict]
Updated list with punctuation restored.
Examples
--------
>>> restorer = PunctuationRestorer()
>>> mapping = [{"text": "hello"}, {"text": "world"}]
>>> result = restorer.restore_punctuation(mapping)
>>> isinstance(result, list)
True
>>> "text" in result[0]
True
"""
if self.language not in self.supported_languages:
print(f"Punctuation restoration is not available for {self.language} language.")
return word_speaker_mapping
words_list = [word_dict["text"] for word_dict in word_speaker_mapping]
labeled_words = self.punct_model.predict(words_list)
ending_puncts = ".?!"
model_puncts = ".,;:!?"
        def is_acronym(token: str) -> bool:
            return re.fullmatch(r"\b(?:[a-zA-Z]\.){2,}", token) is not None
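        # Append the predicted sentence-ending punctuation only when the word does
        # not already end in punctuation; acronyms such as "U.S." still receive it,
        # and any trailing ".." produced this way is stripped afterwards.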
for word_dict, labeled_tuple in zip(word_speaker_mapping, labeled_words):
word = word_dict["text"]
if (
word
and labeled_tuple[1] in ending_puncts
and (word[-1] not in model_puncts or is_acronym(word))
):
word += labeled_tuple[1]
word = word.rstrip(".") if word.endswith("..") else word
word_dict["text"] = word
return word_speaker_mapping
if __name__ == "__main__":
sample_audio_path = "sample_audio.wav"
audio_processor_instance = AudioProcessor(sample_audio_path)
mono_audio_path = audio_processor_instance.convert_to_mono()
print(f"Mono audio file saved at: {mono_audio_path}")
audio_duration = audio_processor_instance.get_duration()
print(f"Audio duration: {audio_duration} seconds")
converted_audio_path = audio_processor_instance.change_format("mp3")
print(f"Converted audio file saved at: {converted_audio_path}")
audio_path_trimmed = audio_processor_instance.trim_audio(0.0, 10.0)
print(f"Trimmed audio file saved at: {audio_path_trimmed}")
volume_adjusted_audio_path = audio_processor_instance.adjust_volume(5.0)
print(f"Volume adjusted audio file saved at: {volume_adjusted_audio_path}")
additional_audio_path = "additional_audio.wav"
merged_audio_output_path = audio_processor_instance.merge_audio(additional_audio_path)
print(f"Merged audio file saved at: {merged_audio_output_path}")
audio_chunk_paths = audio_processor_instance.split_audio(10.0)
print(f"Audio chunks saved at: {audio_chunk_paths}")
output_manifest_path = "output_manifest.json"
audio_processor_instance.create_manifest(output_manifest_path)
print(f"Manifest file saved at: {output_manifest_path}")
transcriber_instance = Transcriber()
transcribed_text_output, transcription_metadata = transcriber_instance.transcribe(sample_audio_path)
print(f"Transcribed Text: {transcribed_text_output}")
print(f"Transcription Info: {transcription_metadata}")
word_mapping_example = [
{"text": "hello"},
{"text": "world"},
{"text": "this"},
{"text": "is"},
{"text": "a"},
{"text": "test"}
]
punctuation_restorer_instance = PunctuationRestorer()
punctuation_restored_mapping = punctuation_restorer_instance.restore_punctuation(word_mapping_example)
print(f"Restored Mapping: {punctuation_restored_mapping}")