# Mondjemin_gradio / ai_functions.py
import os
from functools import lru_cache

import librosa
import numpy as np
import soundfile as sf
import torch
from huggingface_hub import login
from pydub import AudioSegment
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
# Authenticate with the Hugging Face Hub using the Space's access token secret.
key = os.environ["access_token"]
login(token=key)


@lru_cache(maxsize=None)
def load_model():
    """Load and cache the translation and transcription models."""
    # Fon -> French translation model and its tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("Zelyanoth/my_fon_translation_model", token=True, src_lang="fon_Latn")
    model = AutoModelForSeq2SeqLM.from_pretrained("Zelyanoth/my_fon_translation_model")
    # General-purpose text-to-text model, loaded in half precision.
    tokenizer_tr = AutoTokenizer.from_pretrained("google/flan-t5-xl")
    model_tr = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl", torch_dtype=torch.float16)
    # Speech-to-text pipeline for Fon audio.
    transcription_pipeline = pipeline("automatic-speech-recognition", model="Zelyanoth/wav2vec2-bert-fon-colab")
    return tokenizer_tr, model_tr, transcription_pipeline, tokenizer, model


tokenizer_tr, model_tr, transcription_pipeline, tokenizer, model = load_model()


class MONDJEMIN_AI(object):
    """AI component of the project: audio segmentation, transcription and translation."""

    def __init__(self, audio):
        self.audio = audio
    def formater(self, nom_fichier_wav):
        """Convert the input audio to WAV and split it into segments at silences."""
        formats = os.path.splitext(self.audio)[1][1:]
        audio = AudioSegment.from_file(self.audio, format=formats)
        audio.export(nom_fichier_wav, format="wav")
        # Load the exported WAV resampled to 16 kHz for the ASR model.
        y, sr = librosa.load(nom_fichier_wav, sr=16000)
        # Frame-level energy (25 ms windows, 10 ms hop at 16 kHz).
        frame_length = 400
        hop_length = 160
        energy = np.array([
            np.sum(np.abs(y[i:i + frame_length]) ** 2)
            for i in range(0, len(y), hop_length)
        ])
        # Frames whose energy falls below the 10th percentile are treated as silence.
        fenetre = np.percentile(energy, 10)
        silence_indices = np.where(energy < fenetre)[0]
        silence_times = librosa.frames_to_time(silence_indices, sr=sr, hop_length=hop_length)
        # Merge consecutive silent frames into silence intervals.
        min_silence_duration = 0.5
        merged_silences = []
        if len(silence_times) > 0:
            current_silence_start = silence_times[0]
            for i in range(1, len(silence_times)):
                if silence_times[i] - silence_times[i - 1] > min_silence_duration:
                    current_silence_end = silence_times[i - 1]
                    merged_silences.append((current_silence_start, current_silence_end))
                    current_silence_start = silence_times[i]
            merged_silences.append((current_silence_start, silence_times[-1]))
        # Segment boundaries: start of file, end of each silence, end of file.
        segments_boundaries = [0] + [end for start, end in merged_silences] + [len(y) / sr]
        # Keep only segments longer than one second; write each one to disk.
        segments = []
        min_segment_duration = 1.0
        for i in range(len(segments_boundaries) - 1):
            start_sample = int(segments_boundaries[i] * sr)
            end_sample = int(segments_boundaries[i + 1] * sr)
            segment = y[start_sample:end_sample]
            segment_duration = (end_sample - start_sample) / sr
            if segment_duration >= min_segment_duration:
                segments.append(segment)
                output_filename = f"segment_{i + 1}.wav"
                sf.write(output_filename, segment, sr)
        # Fall back to the whole recording if no segment was long enough.
        if len(segments) == 0:
            output_filename = "segment_1.wav"
            sf.write(output_filename, y, sr)
            segments.append(y)
        return segments
    def transcriptor(self):
        """Transcribe the Fon audio, segment by segment."""
        global transcription_pipeline
        audio = self.formater("test.wav")
        if isinstance(audio, list):
            trans = []
            for a in audio:
                transcription = transcription_pipeline(a)
                trans.append(transcription["text"])
            return ", ".join(trans)
        else:
            transcription = transcription_pipeline(audio)
            return transcription["text"]
    def translate(self, inputt):
        """Translate an iterable of Fon sentences into French, one line per sentence."""
        lop = []
        for a in inputt:
            inputs = tokenizer(a, return_tensors="pt")
            translated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"],
            )
            lop.append(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
        return " \n".join(lop)
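

# A minimal usage sketch, not part of the original module: "sample.wav" is a
# hypothetical input path, and splitting on ", " mirrors how transcriptor()
# joins the per-segment texts. Guarded so it only runs as a script and not
# when the Gradio app imports this module.
if __name__ == "__main__":
    mondjemin = MONDJEMIN_AI("sample.wav")  # hypothetical audio file
    fon_text = mondjemin.transcriptor()  # Fon transcription, segments joined with ", "
    french_text = mondjemin.translate(fon_text.split(", "))  # one French line per segment
    print(fon_text)
    print(french_text)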