import os
from functools import lru_cache

import numpy as np
import librosa
import soundfile as sf
import torch
from pydub import AudioSegment
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the Space's `access_token` secret.
key = os.environ['access_token']
login(token=key)
@lru_cache(maxsize=None)
def load_model():
    """Load the Fon ASR pipeline and the translation models once and cache them."""
    tokenizer = AutoTokenizer.from_pretrained(
        "Zelyanoth/my_fon_translation_model", token=True, src_lang="fon_Latn"
    )
    model = AutoModelForSeq2SeqLM.from_pretrained("Zelyanoth/my_fon_translation_model")
    tokenizer_tr = AutoTokenizer.from_pretrained("google/flan-t5-xl")
    model_tr = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl", torch_dtype=torch.float16)
    transcription_pipeline = pipeline(
        "automatic-speech-recognition", model="Zelyanoth/wav2vec2-bert-fon-colab"
    )
    return tokenizer_tr, model_tr, transcription_pipeline, tokenizer, model


tokenizer_tr, model_tr, transcription_pipeline, tokenizer, model = load_model()
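# Note: because load_model is wrapped in lru_cache(maxsize=None), any later call
# returns the already-loaded objects instead of reloading the weights; for example,
# load_model()[4] is model evaluates to True after the assignment above.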
class MONDJEMIN_AI(object):
    """The AI component used in the project."""

    def __init__(self, audio):
        self.audio = audio
    def formater(self, nom_fichier_wav):
        "Convert the input audio to WAV, then split it at detected silences so the transcriber can handle it."
        formats = os.path.splitext(self.audio)[1][1:]
        audio = AudioSegment.from_file(self.audio, format=formats)
        audio.export(nom_fichier_wav, format="wav")

        # Resample to 16 kHz, the rate expected by the ASR model.
        y, sr = librosa.load(nom_fichier_wav, sr=16000)
        frame_length = 400
        hop_length = 160

        # Short-time energy, one value per hop.
        energy = np.array([
            np.sum(np.abs(y[i:i + frame_length]) ** 2)
            for i in range(0, len(y), hop_length)
        ])

        # Frames whose energy falls below the 10th percentile are treated as silence.
        fenetre = np.percentile(energy, 10)
        silence_indices = np.where(energy < fenetre)[0]
        silence_times = librosa.frames_to_time(silence_indices, sr=sr, hop_length=hop_length)

        # Merge silent frames that are less than min_silence_duration apart.
        min_silence_duration = 0.5
        merged_silences = []
        if len(silence_times) > 0:
            current_silence_start = silence_times[0]
            for i in range(1, len(silence_times)):
                if silence_times[i] - silence_times[i - 1] > min_silence_duration:
                    current_silence_end = silence_times[i - 1]
                    merged_silences.append((current_silence_start, current_silence_end))
                    current_silence_start = silence_times[i]
            merged_silences.append((current_silence_start, silence_times[-1]))

        # Cut the signal at the end of each silence region and keep segments that
        # last at least min_segment_duration seconds.
        segments_boundaries = [0] + [end for start, end in merged_silences] + [len(y) / sr]
        segments = []
        min_segment_duration = 1.0
        for i in range(len(segments_boundaries) - 1):
            start_sample = int(segments_boundaries[i] * sr)
            end_sample = int(segments_boundaries[i + 1] * sr)
            segment = y[start_sample:end_sample]
            segment_duration = (end_sample - start_sample) / sr
            if segment_duration >= min_segment_duration:
                segments.append(segment)
                output_filename = f'segment_{i + 1}.wav'
                sf.write(output_filename, segment, sr)

        # Fall back to the whole recording if no segment was long enough.
        if len(segments) == 0:
            sf.write('segment_1.wav', y, sr)
            segments.append(y)
        return segments
    # # Allow listening to each segment
    # for i, segment in enumerate(segments):
    #     print(f'Listen to segment {i+1}')
    #     display(Audio(segment, rate=sr))

    # chemin_dossier = "./audio/"
    # if duree_audio_ms > duree_segment_ms:
    #     segments_audio = []
    #     debut_segment = 0
    #     fin_segment = duree_segment_ms
    #     while debut_segment < duree_audio_ms:
    #         segment = audio[debut_segment:fin_segment]
    #         nombre_dossiers = 0
    #         if not os.path.exists(chemin_dossier):
    #             os.makedirs(chemin_dossier)
    #         for element in os.listdir(chemin_dossier):
    #             # Check whether the item is a directory
    #             if os.path.isdir(os.path.join(chemin_dossier, element)):
    #                 # Increment the counter
    #                 nombre_dossiers += 1
    #         folder = "./audio/split" + str(nombre_dossiers)
    #         os.makedirs(folder)
    #         outputfile = folder + "/" + nom_fichier_wav + str(fin_segment)
    #         segment.export(outputfile, format="wav")
    #         segments_audio.append(outputfile)
    #         # Update the indices for the next segment
    #         debut_segment = fin_segment
    #         fin_segment += duree_segment_ms
    #     return segments_audio
    def transcriptor(self):
        "Transcribe the audio into Fon text."
        global transcription_pipeline
        audio = self.formater("test.wav")
        if isinstance(audio, list):
            trans = []
            for a in audio:
                transcription = transcription_pipeline(a)
                trans.append(transcription['text'])
            transcr = ", ".join(trans)
            return transcr
        else:
            transcription = transcription_pipeline(audio)
            return transcription['text']
    def translate(self, inputt):
        "Translate each Fon sentence in `inputt` into French."
        lop = []
        for a in inputt:
            inputs = tokenizer(a, return_tensors="pt")
            translated_tokens = model.generate(
                **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"],
            )
            lop.append(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
        return ' \n'.join(lop)
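
# A minimal usage sketch (not part of the original module; "sample.wav" is a
# hypothetical input file you would supply yourself):
if __name__ == "__main__":
    ai = MONDJEMIN_AI("sample.wav")
    fon_text = ai.transcriptor()                      # Fon transcription, segments joined by ", "
    french_text = ai.translate(fon_text.split(", "))  # translate each segment into French
    print(french_text)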