import os
from functools import lru_cache

import librosa
import numpy as np
import soundfile as sf
import torch
from huggingface_hub import login
from pydub import AudioSegment
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

key = os.environ['access_token']
login(token=key)


@lru_cache(maxsize=None)
def load_model():
    """Load the tokenizers, models and ASR pipeline once and cache them."""
    tokenizer = AutoTokenizer.from_pretrained(
        "Zelyanoth/my_fon_translation_model", token=True, src_lang="fon_Latn"
    )
    model = AutoModelForSeq2SeqLM.from_pretrained("Zelyanoth/my_fon_translation_model")
    tokenizer_tr = AutoTokenizer.from_pretrained("google/flan-t5-xl")
    model_tr = AutoModelForSeq2SeqLM.from_pretrained(
        "google/flan-t5-xl", torch_dtype=torch.float16
    )
    transcription_pipeline = pipeline(
        "automatic-speech-recognition", model="Zelyanoth/wav2vec2-bert-fon-colab"
    )
    return tokenizer_tr, model_tr, transcription_pipeline, tokenizer, model


tokenizer_tr, model_tr, transcription_pipeline, tokenizer, model = load_model()


class MONDJEMIN_AI(object):
    """AI component used in the project."""

    def __init__(self, audio):
        self.audio = audio

    def formater(self, nom_fichier_wav):
        """Convert the input file to WAV and split it into segments at silences
        so the transcriber can handle it."""
        formats = os.path.splitext(self.audio)[1][1:]
        audio = AudioSegment.from_file(self.audio, format=formats)
        audio.export(nom_fichier_wav, format="wav")

        y, sr = librosa.load(nom_fichier_wav, sr=16000)

        # Frame-wise energy, used to locate low-energy (silent) regions.
        frame_length = 400
        hop_length = 160
        energy = np.array([
            sum(abs(y[i:i + frame_length] ** 2))
            for i in range(0, len(y), hop_length)
        ])

        # Frames below the 10th energy percentile are treated as silence.
        fenetre = np.percentile(energy, 10)
        silence_indices = np.where(energy < fenetre)[0]
        silence_times = librosa.frames_to_time(silence_indices, sr=sr, hop_length=hop_length)

        # Merge silent frames separated by less than min_silence_duration into intervals.
        min_silence_duration = 0.5
        merged_silences = []
        current_silence_start = silence_times[0]
        for i in range(1, len(silence_times)):
            if silence_times[i] - silence_times[i - 1] > min_silence_duration:
                current_silence_end = silence_times[i - 1]
                merged_silences.append((current_silence_start, current_silence_end))
                current_silence_start = silence_times[i]
        merged_silences.append((current_silence_start, silence_times[-1]))

        # Cut the signal at the end of each silence interval.
        segments_boundaries = [0] + [end for start, end in merged_silences] + [len(y) / sr]

        segments = []
        min_segment_duration = 1.0
        for i in range(len(segments_boundaries) - 1):
            start_sample = int(segments_boundaries[i] * sr)
            end_sample = int(segments_boundaries[i + 1] * sr)
            segment = y[start_sample:end_sample]
            segment_duration = (end_sample - start_sample) / sr
            if segment_duration >= min_segment_duration:
                segments.append(segment)
                output_filename = f'segment_{i + 1}.wav'
                sf.write(output_filename, segment, sr)

        # Fall back to the whole recording if no segment was long enough.
        if len(segments) == 0:
            output_filename = 'segment_1.wav'
            sf.write(output_filename, y, sr)
            segments.append(y)

        return segments

        # # Listen to each segment
        # for i, segment in enumerate(segments):
        #     print(f'Listen to segment {i+1}')
        #     display(Audio(segment, rate=sr))

        # Legacy fixed-duration splitting code, kept for reference:
        # chemin_dossier = "./audio/"
        # if duree_audio_ms > duree_segment_ms:
        #     segments_audio = []
        #     debut_segment = 0
        #     fin_segment = duree_segment_ms
        #     while debut_segment < duree_audio_ms:
        #         segment = audio[debut_segment:fin_segment]
        #         nombre_dossiers = 0
        #         if not os.path.exists(chemin_dossier):
        #             os.makedirs(chemin_dossier)
        #         for element in os.listdir(chemin_dossier):
        #             # Check whether the item is a directory
        #             if os.path.isdir(os.path.join(chemin_dossier, element)):
        #                 # Increment the counter
        #                 nombre_dossiers += 1
        #         folder = "./audio/split" + str(nombre_dossiers)
        #         os.makedirs(folder)
        #         outputfile = folder + "/" + nom_fichier_wav + str(fin_segment)
        #         segment.export(outputfile, format="wav")
        #         segments_audio.append(outputfile)
        #         # Update the indices for the next segment
        #         debut_segment = fin_segment
        #         fin_segment += duree_segment_ms
        #     return segments_audio

    def transcriptor(self):
        """Transcribe the Fon audio, segment by segment."""
        global transcription_pipeline
        audio = self.formater("test.wav")
        if isinstance(audio, list):
            trans = []
            for a in audio:
                transcription = transcription_pipeline(a)
                trans.append(transcription['text'])
            transcr = ", ".join(trans)
            return transcr
        else:
            transcription = transcription_pipeline(audio)
            return transcription['text']

    def translate(self, inputt):
        """Translate each Fon sentence in `inputt` into French."""
        lop = []
        for a in inputt:
            inputs = tokenizer(a, return_tensors="pt")
            translated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"],
            )
            lop.append(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
        return ' \n'.join(lop)
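

# Usage sketch (assumption: not part of the original module). It shows one way the
# class above might be driven end to end: build MONDJEMIN_AI from an audio file
# path, transcribe the Fon speech segment by segment, then translate the segments
# into French. The file name "sample.ogg" is purely illustrative.
if __name__ == "__main__":
    ai = MONDJEMIN_AI("sample.ogg")
    # transcriptor() converts the file to WAV, splits it on silences and
    # transcribes each segment, returning one comma-separated string.
    fon_text = ai.transcriptor()
    print("Fon transcription:", fon_text)
    # translate() expects an iterable of sentences; splitting on the separator
    # used by transcriptor() keeps one sentence per segment.
    french_text = ai.translate(fon_text.split(", "))
    print("French translation:", french_text)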