import os
from functools import lru_cache

import librosa
import numpy as np
import soundfile as sf
import torch
from huggingface_hub import login
from pydub import AudioSegment
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline


# Authenticate with the Hugging Face Hub using the token stored in the environment.
key = os.environ['access_token']
login(token=key)


@lru_cache(maxsize=None)
def load_model():
    """Load and cache the translation models and the Fon speech-recognition pipeline."""
    tokenizer = AutoTokenizer.from_pretrained("Zelyanoth/my_fon_translation_model", token=True, src_lang="fon_Latn")
    model = AutoModelForSeq2SeqLM.from_pretrained("Zelyanoth/my_fon_translation_model")
    tokenizer_tr = AutoTokenizer.from_pretrained("google/flan-t5-xl")
    model_tr = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl", torch_dtype=torch.float16)
    transcription_pipeline = pipeline("automatic-speech-recognition", model="Zelyanoth/wav2vec2-bert-fon-colab")
    return tokenizer_tr, model_tr, transcription_pipeline, tokenizer, model


tokenizer_tr, model_tr, transcription_pipeline, tokenizer, model = load_model()

class MONDJEMIN_AI(object):
    """AI layer of the project: audio formatting, Fon transcription and translation."""

    def __init__(self, audio):
        self.audio = audio

    def formater(self, nom_fichier_wav):
        """Convert the input audio to WAV and split it into segments at detected silences,
        so the transcriber can process it chunk by chunk."""

        # Convert whatever format was uploaded into a 16 kHz WAV file.
        formats = os.path.splitext(self.audio)[1][1:]
        audio = AudioSegment.from_file(self.audio, format=formats)
        audio.export(nom_fichier_wav, format="wav")
        y, sr = librosa.load(nom_fichier_wav, sr=16000)

        frame_length = 400   # 25 ms analysis window at 16 kHz
        hop_length = 160     # 10 ms hop at 16 kHz

        # Short-time energy per frame; frames below the 10th percentile are treated as silence.
        energy = np.array([
            np.sum(y[i:i + frame_length] ** 2)
            for i in range(0, len(y), hop_length)
        ])

        fenetre = np.percentile(energy, 10)
        silence_indices = np.where(energy < fenetre)[0]

        silence_times = librosa.frames_to_time(silence_indices, sr=sr, hop_length=hop_length)

        # Merge consecutive silent frames into silence intervals; two silent frames more than
        # min_silence_duration apart close the current interval and start a new one.
        min_silence_duration = 0.5
        merged_silences = []
        if len(silence_times) > 0:
            current_silence_start = silence_times[0]

            for i in range(1, len(silence_times)):
                if silence_times[i] - silence_times[i - 1] > min_silence_duration:
                    current_silence_end = silence_times[i - 1]
                    merged_silences.append((current_silence_start, current_silence_end))
                    current_silence_start = silence_times[i]

            merged_silences.append((current_silence_start, silence_times[-1]))

        # Cut points: start of the file, the end of every silence interval, and the end of the file.
        segments_boundaries = [0] + [end for start, end in merged_silences] + [len(y) / sr]


        # Cut the signal at the boundaries and keep only segments of at least one second.
        segments = []
        min_segment_duration = 1.0
        for i in range(len(segments_boundaries) - 1):
            start_sample = int(segments_boundaries[i] * sr)
            end_sample = int(segments_boundaries[i + 1] * sr)
            segment = y[start_sample:end_sample]
            segment_duration = (end_sample - start_sample) / sr
            if segment_duration >= min_segment_duration:
                segments.append(segment)
                output_filename = f'segment_{i+1}.wav'
                sf.write(output_filename, segment, sr)

        # Fallback: if no segment was long enough, return the whole recording as a single segment.
        if len(segments) == 0:
            output_filename = 'segment_1.wav'
            sf.write(output_filename, y, sr)
            segments.append(y)

        return segments
            

            # # Allow listening to each segment
            # for i, segment in enumerate(segments):
            #     print(f'Listen to segment {i+1}')
            #     display(Audio(segment, rate=sr))

            # chemin_dossier = "./audio/"
            # if duree_audio_ms > duree_segment_ms :
            #     segments_audio = []
            #     debut_segment = 0
            #     fin_segment = duree_segment_ms
            #     while debut_segment < duree_audio_ms:
            #         segment = audio[debut_segment:fin_segment]
            #         nombre_dossiers = 0

            #         if not os.path.exists(chemin_dossier):
            #             os.makedirs(chemin_dossier)
            #         for element in os.listdir(chemin_dossier):
            #             # Check whether the entry is a directory
            #             if os.path.isdir(os.path.join(chemin_dossier, element)):
            #                 # Increment the counter
            #                 nombre_dossiers += 1
            #         folder = "./audio/split"+str(nombre_dossiers)
            #         os.makedirs(folder)
            #         outputfile = folder+"/"+nom_fichier_wav+ str(fin_segment)
            #         segment.export(outputfile , format="wav")
            #         segments_audio.append(outputfile )

            #         # Update the indices for the next segment
            #         debut_segment = fin_segment
            #         fin_segment += duree_segment_ms

            #     return segments_audio
    
    def transcriptor(self):
        """Transcribe the Fon audio segment by segment and join the partial transcripts."""
        audio = self.formater("test.wav")
        if isinstance(audio, list):
            trans = []
            for a in audio:
                # Each segment is a 16 kHz numpy array, which the ASR pipeline accepts directly.
                transcription = transcription_pipeline(a)
                trans.append(transcription['text'])
            transcr = ", ".join(trans)
            return transcr
        else:
            transcription = transcription_pipeline(audio)
            return transcription['text']
        
    def translate(self, inputt):
        """Translate an iterable of Fon sentences into French, one sentence per output line."""
        lop = []
        for a in inputt:
            inputs = tokenizer(a, return_tensors="pt")

            # Force the decoder to start with the French language code so the
            # NLLB-style model generates French output.
            translated_tokens = model.generate(
                **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"],
            )
            lop.append(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
        return ' \n'.join(lop)
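

# Minimal usage sketch (not part of the deployed app): assumes a local recording such as
# "example.mp3" exists and the `access_token` environment variable holds a valid
# Hugging Face token. The transcript joins segments with ", ", so splitting on ", "
# recovers per-segment sentences to feed into translate(); adapt as needed.
if __name__ == "__main__":
    ai = MONDJEMIN_AI("example.mp3")
    fon_text = ai.transcriptor()                        # Fon transcript of the recording
    french_text = ai.translate(fon_text.split(", "))    # French translation, one line per segment
    print(fon_text)
    print(french_text)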