Spaces:

vankienemk
/

Voice-regconizer

Running

App Files Community

vankienemk committed on Apr 16

Commit

0c2f2f9

verified ·

1 Parent(s): 15eda7f

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -31

app.py CHANGED Viewed

@@ -1,39 +1,58 @@
 import gradio as gr
 import torch
 import numpy as np
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-from scipy.signal import resample
-# Load the processor and the CTC model from the same checkpoint id.
-# NOTE(review): the checkpoint id contains "whisper" while the loader classes
-# are Wav2Vec2 ones; confirm the checkpoint is compatible with them.
-processor = Wav2Vec2Processor.from_pretrained("Menlo/Ichigo-whisper-v0.1")
-model = Wav2Vec2ForCTC.from_pretrained("Menlo/Ichigo-whisper-v0.1")
-# Replace the model with its torch.compile-wrapped version.
-model = torch.compile(model)
-def transcribe(audio):
-    """Transcribe one recorded clip and return the decoded text.
-
-    Args:
-        audio: ``(sample_rate, samples)`` pair, or None when nothing was
-            recorded.
-
-    Returns:
-        The decoded transcription, or a fixed message when ``audio`` is None.
-    """
-    if audio is None:
-        return "Không có âm thanh."
-    sample_rate, audio_data = audio
-    target_rate = 16000
-    # Resample to 16 kHz when the recording uses a different sample rate.
-    if sample_rate != target_rate:
-        duration = len(audio_data) / sample_rate
-        new_length = int(duration * target_rate)
-        audio_data = resample(audio_data, new_length)
-    # Run inference without gradient tracking, take the argmax over the last
-    # logits axis, and decode the first sequence of predicted ids.
-    inputs = processor(audio_data, sampling_rate=target_rate, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.decode(predicted_ids[0])
-    return transcription
-# Gradio UI: microphone recording in, transcription text out. The interface
-# is built and launched as a module-level statement.
-# NOTE(review): the description string names "FPTAI/wav2vec2-base", which is
-# not the checkpoint id loaded in this file; confirm which one is intended.
-gr.Interface(
-    fn=transcribe,
-    inputs=gr.Audio(sources=["microphone"], type="numpy", label="Ghi âm từ micro (16kHz mono)"),
-    outputs="text",
-    title="STT Tiếng Việt với Wav2Vec2",
-    description="Ghi âm và nhận dạng giọng nói tiếng Việt bằng mô hình FPTAI/wav2vec2-base"
-).launch()

 import gradio as gr
 import torch
+import torchaudio
+from transformers import pipeline
 import numpy as np
+# Tải mô hình Ichigo-whisper
+model_id = "Menlo/Ichigo-whisper-v0.1"
+transcriber = pipeline("automatic-speech-recognition", model=model_id)
+def transcribe_stream(stream, new_chunk):
+    # Trích xuất sample rate và dữ liệu âm thanh
+    sr, y = new_chunk
+    # Chuyển về mono nếu là stereo
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+    # Chuẩn hóa âm thanh
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y)) if np.max(np.abs(y)) > 0 else 1.0
+    # Nối với audio trước đó
+    if stream is not None:
+        stream = np.concatenate([stream, y])
+    else:
+        stream = y
+    # Dự đoán kết quả
+    result = transcriber({"sampling_rate": sr, "raw": stream})
+    return stream, result["text"]
+# Tạo giao diện Gradio
+title = "Ichigo Whisper Streaming Demo"
+description = """
+# 🍓 Ichigo Whisper Streaming Recognition
+Nhận dạng giọng nói theo thời gian thực với mô hình Menlo/Ichigo-whisper-v0.1.
+"""
+# Tạo giao diện streaming
+streaming_demo = gr.Interface(
+    fn=transcribe_stream,
+    inputs=[
+        "state",
+        gr.Audio(sources=["microphone"], streaming=True)
+    ],
+    outputs=[
+        "state",
+        gr.Textbox(label="Phiên âm theo thời gian thực")
+    ],
+    live=True,
+    title=title,
+    description=description
+)
+# Khởi chạy ứng dụng
+if __name__ == "__main__":
+    streaming_demo.launch()