MakiAi committed on
Commit
edf2658
·
1 Parent(s): 4cd874e

[feat] 音声認識アプリの実装

Browse files

- Gradioを使用したWeb UIの実装
- Whisperモデルを使用した音声認識機能の実装
- Dockerfileとdocker-compose.ymlの作成
- GPUを使用する場合のDockerfile.gpuとdocker-compose.gpu.ymlの作成
- 必要なライブラリをrequirements.txtに記載

Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# CPU image for the speech-to-text app, based on the official Python 3.11 image.
FROM python:3.11

WORKDIR /app

# Install ffmpeg (presumably needed to decode uploaded audio — confirm) and
# drop the apt package lists in the same layer so they do not stay in the image.
RUN apt-get update && \
    apt-get install -y ffmpeg && \
    rm -rf /var/lib/apt/lists/*

# Install the Python dependencies listed in requirements.txt without keeping
# pip's download cache.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
app.py CHANGED
@@ -1,15 +1,51 @@
1
- import streamlit as st
 
 
 
2
 
 
 
 
 
 
 
3
 
4
- def load_markdown(file_path):
5
- with open(file_path, encoding="utf8") as f:
6
- return f.read()
 
 
 
 
 
7
 
8
- def display_front_page():
9
- html_front = load_markdown('docs/page_front.md')
10
- st.markdown(f"{html_front}", unsafe_allow_html=True)
 
11
 
12
- if __name__ == "__main__":
13
- display_front_page()
14
- x = st.slider('Select a value')
15
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import pipeline
4
+ import librosa
5
 
6
# Model configuration
model_id = "kotoba-tech/kotoba-whisper-v1.0"

# With CUDA available: half precision, first GPU, SDPA attention.
# Otherwise: float32 on the CPU with no extra model kwargs.
if torch.cuda.is_available():
    torch_dtype = torch.float16
    device = "cuda:0"
    model_kwargs = {"attn_implementation": "sdpa"}
else:
    torch_dtype = torch.float32
    device = "cpu"
    model_kwargs = {}

# Generation options: transcribe (not translate) Japanese speech.
generate_kwargs = {"language": "japanese", "task": "transcribe"}

# Load the speech-recognition pipeline once, at module import time.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    torch_dtype=torch_dtype,
    device=device,
    model_kwargs=model_kwargs,
)
21
 
22
# Transcription function
def transcribe(audio_file):
    """Transcribe an audio file to text with the module-level ASR pipeline.

    Args:
        audio_file: Path to the audio file to transcribe. ``None`` or an
            empty path means no audio was provided.

    Returns:
        The recognized text, or an empty string when no audio was provided.
    """
    # Submitting the form without audio yields None; return early instead of
    # letting librosa fail on a missing path.
    if not audio_file:
        return ""

    # Decode and resample to 16 kHz in a single call. The raw array handed to
    # the pipeline carries no sampling-rate information, so it must already
    # be at the rate the model expects.
    target_sr = 16000
    audio, _ = librosa.load(audio_file, sr=target_sr)

    # Run inference
    result = pipe(audio, generate_kwargs=generate_kwargs)

    return result["text"]
35
+
36
# Text shown below the title in the web UI. It describes what this app does:
# speech-to-text with the kotoba-whisper model configured in this file.
description = """
Upload an audio file to transcribe Japanese speech to text with the kotoba-whisper-v1.0 model.
"""

# Gradio interface definition: one audio input (passed to `transcribe` as a
# file path) and one text output.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Audio (MP3 or MP4)"),
    outputs="text",
    title="Speech-to-Text App",
    description=description,
    theme=gr.themes.Soft(),
)

# Launch the app on all interfaces, port 7860.
# NOTE(review): share=True additionally requests a public Gradio share link —
# confirm this is intended for a container that already publishes port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860, share=True)
docker-compose.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Compose file for the CPU build of the speech-to-text app.
version: '3'

services:
  app:
    # Build the image from the Dockerfile in this directory.
    build: .
    # Publish port 7860 (the port app.py serves on) to the host.
    ports:
      - "7860:7860"
    volumes:
      # Mount the project directory at /app so app.py is available in the container.
      - ./:/app
      # Mount .cache at /root/.cache (presumably to keep downloaded models
      # across container restarts — confirm).
      - .cache:/root/.cache
    command: python app.py
    tty: true
docker/Dockerfile.gpu ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# GPU image: CUDA 12.0.1 + cuDNN 8 runtime on Ubuntu 22.04.
FROM nvidia/cuda:12.0.1-cudnn8-runtime-ubuntu22.04

# Keep apt from prompting for input during the build.
ENV DEBIAN_FRONTEND=noninteractive

# System packages: compiler, Python 3 with pip, ffmpeg and audio libraries.
# The apt package lists are removed at the end of the same layer, matching the
# CPU Dockerfile, so they are not baked into the image.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends \
        gcc \
        curl \
        wget \
        sudo \
        pciutils \
        python3-all-dev \
        python-is-python3 \
        python3-pip \
        ffmpeg \
        libsdl2-dev \
        pulseaudio \
        alsa-utils \
        portaudio19-dev \
    && pip install pip -U \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install the Python dependencies without keeping pip's download cache.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
docker/docker-compose.gpu.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Compose file for the GPU build of the speech-to-text app.
# This file lives in docker/, so paths below point one level up to the
# project root, where requirements.txt and app.py are.
version: '3'

services:
  app:
    build:
      # Use the project root as build context so `COPY requirements.txt`
      # in Dockerfile.gpu can find the file.
      context: ..
      dockerfile: docker/Dockerfile.gpu
    ports:
      - "7860:7860"
    volumes:
      # Mount the project root (not docker/) so app.py is present at /app.
      - ../:/app
    command: python app.py
    # Reserve the host's NVIDIA GPUs for the container; without this the
    # container gets no GPU and the app falls back to the CPU.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ torch
3
+ transformers
4
+ datasets[audio]
5
+ librosa