ftshijt committed on
Commit
37d87af
·
1 Parent(s): 0223e0e

use docker for setup

Browse files
Files changed (5) hide show
  1. Dockerfile +42 -0
  2. app.py +30 -234
  3. postBuild +0 -41
  4. requirements.txt +1 -2
  5. universal_metrics.yaml +158 -0
Dockerfile ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ git \
8
+ build-essential \
9
+ libsndfile1 \
10
+ ffmpeg \
11
+ && apt-get clean \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Copy requirements file
15
+ COPY requirements.txt .
16
+
17
+ # Install Python dependencies
18
+ RUN pip install --no-cache-dir -U pip && \
19
+ pip install --no-cache-dir -r requirements.txt
20
+
21
+ # Clone VERSA repository
22
+ RUN git clone https://github.com/shinjiwlab/versa.git && \
23
+ cd versa && \
24
+ pip install -e .
25
+
26
+ # Set up data directories
27
+ RUN mkdir -p /app/data/configs /app/data/uploads /app/data/results
28
+
29
+ # Copy universal metrics YAML file
30
+ COPY universal_metrics.yaml /app/data/configs/
31
+
32
+ # Copy application code
33
+ COPY app.py .
34
+
35
+ # Create installation complete indicator
36
+ RUN touch /app/versa/.installation_complete
37
+
38
+ # Set environment variables
39
+ ENV PYTHONUNBUFFERED=1
40
+
41
+ # Run the application
42
+ CMD ["python", "app.py"]
app.py CHANGED
@@ -11,253 +11,38 @@ import matplotlib.pyplot as plt
11
  import time
12
  from pathlib import Path
13
 
14
- # VERSA paths - these should be set up during the build phase
15
- VERSA_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "versa")
16
  VERSA_BIN = os.path.join(VERSA_ROOT, "versa", "bin", "scorer.py")
17
  VERSA_CONFIG_DIR = os.path.join(VERSA_ROOT, "egs")
18
 
 
 
 
 
 
 
19
  # Check if VERSA is installed
20
  def check_versa_installation():
21
  """Check if VERSA is properly installed"""
22
  if not os.path.exists(VERSA_ROOT):
23
- return False, "VERSA directory not found. The build process may have failed."
24
 
25
  if not os.path.exists(VERSA_BIN):
26
- return False, "VERSA binary not found. The installation may be incomplete."
27
 
28
  if not os.path.exists(VERSA_CONFIG_DIR):
29
- return False, "VERSA configuration directory not found. The installation may be incomplete."
30
 
31
- # Check if the .installation_complete file exists (created by build.sh)
32
  if not os.path.exists(os.path.join(VERSA_ROOT, ".installation_complete")):
33
- return False, "VERSA installation indicator file not found. The build process may have failed."
34
 
35
  return True, "VERSA is properly installed."
36
 
37
  # Check VERSA installation at startup
38
  versa_installed, versa_status = check_versa_installation()
39
- if not versa_installed:
40
- print(f"WARNING: {versa_status}")
41
- print("The application may not function correctly without VERSA.")
42
- else:
43
- print("VERSA installation verified successfully.")
44
-
45
- # Create data directory if it doesn't exist
46
- DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
47
- UPLOAD_DIR = os.path.join(DATA_DIR, "uploads")
48
- RESULTS_DIR = os.path.join(DATA_DIR, "results")
49
- CONFIG_DIR = os.path.join(DATA_DIR, "configs")
50
-
51
- for directory in [DATA_DIR, UPLOAD_DIR, RESULTS_DIR, CONFIG_DIR]:
52
- os.makedirs(directory, exist_ok=True)
53
-
54
- # Save the default universal metrics YAML file
55
- UNIVERSAL_METRICS_YAML = os.path.join(CONFIG_DIR, "universal_metrics.yaml")
56
- if not os.path.exists(UNIVERSAL_METRICS_YAML):
57
- with open(UNIVERSAL_METRICS_YAML, 'w') as f:
58
- f.write("""# Universal Metrics Configuration for Versa
59
- # This file contains the configuration for various universal metrics used in speech quality assessment.
60
-
61
- # visqol metric
62
- # -- visqol: visual quality of speech
63
- - name: visqol
64
- model: default
65
-
66
- # Word error rate with ESPnet-OWSM model
67
- # More model_tag can be from the ESPnet huggingface https://huggingface.co/espnet .
68
- # The default model is `espnet/owsm_v3.1_ebf`.
69
- # --lid: the nbest language tag
70
- - name: lid
71
- model_tag: default
72
- nbest: 1
73
-
74
- # nomad (reference-based) metric
75
- # -- nomad: nomad reference-based model
76
- - name: nomad
77
- model_cache: versa_cache/nomad_pt-models
78
-
79
- # srmr related metrics
80
- # -- srmr: speech-to-reverberation modulation energy ratio
81
- - name: srmr
82
- n_cochlear_filters: 23
83
- low_freq: 125
84
- min_cf: 4
85
- max_cf: 128
86
- fast: True
87
- norm: False
88
-
89
- # Emotion similarity calculated based on emo2vec
90
- # --emo2vec_similarity: the emotion similarity with emo2vec
91
- - name: emo2vec_similarity
92
-
93
- # noresqa related metrics
94
- # -- noresqa: non-matching reference based speech quality assessment
95
- - name: noresqa
96
- metric_type: 1 #0: NORESQA-score, 1: NORESQA-MOS
97
-
98
- # pysepm related metrics
99
- # -- pysepm_fwsegsnr: frequency-weighted segmental SNR
100
- # -- pysepm_llr: Log likelihood ratio
101
- # -- pysepm_wss: weighted spectral slope
102
- # -- pysepm_cd: cepstral distance objective speech quality measure
103
- # -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
104
- # -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index
105
- # -- pysepm_ncm: normalized-covariance measure
106
- - name: pysepm
107
-
108
- # nisqa score for speech quality assessment
109
- # -- nisqa_mos_pred: NISQA MOS prediction
110
- # -- nisqa_noi_pred: NISQA noise prediction
111
- # -- nisqa_dis_pred: NISQA distortion prediction
112
- # -- nisqa_col_pred: NISQA color prediction
113
- # --nisqa_loud_pred: NISQA loudness prediction
114
- # NOTE(jiatong): pretrain model can be downloaded with `./tools/setup_nisqa.sh`
115
- - name: nisqa
116
- nisqa_model_path: ./tools/NISQA/weights/nisqa.tar
117
-
118
- # discrete speech metrics
119
- # -- speech_bert: speech bert score
120
- # -- speech_bleu: speech bleu score
121
- # -- speech_token_distance: speech token distance score
122
- - name: discrete_speech
123
-
124
- # mcd f0 related metrics
125
- # -- mcd: mel cepstral distortion
126
- # -- f0_corr: f0 correlation
127
- # -- f0_rmse: f0 root mean square error
128
- - name: mcd_f0
129
- f0min: 40
130
- f0max: 800
131
- mcep_shift: 5
132
- mcep_fftl: 1024
133
- mcep_dim: 39
134
- mcep_alpha: 0.466
135
- seq_mismatch_tolerance: 0.1
136
- power_threshold: -20
137
- dtw: false
138
-
139
- # An overall model on MOS-bench from Sheet toolkit
140
- # --sheet_ssqa: the mos prediction from sheet_ssqa
141
- - name: sheet_ssqa
142
-
143
- # pesq related metrics
144
- # -- pesq: perceptual evaluation of speech quality
145
- - name: pesq
146
-
147
- # stoi related metrics
148
- # -- stoi: short-time objective intelligibility
149
- - name: stoi
150
-
151
- # pseudo subjective metrics
152
- # -- utmos: UT-MOS score
153
- # -- dnsmos: DNS-MOS score
154
- # -- plcmos: PLC-MOS score
155
- # -- aecmos: AEC-MOS score
156
- - name: pseudo_mos
157
- predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"]
158
- predictor_args:
159
- utmos:
160
- fs: 16000
161
- dnsmos:
162
- fs: 16000
163
- plcmos:
164
- fs: 16000
165
- singmos:
166
- fs: 16000
167
- utmosv2:
168
- fs: 16000
169
-
170
- # Word error rate with OpenAI-Whisper model
171
- # -- whisper_wer: word error rate of openai-whisper
172
- - name: whisper_wer
173
- model_tag: default
174
- beam_size: 1
175
- text_cleaner: whisper_basic
176
-
177
- # scoreq (reference-based) metric
178
- # -- scoreq_ref: scoreq reference-based model
179
- - name: scoreq_ref
180
- data_domain: natrual
181
- model_cache: versa_cache/scoreq_pt-models
182
-
183
- # scoreq (non-reference-based) metric
184
- # -- scoreq_nr: scoreq non-reference-based model
185
- - name: scoreq_nr
186
- data_domain: natural
187
- model_cache: versa_cache/scoreq_pt-models
188
-
189
- # Speech Enhancement-based Metrics
190
- # model tag can be any ESPnet-SE huggingface repo
191
- # -- se_si_snr: the SI-SNR from a rerference speech enhancement model
192
- - name: se_snr
193
- model_tag: default
194
-
195
- # PAM: Prompting Audio-Language Models for Audio Quality Assessment
196
- # https://github.com/soham97/PAM/tree/main
197
-
198
- - name: pam
199
- repro: true
200
- cache_dir: versa_cache/pam
201
- io: soundfile
202
- # TEXT ENCODER CONFIG
203
- text_model: 'gpt2'
204
- text_len: 77
205
- transformer_embed_dim: 768
206
- freeze_text_encoder_weights: True
207
- # AUDIO ENCODER CONFIG
208
- audioenc_name: 'HTSAT'
209
- out_emb: 768
210
- sampling_rate: 44100
211
- duration: 7
212
- fmin: 50
213
- fmax: 8000 #14000
214
- n_fft: 1024 # 1028
215
- hop_size: 320
216
- mel_bins: 64
217
- window_size: 1024
218
- # PROJECTION SPACE CONFIG
219
- d_proj: 1024
220
- temperature: 0.003
221
- # TRAINING AND EVALUATION CONFIG
222
- num_classes: 527
223
- batch_size: 1024
224
- demo: False
225
-
226
- # Speaking rate calculating
227
- # --speaking_rate: correct matching words/character counts
228
- - name: speaking_rate
229
- model_tag: default
230
- beam_size: 1
231
- text_cleaner: whisper_basic
232
-
233
- # Audiobox Aesthetics (Unified automatic quality assessment for speech, music, and sound.)
234
- - name: audiobox_aesthetics
235
- batch_size: 1
236
- cache_dir: versa_cache/audiobox
237
-
238
- # ASR-match calculating
239
- # --asr_match_error_rate: correct matching words/character counts
240
- - name: asr_match
241
- model_tag: default
242
- beam_size: 1
243
- text_cleaner: whisper_basic
244
-
245
- # speaker related metrics
246
- # -- spk_similarity: speaker cosine similarity
247
- - name: speaker
248
- model_tag: default
249
-
250
- # asvspoof related metrics
251
- # -- asvspoof_score: evaluate how the generated speech is likely to be classifiied by a deepfake classifier
252
- - name: asvspoof_score
253
-
254
- # signal related metrics
255
- # -- sir: signal to interference ratio
256
- # -- sar: signal to artifact ratio
257
- # -- sdr: signal to distortion ratio
258
- # -- ci-sdr: scale-invariant signal to distortion ratio
259
- # -- si-snri: scale-invariant signal to noise ratio improvement
260
- - name: signal_metric""")
261
 
262
  # Find available metric configs
263
  def get_available_metrics():
@@ -297,9 +82,10 @@ def get_available_metric_names():
297
  return []
298
 
299
  # First check the universal metrics file
300
- if os.path.exists(UNIVERSAL_METRICS_YAML):
 
301
  try:
302
- with open(UNIVERSAL_METRICS_YAML, 'r') as f:
303
  config = yaml.safe_load(f)
304
  if isinstance(config, list):
305
  for item in config:
@@ -371,8 +157,9 @@ def create_custom_metric_config(selected_metrics, metric_parameters):
371
 
372
  # Load universal metrics as reference
373
  universal_metrics = []
 
374
  try:
375
- with open(UNIVERSAL_METRICS_YAML, 'r') as f:
376
  universal_metrics = yaml.safe_load(f)
377
  except Exception as e:
378
  return None, f"Error loading universal metrics: {str(e)}"
@@ -549,10 +336,11 @@ def create_gradio_demo():
549
  gr.Markdown(f"""
550
  ## ⚠️ VERSA Not Installed
551
 
552
- VERSA does not appear to be properly installed. The build process may have failed.
553
- Please check the build logs in the Factory tab of your Hugging Face Space.
554
 
555
  Error: {versa_status}
 
 
556
  """)
557
  else:
558
  gr.Markdown("Upload audio files and evaluate them using VERSA metrics.")
@@ -736,3 +524,11 @@ mcd_f0:
736
  inputs=[uploaded_yaml],
737
  outputs=[upload_status, custom_config_path, custom_config_content, metric_dropdown]
738
  )
 
 
 
 
 
 
 
 
 
11
  import time
12
  from pathlib import Path
13
 
14
+ # VERSA paths - these are set by the Dockerfile
15
+ VERSA_ROOT = "/app/versa"
16
  VERSA_BIN = os.path.join(VERSA_ROOT, "versa", "bin", "scorer.py")
17
  VERSA_CONFIG_DIR = os.path.join(VERSA_ROOT, "egs")
18
 
19
+ # Data directories - also set up by the Dockerfile
20
+ DATA_DIR = "/app/data"
21
+ UPLOAD_DIR = os.path.join(DATA_DIR, "uploads")
22
+ RESULTS_DIR = os.path.join(DATA_DIR, "results")
23
+ CONFIG_DIR = os.path.join(DATA_DIR, "configs")
24
+
25
  # Check if VERSA is installed
26
  def check_versa_installation():
27
  """Check if VERSA is properly installed"""
28
  if not os.path.exists(VERSA_ROOT):
29
+ return False, "VERSA directory not found."
30
 
31
  if not os.path.exists(VERSA_BIN):
32
+ return False, "VERSA binary not found."
33
 
34
  if not os.path.exists(VERSA_CONFIG_DIR):
35
+ return False, "VERSA configuration directory not found."
36
 
37
+ # Check if the .installation_complete file exists (created by Dockerfile)
38
  if not os.path.exists(os.path.join(VERSA_ROOT, ".installation_complete")):
39
+ return False, "VERSA installation indicator file not found."
40
 
41
  return True, "VERSA is properly installed."
42
 
43
  # Check VERSA installation at startup
44
  versa_installed, versa_status = check_versa_installation()
45
+ print(f"VERSA installation status: {versa_status}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  # Find available metric configs
48
  def get_available_metrics():
 
82
  return []
83
 
84
  # First check the universal metrics file
85
+ universal_metrics_yaml = os.path.join(CONFIG_DIR, "universal_metrics.yaml")
86
+ if os.path.exists(universal_metrics_yaml):
87
  try:
88
+ with open(universal_metrics_yaml, 'r') as f:
89
  config = yaml.safe_load(f)
90
  if isinstance(config, list):
91
  for item in config:
 
157
 
158
  # Load universal metrics as reference
159
  universal_metrics = []
160
+ universal_metrics_yaml = os.path.join(CONFIG_DIR, "universal_metrics.yaml")
161
  try:
162
+ with open(universal_metrics_yaml, 'r') as f:
163
  universal_metrics = yaml.safe_load(f)
164
  except Exception as e:
165
  return None, f"Error loading universal metrics: {str(e)}"
 
336
  gr.Markdown(f"""
337
  ## ⚠️ VERSA Not Installed
338
 
339
+ VERSA does not appear to be properly installed. The Docker container may not have been set up correctly.
 
340
 
341
  Error: {versa_status}
342
+
343
+ Please check the Docker build logs or contact the administrator.
344
  """)
345
  else:
346
  gr.Markdown("Upload audio files and evaluate them using VERSA metrics.")
 
524
  inputs=[uploaded_yaml],
525
  outputs=[upload_status, custom_config_path, custom_config_content, metric_dropdown]
526
  )
527
+
528
+ return demo
529
+
530
+ # Launch the app
531
+ if __name__ == "__main__":
532
+ demo = create_gradio_demo()
533
+ # Use 0.0.0.0 to listen on all interfaces, which is required for Docker
534
+ demo.launch(server_name="0.0.0.0", server_port=7860)
postBuild DELETED
@@ -1,41 +0,0 @@
1
- #!/bin/bash
2
- # This script will run after the environment has been built but before the Space is started
3
-
4
- set -e # Exit immediately if a command fails
5
-
6
- echo "Starting VERSA installation for Hugging Face Space..."
7
-
8
- # Set up directory structure
9
- echo "Setting up directory structure..."
10
- VERSA_ROOT="$(pwd)/versa"
11
- DATA_DIR="$(pwd)/data"
12
- CONFIG_DIR="${DATA_DIR}/configs"
13
- UPLOAD_DIR="${DATA_DIR}/uploads"
14
- RESULTS_DIR="${DATA_DIR}/results"
15
-
16
- mkdir -p "${DATA_DIR}" "${CONFIG_DIR}" "${UPLOAD_DIR}" "${RESULTS_DIR}"
17
-
18
- # Clone VERSA repository
19
- echo "Cloning VERSA repository..."
20
- if [ -d "${VERSA_ROOT}" ]; then
21
- echo "VERSA directory already exists, updating..."
22
- cd "${VERSA_ROOT}"
23
- git pull
24
- cd ..
25
- else
26
- echo "Cloning fresh VERSA repository..."
27
- git clone https://github.com/shinjiwlab/versa.git "${VERSA_ROOT}"
28
- fi
29
-
30
- # Install VERSA
31
- echo "Installing VERSA and dependencies..."
32
- cd "${VERSA_ROOT}"
33
- pip install -e .
34
-
35
- # Create a file to indicate successful installation
36
- touch "${VERSA_ROOT}/.installation_complete"
37
-
38
- # Return to the original directory
39
- cd ..
40
-
41
- echo "VERSA installation completed successfully!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,11 +1,10 @@
1
  gradio>=4.0.0
2
  pyyaml>=6.0
3
  pandas>=1.5.0
4
- numpy>=1.20.0
5
  matplotlib>=3.5.0
6
  soundfile>=0.12.1
7
  scipy>=1.7.0
8
  torch>=1.10.0
9
  torchaudio>=0.10.0
10
  librosa>=0.9.2
11
- GitPython>=3.1.30
 
1
  gradio>=4.0.0
2
  pyyaml>=6.0
3
  pandas>=1.5.0
4
+ numpy<=1.23.5
5
  matplotlib>=3.5.0
6
  soundfile>=0.12.1
7
  scipy>=1.7.0
8
  torch>=1.10.0
9
  torchaudio>=0.10.0
10
  librosa>=0.9.2
 
universal_metrics.yaml ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Universal Metrics Configuration for Versa
2
+ # This file contains the configuration for various universal metrics used in speech quality assessment.
3
+
4
+ # visqol metric
5
+ # -- visqol: Virtual Speech Quality Objective Listener
6
+ - name: visqol
7
+ model: default
8
+
9
+ # Word error rate with ESPnet-OWSM model
10
+ # More model_tag can be from the ESPnet huggingface https://huggingface.co/espnet .
11
+ # The default model is `espnet/owsm_v3.1_ebf`.
12
+ # --lid: the nbest language tag
13
+ - name: lid
14
+ model_tag: default
15
+ nbest: 1
16
+
17
+ # nomad (reference-based) metric
18
+ # -- nomad: nomad reference-based model
19
+ - name: nomad
20
+ model_cache: versa_cache/nomad_pt-models
21
+
22
+ # srmr related metrics
23
+ # -- srmr: speech-to-reverberation modulation energy ratio
24
+ - name: srmr
25
+ n_cochlear_filters: 23
26
+ low_freq: 125
27
+ min_cf: 4
28
+ max_cf: 128
29
+ fast: True
30
+ norm: False
31
+
32
+ # Emotion similarity calculated based on emo2vec
33
+ # --emo2vec_similarity: the emotion similarity with emo2vec
34
+ - name: emo2vec_similarity
35
+
36
+ # noresqa related metrics
37
+ # -- noresqa: non-matching reference based speech quality assessment
38
+ - name: noresqa
39
+ metric_type: 1 #0: NORESQA-score, 1: NORESQA-MOS
40
+
41
+ # pysepm related metrics
42
+ # -- pysepm_fwsegsnr: frequency-weighted segmental SNR
43
+ # -- pysepm_llr: Log likelihood ratio
44
+ # -- pysepm_wss: weighted spectral slope
45
+ # -- pysepm_cd: cepstral distance objective speech quality measure
46
+ # -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
47
+ # -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index
48
+ # -- pysepm_ncm: normalized-covariance measure
49
+ - name: pysepm
50
+
51
+ # nisqa score for speech quality assessment
52
+ # -- nisqa_mos_pred: NISQA MOS prediction
53
+ # -- nisqa_noi_pred: NISQA noise prediction
54
+ # -- nisqa_dis_pred: NISQA discontinuity prediction
55
+ # -- nisqa_col_pred: NISQA coloration prediction
56
+ # --nisqa_loud_pred: NISQA loudness prediction
57
+ # NOTE(jiatong): pretrain model can be downloaded with `./tools/setup_nisqa.sh`
58
+ - name: nisqa
59
+ nisqa_model_path: ./tools/NISQA/weights/nisqa.tar
60
+
61
+ # discrete speech metrics
62
+ # -- speech_bert: speech bert score
63
+ # -- speech_bleu: speech bleu score
64
+ # -- speech_token_distance: speech token distance score
65
+ - name: discrete_speech
66
+
67
+ # mcd f0 related metrics
68
+ # -- mcd: mel cepstral distortion
69
+ # -- f0_corr: f0 correlation
70
+ # -- f0_rmse: f0 root mean square error
71
+ - name: mcd_f0
72
+ f0min: 40
73
+ f0max: 800
74
+ mcep_shift: 5
75
+ mcep_fftl: 1024
76
+ mcep_dim: 39
77
+ mcep_alpha: 0.466
78
+ seq_mismatch_tolerance: 0.1
79
+ power_threshold: -20
80
+ dtw: false
81
+
82
+ # An overall model on MOS-bench from Sheet toolkit
83
+ # --sheet_ssqa: the mos prediction from sheet_ssqa
84
+ - name: sheet_ssqa
85
+
86
+ # pesq related metrics
87
+ # -- pesq: perceptual evaluation of speech quality
88
+ - name: pesq
89
+
90
+ # stoi related metrics
91
+ # -- stoi: short-time objective intelligibility
92
+ - name: stoi
93
+
94
+ # pseudo subjective metrics
95
+ # -- utmos: UT-MOS score
96
+ # -- dnsmos: DNS-MOS score
97
+ # -- plcmos: PLC-MOS score
98
+ # -- aecmos: AEC-MOS score
99
+ - name: pseudo_mos
100
+ predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"]
101
+ predictor_args:
102
+ utmos:
103
+ fs: 16000
104
+ dnsmos:
105
+ fs: 16000
106
+ plcmos:
107
+ fs: 16000
108
+ singmos:
109
+ fs: 16000
110
+ utmosv2:
111
+ fs: 16000
112
+
113
+ # Word error rate with OpenAI-Whisper model
114
+ # -- whisper_wer: word error rate of openai-whisper
115
+ - name: whisper_wer
116
+ model_tag: default
117
+ beam_size: 1
118
+ text_cleaner: whisper_basic
119
+
120
+ # scoreq (reference-based) metric
121
+ # -- scoreq_ref: scoreq reference-based model
122
+ - name: scoreq_ref
123
+ data_domain: natrual
124
+ model_cache: versa_cache/scoreq_pt-models
125
+
126
+ # scoreq (non-reference-based) metric
127
+ # -- scoreq_nr: scoreq non-reference-based model
128
+ - name: scoreq_nr
129
+ data_domain: natural
130
+ model_cache: versa_cache/scoreq_pt-models
131
+
132
+ # Speech Enhancement-based Metrics
133
+ # model tag can be any ESPnet-SE huggingface repo
134
+ # -- se_si_snr: the SI-SNR from a reference speech enhancement model
135
+ - name: se_snr
136
+ model_tag: default
137
+
138
+ # PAM: Prompting Audio-Language Models for Audio Quality Assessment
139
+ # https://github.com/soham97/PAM/tree/main
140
+
141
+ - name: pam
142
+ repro: true
143
+ cache_dir: versa_cache/pam
144
+ io: soundfile
145
+ # TEXT ENCODER CONFIG
146
+ text_model: 'gpt2'
147
+ text_len: 77
148
+ transformer_embed_dim: 768
149
+ freeze_text_encoder_weights: True
150
+ # AUDIO ENCODER CONFIG
151
+ audioenc_name: 'HTSAT'
152
+ out_emb: 768
153
+ sampling_rate: 44100
154
+ duration: 7
155
+ fmin: 50
156
+ fmax: 8000 #14000
157
+ n_fft: 1024 # 1028
158
+ hop_size