Spaces:

ftshijt
/

versa

Sleeping

App Files Files Community

ftshijt committed 18 days ago

Commit

37d87af

1 Parent(s): 0223e0e

use docker for setup

Browse files

Files changed (5) hide show

Dockerfile +42 -0
app.py +30 -234
postBuild +0 -41
requirements.txt +1 -2
universal_metrics.yaml +158 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,42 @@

+# Image for the VERSA demo app: installs system and Python dependencies,
+# clones and installs VERSA under /app/versa, then starts app.py.
+FROM python:3.9-slim
+WORKDIR /app
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    git \
+    build-essential \
+    libsndfile1 \
+    ffmpeg \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements file
+# (copied on its own, before the application code, so the dependency layer
+# is only rebuilt when requirements.txt changes)
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -U pip && \
+    pip install --no-cache-dir -r requirements.txt
+# Clone VERSA repository
+# With WORKDIR /app the checkout lands in /app/versa, the path app.py uses as VERSA_ROOT.
+# NOTE(review): the clone is not pinned to a tag or commit, so every rebuild
+# picks up whatever the repository's default branch contains at that time.
+RUN git clone https://github.com/shinjiwlab/versa.git && \
+    cd versa && \
+    pip install -e .
+# Set up data directories
+# NOTE(review): these are created by the build user; confirm the runtime user
+# can write to uploads/ and results/.
+RUN mkdir -p /app/data/configs /app/data/uploads /app/data/results
+# Copy universal metrics YAML file
+COPY universal_metrics.yaml /app/data/configs/
+# Copy application code
+COPY app.py .
+# Create installation complete indicator
+# (check_versa_installation() in app.py looks for this marker file)
+RUN touch /app/versa/.installation_complete
+# Set environment variables
+# PYTHONUNBUFFERED makes print() output reach the container logs without buffering.
+ENV PYTHONUNBUFFERED=1
+# Run the application
+# app.py launches Gradio on 0.0.0.0:7860; no EXPOSE instruction is declared in this file.
+CMD ["python", "app.py"]

app.py CHANGED Viewed

@@ -11,253 +11,38 @@ import matplotlib.pyplot as plt
 import time
 from pathlib import Path
-# VERSA paths - these should be set up during the build phase
-VERSA_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "versa")
 VERSA_BIN = os.path.join(VERSA_ROOT, "versa", "bin", "scorer.py")
 VERSA_CONFIG_DIR = os.path.join(VERSA_ROOT, "egs")
 # Check if VERSA is installed
 def check_versa_installation():
     """Check if VERSA is properly installed"""
     if not os.path.exists(VERSA_ROOT):
-        return False, "VERSA directory not found. The build process may have failed."
     if not os.path.exists(VERSA_BIN):
-        return False, "VERSA binary not found. The installation may be incomplete."
     if not os.path.exists(VERSA_CONFIG_DIR):
-        return False, "VERSA configuration directory not found. The installation may be incomplete."
-    # Check if the .installation_complete file exists (created by build.sh)
     if not os.path.exists(os.path.join(VERSA_ROOT, ".installation_complete")):
-        return False, "VERSA installation indicator file not found. The build process may have failed."
     return True, "VERSA is properly installed."
 # Check VERSA installation at startup
 versa_installed, versa_status = check_versa_installation()
-if not versa_installed:
-    print(f"WARNING: {versa_status}")
-    print("The application may not function correctly without VERSA.")
-else:
-    print("VERSA installation verified successfully.")
-# Create data directory if it doesn't exist
-DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
-UPLOAD_DIR = os.path.join(DATA_DIR, "uploads")
-RESULTS_DIR = os.path.join(DATA_DIR, "results")
-CONFIG_DIR = os.path.join(DATA_DIR, "configs")
-for directory in [DATA_DIR, UPLOAD_DIR, RESULTS_DIR, CONFIG_DIR]:
-    os.makedirs(directory, exist_ok=True)
-# Save the default universal metrics YAML file
-UNIVERSAL_METRICS_YAML = os.path.join(CONFIG_DIR, "universal_metrics.yaml")
-if not os.path.exists(UNIVERSAL_METRICS_YAML):
-    with open(UNIVERSAL_METRICS_YAML, 'w') as f:
-        f.write("""# Universal Metrics Configuration for Versa
-# This file contains the configuration for various universal metrics used in speech quality assessment.
-# visqol metric
-# -- visqol: visual quality of speech
-- name: visqol
-  model: default
-# Word error rate with ESPnet-OWSM model
-# More model_tag can be from the ESPnet huggingface https://huggingface.co/espnet .
-# The default model is `espnet/owsm_v3.1_ebf`.
-# --lid: the nbest language tag
-- name: lid
-  model_tag: default
-  nbest: 1
-# nomad (reference-based) metric
-# -- nomad: nomad reference-based model
-- name: nomad
-  model_cache: versa_cache/nomad_pt-models
-# srmr related metrics
-# -- srmr: speech-to-reverberation modulation energy ratio
-- name: srmr
-  n_cochlear_filters: 23
-  low_freq: 125
-  min_cf: 4
-  max_cf: 128
-  fast: True
-  norm: False
-# Emotion similarity calculated based on emo2vec
-# --emo2vec_similarity: the emotion similarity with emo2vec
-- name: emo2vec_similarity
-# noresqa related metrics
-# -- noresqa: non-matching reference based speech quality assessment
-- name: noresqa
-  metric_type: 1 #0: NORESQA-score, 1: NORESQA-MOS
-# pysepm related metrics
-# -- pysepm_fwsegsnr: frequency-weighted segmental SNR
-# -- pysepm_llr: Log likelihood ratio
-# -- pysepm_wss: weighted spectral slope
-# -- pysepm_cd: cepstral distance objective speech quality measure
-# -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
-# -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index
-# -- pysepm_ncm: normalized-covariance measure
-- name: pysepm
-# nisqa score for speech quality assessment
-#  -- nisqa_mos_pred: NISQA MOS prediction
-#  -- nisqa_noi_pred: NISQA noise prediction
-#  -- nisqa_dis_pred: NISQA distortion prediction
-#  -- nisqa_col_pred: NISQA color prediction
-#  --nisqa_loud_pred: NISQA loudness prediction
-# NOTE(jiatong): pretrain model can be downloaded with `./tools/setup_nisqa.sh`
-- name: nisqa
-  nisqa_model_path: ./tools/NISQA/weights/nisqa.tar
-# discrete speech metrics
-# -- speech_bert: speech bert score
-# -- speech_bleu: speech bleu score
-# -- speech_token_distance: speech token distance score
-- name: discrete_speech
-# mcd f0 related metrics
-#  -- mcd: mel cepstral distortion
-#  -- f0_corr: f0 correlation
-#  -- f0_rmse: f0 root mean square error
-- name: mcd_f0
-  f0min: 40
-  f0max: 800
-  mcep_shift: 5
-  mcep_fftl: 1024
-  mcep_dim: 39
-  mcep_alpha: 0.466
-  seq_mismatch_tolerance: 0.1
-  power_threshold: -20
-  dtw: false
-# An overall model on MOS-bench from Sheet toolkit
-# --sheet_ssqa: the mos prediction from sheet_ssqa
-- name: sheet_ssqa
-# pesq related metrics
-# -- pesq: perceptual evaluation of speech quality
-- name: pesq
-# stoi related metrics
-# -- stoi: short-time objective intelligibility
-- name: stoi
-# pseudo subjective metrics
-# -- utmos: UT-MOS score
-# -- dnsmos: DNS-MOS score
-# -- plcmos: PLC-MOS score
-# -- aecmos: AEC-MOS score
-- name: pseudo_mos
-  predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"]
-  predictor_args:
-    utmos:
-      fs: 16000
-    dnsmos:
-      fs: 16000
-    plcmos:
-      fs: 16000
-    singmos:
-      fs: 16000
-    utmosv2:
-      fs: 16000
-# Word error rate with OpenAI-Whisper model
-# -- whisper_wer: word error rate of openai-whisper
-- name: whisper_wer
-  model_tag: default
-  beam_size: 1
-  text_cleaner: whisper_basic
-# scoreq (reference-based) metric
-# -- scoreq_ref: scoreq reference-based model
-- name: scoreq_ref
-  data_domain: natrual
-  model_cache: versa_cache/scoreq_pt-models
-# scoreq (non-reference-based) metric
-# -- scoreq_nr: scoreq non-reference-based model
-- name: scoreq_nr
-  data_domain: natural
-  model_cache: versa_cache/scoreq_pt-models
-# Speech Enhancement-based Metrics
-# model tag can be any ESPnet-SE huggingface repo
-# -- se_si_snr: the SI-SNR from a rerference speech enhancement model
-- name: se_snr
-  model_tag: default
-# PAM: Prompting Audio-Language Models for Audio Quality Assessment
-# https://github.com/soham97/PAM/tree/main
-- name: pam
-  repro: true
-  cache_dir: versa_cache/pam
-  io: soundfile
-  # TEXT ENCODER CONFIG
-  text_model: 'gpt2'
-  text_len: 77
-  transformer_embed_dim: 768
-  freeze_text_encoder_weights: True
-  # AUDIO ENCODER CONFIG
-  audioenc_name: 'HTSAT'
-  out_emb: 768
-  sampling_rate: 44100
-  duration: 7
-  fmin: 50
-  fmax: 8000 #14000
-  n_fft: 1024 # 1028
-  hop_size: 320
-  mel_bins: 64
-  window_size: 1024
-  # PROJECTION SPACE CONFIG
-  d_proj: 1024
-  temperature: 0.003
-  # TRAINING AND EVALUATION CONFIG
-  num_classes: 527
-  batch_size: 1024
-  demo: False
-# Speaking rate calculating
-# --speaking_rate: correct matching words/character counts
-- name: speaking_rate
-  model_tag: default
-  beam_size: 1
-  text_cleaner: whisper_basic
-# Audiobox Aesthetics (Unified automatic quality assessment for speech, music, and sound.)
-- name: audiobox_aesthetics
-  batch_size: 1
-  cache_dir: versa_cache/audiobox
-# ASR-match calculating
-# --asr_match_error_rate: correct matching words/character counts
-- name: asr_match
-  model_tag: default
-  beam_size: 1
-  text_cleaner: whisper_basic
-# speaker related metrics
-# -- spk_similarity: speaker cosine similarity
-- name: speaker
-  model_tag: default
-# asvspoof related metrics
-# -- asvspoof_score: evaluate how the generated speech is likely to be classifiied by a deepfake classifier
-- name: asvspoof_score
-# signal related metrics
-# -- sir: signal to interference ratio
-# -- sar: signal to artifact ratio
-# -- sdr: signal to distortion ratio
-# -- ci-sdr: scale-invariant signal to distortion ratio
-# -- si-snri: scale-invariant signal to noise ratio improvement
-- name: signal_metric""")
 # Find available metric configs
 def get_available_metrics():
@@ -297,9 +82,10 @@ def get_available_metric_names():
         return []
     # First check the universal metrics file
-    if os.path.exists(UNIVERSAL_METRICS_YAML):
         try:
-            with open(UNIVERSAL_METRICS_YAML, 'r') as f:
                 config = yaml.safe_load(f)
                 if isinstance(config, list):
                     for item in config:
@@ -371,8 +157,9 @@ def create_custom_metric_config(selected_metrics, metric_parameters):
     # Load universal metrics as reference
     universal_metrics = []
     try:
-        with open(UNIVERSAL_METRICS_YAML, 'r') as f:
             universal_metrics = yaml.safe_load(f)
     except Exception as e:
         return None, f"Error loading universal metrics: {str(e)}"
@@ -549,10 +336,11 @@ def create_gradio_demo():
             gr.Markdown(f"""
             ## ⚠️ VERSA Not Installed
-            VERSA does not appear to be properly installed. The build process may have failed.
-            Please check the build logs in the Factory tab of your Hugging Face Space.
             Error: {versa_status}
             """)
         else:
             gr.Markdown("Upload audio files and evaluate them using VERSA metrics.")
@@ -736,3 +524,11 @@ mcd_f0:
                 inputs=[uploaded_yaml],
                 outputs=[upload_status, custom_config_path, custom_config_content, metric_dropdown]
             )

 import time
 from pathlib import Path
+# VERSA paths - these are set by the Dockerfile
+VERSA_ROOT = "/app/versa"
 VERSA_BIN = os.path.join(VERSA_ROOT, "versa", "bin", "scorer.py")
 VERSA_CONFIG_DIR = os.path.join(VERSA_ROOT, "egs")
+# Data directories - also set up by the Dockerfile
+DATA_DIR = "/app/data"
+UPLOAD_DIR = os.path.join(DATA_DIR, "uploads")
+RESULTS_DIR = os.path.join(DATA_DIR, "results")
+CONFIG_DIR = os.path.join(DATA_DIR, "configs")
 # Check if VERSA is installed
 # Returns a (bool, str) tuple: a success flag and a human-readable status message.
 # The paths are tested in order, and the first one that is missing decides the message.
 def check_versa_installation():
     """Check if VERSA is properly installed"""
     # The checkout directory itself must exist.
     if not os.path.exists(VERSA_ROOT):
+        return False, "VERSA directory not found."
     # VERSA_BIN points at versa/bin/scorer.py inside the checkout.
     if not os.path.exists(VERSA_BIN):
+        return False, "VERSA binary not found."
     # VERSA_CONFIG_DIR is the "egs" directory inside the checkout.
     if not os.path.exists(VERSA_CONFIG_DIR):
+        return False, "VERSA configuration directory not found."
+    # Check if the .installation_complete file exists (created by Dockerfile)
     if not os.path.exists(os.path.join(VERSA_ROOT, ".installation_complete")):
+        return False, "VERSA installation indicator file not found."
     return True, "VERSA is properly installed."
 # Check VERSA installation at startup
 versa_installed, versa_status = check_versa_installation()
+print(f"VERSA installation status: {versa_status}")
 # Find available metric configs
 def get_available_metrics():
         return []
     # First check the universal metrics file
+    universal_metrics_yaml = os.path.join(CONFIG_DIR, "universal_metrics.yaml")
+    if os.path.exists(universal_metrics_yaml):
         try:
+            with open(universal_metrics_yaml, 'r') as f:
                 config = yaml.safe_load(f)
                 if isinstance(config, list):
                     for item in config:
     # Load universal metrics as reference
     universal_metrics = []
+    universal_metrics_yaml = os.path.join(CONFIG_DIR, "universal_metrics.yaml")
     try:
+        with open(universal_metrics_yaml, 'r') as f:
             universal_metrics = yaml.safe_load(f)
     except Exception as e:
         return None, f"Error loading universal metrics: {str(e)}"
             gr.Markdown(f"""
             ## ⚠️ VERSA Not Installed
+            VERSA does not appear to be properly installed. The Docker container may not have been set up correctly.
             Error: {versa_status}
+            Please check the Docker build logs or contact the administrator.
             """)
         else:
             gr.Markdown("Upload audio files and evaluate them using VERSA metrics.")
                 inputs=[uploaded_yaml],
                 outputs=[upload_status, custom_config_path, custom_config_content, metric_dropdown]
             )
+    return demo
+# Launch the app
+# Only runs when the file is executed as a script (the Dockerfile CMD runs `python app.py`).
+if __name__ == "__main__":
+    demo = create_gradio_demo()
+    # Use 0.0.0.0 to listen on all interfaces, which is required for Docker
+    # NOTE(review): the port is hard-coded to 7860 — confirm it matches the port
+    # the hosting environment routes traffic to.
+    demo.launch(server_name="0.0.0.0", server_port=7860)

postBuild DELETED Viewed

@@ -1,41 +0,0 @@
-#!/bin/bash
-# This script will run after the environment has been built but before the Space is started
-set -e  # Exit immediately if a command fails
-echo "Starting VERSA installation for Hugging Face Space..."
-# Set up directory structure
-echo "Setting up directory structure..."
-VERSA_ROOT="$(pwd)/versa"
-DATA_DIR="$(pwd)/data"
-CONFIG_DIR="${DATA_DIR}/configs"
-UPLOAD_DIR="${DATA_DIR}/uploads"
-RESULTS_DIR="${DATA_DIR}/results"
-mkdir -p "${DATA_DIR}" "${CONFIG_DIR}" "${UPLOAD_DIR}" "${RESULTS_DIR}"
-# Clone VERSA repository
-echo "Cloning VERSA repository..."
-if [ -d "${VERSA_ROOT}" ]; then
-    echo "VERSA directory already exists, updating..."
-    cd "${VERSA_ROOT}"
-    git pull
-    cd ..
-else
-    echo "Cloning fresh VERSA repository..."
-    git clone https://github.com/shinjiwlab/versa.git "${VERSA_ROOT}"
-fi
-# Install VERSA
-echo "Installing VERSA and dependencies..."
-cd "${VERSA_ROOT}"
-pip install -e .
-# Create a file to indicate successful installation
-touch "${VERSA_ROOT}/.installation_complete"
-# Return to the original directory
-cd ..
-echo "VERSA installation completed successfully!"

requirements.txt CHANGED Viewed

@@ -1,11 +1,10 @@
 gradio>=4.0.0
 pyyaml>=6.0
 pandas>=1.5.0
-numpy>=1.20.0
 matplotlib>=3.5.0
 soundfile>=0.12.1
 scipy>=1.7.0
 torch>=1.10.0
 torchaudio>=0.10.0
 librosa>=0.9.2
-GitPython>=3.1.30

 gradio>=4.0.0
 pyyaml>=6.0
 pandas>=1.5.0
+numpy<=1.23.5
 matplotlib>=3.5.0
 soundfile>=0.12.1
 scipy>=1.7.0
 torch>=1.10.0
 torchaudio>=0.10.0
 librosa>=0.9.2

universal_metrics.yaml ADDED Viewed

	@@ -0,0 +1,158 @@

+# Universal Metrics Configuration for Versa
+# This file contains the configuration for various universal metrics used in speech quality assessment.
+# visqol metric
+# -- visqol: Virtual Speech Quality Objective Listener (ViSQOL)
+- name: visqol
+  model: default
+# Language identification with ESPnet-OWSM model
+# Other model_tag values can be taken from the ESPnet Hugging Face page: https://huggingface.co/espnet
+# The default model is `espnet/owsm_v3.1_ebf`.
+# -- lid: the n-best language tags
+- name: lid
+  model_tag: default
+  nbest: 1
+# nomad (reference-based) metric
+# -- nomad: nomad reference-based model
+- name: nomad
+  model_cache: versa_cache/nomad_pt-models
+# srmr related metrics
+# -- srmr: speech-to-reverberation modulation energy ratio
+- name: srmr
+  n_cochlear_filters: 23
+  low_freq: 125
+  min_cf: 4
+  max_cf: 128
+  fast: True
+  norm: False
+# Emotion similarity calculated based on emo2vec
+# --emo2vec_similarity: the emotion similarity with emo2vec
+- name: emo2vec_similarity
+# noresqa related metrics
+# -- noresqa: non-matching reference based speech quality assessment
+- name: noresqa
+  metric_type: 1 #0: NORESQA-score, 1: NORESQA-MOS
+# pysepm related metrics
+# -- pysepm_fwsegsnr: frequency-weighted segmental SNR
+# -- pysepm_llr: Log likelihood ratio
+# -- pysepm_wss: weighted spectral slope
+# -- pysepm_cd: cepstral distance objective speech quality measure
+# -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
+# -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index
+# -- pysepm_ncm: normalized-covariance measure
+- name: pysepm
+# nisqa score for speech quality assessment
+#  -- nisqa_mos_pred: NISQA MOS prediction
+#  -- nisqa_noi_pred: NISQA noise prediction
+#  -- nisqa_dis_pred: NISQA distortion prediction
+#  -- nisqa_col_pred: NISQA color prediction
+#  --nisqa_loud_pred: NISQA loudness prediction
+# NOTE(jiatong): pretrain model can be downloaded with `./tools/setup_nisqa.sh`
+- name: nisqa
+  nisqa_model_path: ./tools/NISQA/weights/nisqa.tar
+# discrete speech metrics
+# -- speech_bert: speech bert score
+# -- speech_bleu: speech bleu score
+# -- speech_token_distance: speech token distance score
+- name: discrete_speech
+# mcd f0 related metrics
+#  -- mcd: mel cepstral distortion
+#  -- f0_corr: f0 correlation
+#  -- f0_rmse: f0 root mean square error
+- name: mcd_f0
+  f0min: 40
+  f0max: 800
+  mcep_shift: 5
+  mcep_fftl: 1024
+  mcep_dim: 39
+  mcep_alpha: 0.466
+  seq_mismatch_tolerance: 0.1
+  power_threshold: -20
+  dtw: false
+# An overall model on MOS-bench from Sheet toolkit
+# --sheet_ssqa: the mos prediction from sheet_ssqa
+- name: sheet_ssqa
+# pesq related metrics
+# -- pesq: perceptual evaluation of speech quality
+- name: pesq
+# stoi related metrics
+# -- stoi: short-time objective intelligibility
+- name: stoi
+# pseudo subjective metrics
+# -- utmos: UT-MOS score
+# -- dnsmos: DNS-MOS score
+# -- plcmos: PLC-MOS score
+# -- aecmos: AEC-MOS score
+- name: pseudo_mos
+  predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"]
+  predictor_args:
+    utmos:
+      fs: 16000
+    dnsmos:
+      fs: 16000
+    plcmos:
+      fs: 16000
+    singmos:
+      fs: 16000
+    utmosv2:
+      fs: 16000
+# Word error rate with OpenAI-Whisper model
+# -- whisper_wer: word error rate of openai-whisper
+- name: whisper_wer
+  model_tag: default
+  beam_size: 1
+  text_cleaner: whisper_basic
+# scoreq (reference-based) metric
+# -- scoreq_ref: scoreq reference-based model
+# NOTE(review): data_domain was misspelled `natrual`; corrected to `natural`,
+# the spelling used by the scoreq_nr entry in this file.
+- name: scoreq_ref
+  data_domain: natural
+  model_cache: versa_cache/scoreq_pt-models
+# scoreq (non-reference-based) metric
+# -- scoreq_nr: scoreq non-reference-based model
+- name: scoreq_nr
+  data_domain: natural
+  model_cache: versa_cache/scoreq_pt-models
+# Speech Enhancement-based Metrics
+# model tag can be any ESPnet-SE huggingface repo
+# -- se_si_snr: the SI-SNR from a reference speech enhancement model
+- name: se_snr
+  model_tag: default
+# PAM: Prompting Audio-Language Models for Audio Quality Assessment
+# https://github.com/soham97/PAM/tree/main
+- name: pam
+  repro: true
+  cache_dir: versa_cache/pam
+  io: soundfile
+  # TEXT ENCODER CONFIG
+  text_model: 'gpt2'
+  text_len: 77
+  transformer_embed_dim: 768
+  freeze_text_encoder_weights: True
+  # AUDIO ENCODER CONFIG
+  audioenc_name: 'HTSAT'
+  out_emb: 768
+  sampling_rate: 44100
+  duration: 7
+  fmin: 50
+  fmax: 8000 #14000
+  n_fft: 1024 # 1028
+  hop_size