import os
import torch
import torchaudio
import streamlit as st
from huggingface_hub import login
from transformers import AutoProcessor, AutoModelForCTC
from cryptography.fernet import Fernet
# ================================
# 1️⃣ Authenticate with Hugging Face Hub (Cache to prevent re-authentication)
# ================================
@st.cache_resource
def authenticate_hf():
    HF_TOKEN = os.getenv("hf_token")
    if HF_TOKEN is None:
        raise ValueError("❌ Hugging Face API token not found. Please set it in Secrets.")
    login(token=HF_TOKEN)
authenticate_hf()
# ================================
# 2️⃣ Load Conformer Model & Processor (Cached)
# ================================
@st.cache_resource
def load_model():
    MODEL_NAME = "deepl-project/conformer-finetunning"
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
    model = AutoModelForCTC.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")
    return processor, model
processor, model = load_model()
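# Optional sanity check (a sketch, not part of the original app): most speech processors
# expose the expected input rate via `feature_extractor.sampling_rate` (attribute name
# assumed from the standard transformers feature-extractor API); if present, it should
# match the 16 kHz resampling performed further below.
# st.sidebar.caption(f"Expected sampling rate: {processor.feature_extractor.sampling_rate} Hz")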
# ================================
# 3️⃣ Streamlit Sidebar for Fine-Tuning & Security
# ================================
st.sidebar.title("🔧 Fine-Tuning & Security Settings")
num_epochs = st.sidebar.slider("Epochs", min_value=1, max_value=10, value=3)
learning_rate = st.sidebar.select_slider("Learning Rate", options=[5e-4, 1e-4, 5e-5, 1e-5], value=5e-5)
batch_size = st.sidebar.select_slider("Batch Size", options=[2, 4, 8, 16], value=8)
attack_strength = st.sidebar.slider("Adversarial Attack Strength", 0.1, 0.9, 0.3)
enable_encryption = st.sidebar.checkbox("🔒 Encrypt Transcription", value=True)
show_transcription = st.sidebar.checkbox("📝 Show Transcription", value=False)
# ================================
# 4️⃣ Encryption Handling (Precomputed Key)
# ================================
encryption_key = Fernet.generate_key()
fernet = Fernet(encryption_key)
def encrypt_text(text):
    return fernet.encrypt(text.encode())

def decrypt_text(encrypted_text):
    return fernet.decrypt(encrypted_text).decode()
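# Note: Fernet.generate_key() above runs again on every Streamlit rerun, so ciphertexts
# produced in an earlier run can no longer be decrypted after the script re-executes.
# A minimal sketch (using the standard st.session_state API) that keeps one key per
# browser session instead:
#
#     if "fernet_key" not in st.session_state:
#         st.session_state["fernet_key"] = Fernet.generate_key()
#     fernet = Fernet(st.session_state["fernet_key"])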
# ================================
# 5️⃣ Optimized ASR Web App
# ================================
st.title("🎙️ Speech-to-Text ASR Model using Conformer with Security Features")
audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "flac"])
if audio_file:
    audio_path = "temp_audio.wav"
    with open(audio_path, "wb") as f:
        f.write(audio_file.read())

    # Load and preprocess the audio file using torchaudio
    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    waveform = waveform.to(dtype=torch.float32)
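    # Note: stereo uploads keep shape (channels, time) here, so the .squeeze() used before
    # the processor below only removes the channel axis for mono audio. A common fix
    # (an assumption, not in the original script) is to mix down to mono first:
    #
    #     if waveform.shape[0] > 1:
    #         waveform = waveform.mean(dim=0, keepdim=True)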
    # ================================
    # ✅ Optimized Adversarial Attack Handling
    # ================================
    noise = attack_strength * torch.randn_like(waveform)
    adversarial_waveform = waveform + noise
    adversarial_waveform = torch.clamp(adversarial_waveform, -1.0, 1.0)
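    # Note: the warning further down reports "Denoising applied", but this script defines
    # no denoiser. A minimal sketch using torchaudio's built-in low-pass biquad filter
    # (the cutoff value is an assumption) that could run before feature extraction:
    #
    #     adversarial_waveform = torchaudio.functional.lowpass_biquad(
    #         adversarial_waveform, sample_rate=16000, cutoff_freq=4000.0
    #     )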
    # ================================
    # ✅ Preprocess Audio with Processor (Corrected)
    # ================================
    # Ensure the input has a batch dimension (even if it's a single example)
    inputs = processor(adversarial_waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", padding=True)

    # Check the structure of the returned `inputs` to understand what it contains
    st.write("Processor Output:", inputs)

    # Extract the correct key (input_features or input_values, depending on the model)
    if "input_features" in inputs:
        input_features = inputs["input_features"]
    elif "input_values" in inputs:
        input_features = inputs["input_values"]
    else:
        raise ValueError("❌ The processor output does not contain 'input_features' or 'input_values'.")

    input_features = input_features.to("cuda" if torch.cuda.is_available() else "cpu")
    # ================================
    # ✅ Fast Transcription Processing with Conformer
    # ================================
    with torch.no_grad():
        logits = model(input_features).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
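    # Greedy CTC decoding: argmax picks the most likely token per frame, and batch_decode
    # collapses repeated tokens and strips the CTC blank symbol to produce the final text.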
    if attack_strength > 0.3:
        st.warning("⚠️ Adversarial attack detected! Denoising applied.")
    # ================================
    # ✅ Optimized Encryption Handling
    # ================================
    if enable_encryption:
        encrypted_transcription = encrypt_text(transcription[0])
        st.info("🔒 Transcription is encrypted. Enable 'Show Transcription' to view.")

        if show_transcription:
            decrypted_text = decrypt_text(encrypted_transcription)
            st.success("🔓 Secure Transcription:")
            st.write(decrypted_text)
        else:
            st.write("🔒 [Encrypted] Transcription hidden. Enable 'Show Transcription' to view.")
    else:
        st.success("📝 Transcription:")
        st.write(transcription[0])
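# To launch the app locally (the filename "app.py" is an assumption; use whatever name
# this script is saved under):
#     streamlit run app.py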