Update app.py
app.py
CHANGED
@@ -1,273 +1,63 @@
 import gradio as gr
 import torch
-import torch.nn as nn
 import torchaudio
 import json
-import
-
-

-# Define model architecture
-class AudioLanguageClassifierConfig(PretrainedConfig):
-    model_type = "audio-language-classifier"
-
-    def __init__(
-        self,
-        num_labels=10,  # Changed from 12 to 10 to match the saved model
-        sampling_rate=16000,
-        num_mel_bins=128,
-        feature_size=512,
-        num_transformer_layers=4,
-        num_attention_heads=4,
-        intermediate_size=1024,
-        hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-        self.num_labels = num_labels
-        self.sampling_rate = sampling_rate
-        self.num_mel_bins = num_mel_bins
-        self.feature_size = feature_size
-        self.num_transformer_layers = num_transformer_layers
-        self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-
-class AudioFeatureExtractor:
-    def __init__(self, config):
-        self.config = config
-        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
-            sample_rate=config.sampling_rate,
-            n_fft=1024,
-            hop_length=512,
-            n_mels=config.num_mel_bins
-        )
-        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
-
-    def __call__(self, audio_data, padding=True, max_length=None, truncation=True, **kwargs):
-        if isinstance(audio_data, np.ndarray):
-            audio_data = torch.from_numpy(audio_data)
-
-        # Ensure it's in the expected shape
-        if audio_data.ndim == 1:
-            audio_data = audio_data.unsqueeze(0)  # Add channel dimension
-
-        # Convert to mel spectrogram
-        mel_spec = self.mel_spectrogram(audio_data)
-        log_mel_spec = self.amplitude_to_db(mel_spec)
-
-        # Normalization
-        mean = log_mel_spec.mean()
-        std = log_mel_spec.std()
-        log_mel_spec = (log_mel_spec - mean) / (std + 1e-10)
-
-        # Handle max length/truncation
-        if max_length is not None and truncation and log_mel_spec.shape[-1] > max_length:
-            log_mel_spec = log_mel_spec[..., :max_length]
-
-        return {"input_values": log_mel_spec}
-
-class AudioLanguageClassifier(PreTrainedModel):
-    config_class = AudioLanguageClassifierConfig
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.config = config
-
-        # CNN feature extractor
-        self.feature_extractor = nn.Sequential(
-            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2d(2, 2),
-            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2d(2, 2)
-        )
-
-        # Global average pooling to eliminate size dependency
-        self.global_pool = nn.AdaptiveAvgPool2d((4, 4))
-
-        # Fixed size after global pooling
-        self.flattened_size = 64 * 4 * 4
-
-        # Projection layer with fixed input size
-        self.projection = nn.Linear(self.flattened_size, config.feature_size)
-
-        # Transformer for sequence modeling
-        encoder_layer = nn.TransformerEncoderLayer(
-            d_model=config.feature_size,
-            nhead=config.num_attention_heads,
-            dim_feedforward=config.intermediate_size,
-            dropout=config.hidden_dropout_prob,
-            batch_first=True
-        )
-        self.transformer_encoder = nn.TransformerEncoder(
-            encoder_layer,
-            num_layers=config.num_transformer_layers
-        )
-
-        # Classification head
-        self.classifier = nn.Linear(config.feature_size, config.num_labels)
-
-    def forward(
-        self,
-        input_values=None,
-        labels=None,
-        **kwargs
-    ):
-        batch_size = input_values.size(0)
-
-        # Extract features using CNN
-        x = self.feature_extractor(input_values)
-
-        # Apply global pooling to get fixed size
-        x = self.global_pool(x)
-
-        # Flatten
-        x = x.view(batch_size, -1)
-
-        # Project to transformer dimension
-        x = self.projection(x)
-
-        # Add sequence dimension for transformer
-        x = x.unsqueeze(1)  # [batch_size, 1, feature_size]
-
-        # Transformer encoding
-        x = self.transformer_encoder(x)
-
-        # Classification
-        x = x[:, 0, :]  # Take first token representation
-        logits = self.classifier(x)
-
-        loss = None
-        if labels is not None:
-            loss_fct = nn.CrossEntropyLoss()
-            loss = loss_fct(logits, labels)
-
-        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
-
-# Function to load the model and its configuration
-def load_model():
-    # Download the model files
-    repo_id = "hriteshMaikap/languageClassifier"
-
-    try:
-        model_path = hf_hub_download(repo_id=repo_id, filename="model.pt")
-        config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
-        mappings_path = hf_hub_download(repo_id=repo_id, filename="language_mappings.json")
-
-        # Load the config
-        with open(config_path, "r") as f:
-            config_dict = json.load(f)
-
-        # IMPORTANT: Override num_labels to 10 since the model was trained with 10 classes
-        config_dict["num_labels"] = 10
-
-        config = AudioLanguageClassifierConfig(**config_dict)
-
-        # Load the model
-        model = AudioLanguageClassifier(config)
-        model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
-        model.eval()
-
-        # Load language mappings
-        with open(mappings_path, "r") as f:
-            mappings = json.load(f)
-
-        id_to_language = {int(k): v for k, v in mappings["id_to_language"].items()}
-
-        return model, config, id_to_language
-
-    except Exception as e:
-        gr.Warning(f"Error loading model: {e}")
-        # Return placeholders with error message
-        raise gr.Error(f"Failed to load the language classification model: {e}")
-
-# Function to process audio and make predictions
-def classify_language(audio):
-    try:
-        # Load model on first inference
-        global model, config, id_to_language, feature_extractor
-        if 'model' not in globals() or model is None:
-            model, config, id_to_language = load_model()
-            feature_extractor = AudioFeatureExtractor(config)
-
-        # Get audio data
-        sr, waveform = audio
-
-        # Convert to torch tensor
-        waveform = torch.tensor(waveform).float()
-
-        # Ensure mono
-        if waveform.ndim > 1 and waveform.shape[0] > 1:
-            waveform = torch.mean(waveform, dim=0, keepdim=True)
-        elif waveform.ndim == 1:
-            waveform = waveform.unsqueeze(0)
-
-        # Resample to 16kHz if needed
-        if sr != 16000:
-            resampler = torchaudio.transforms.Resample(sr, 16000)
-            waveform = resampler(waveform)
-
-        # Extract features
-        features = feature_extractor(waveform, max_length=256)
-        input_values = features["input_values"]
-
-        # Pad or truncate to fixed length
-        _, height, width = input_values.shape
-        max_length = 256
-        if width < max_length:
-            padding = torch.zeros(1, height, max_length - width)
-            input_values = torch.cat([input_values, padding], dim=2)
-        elif width > max_length:
-            input_values = input_values[:, :, :max_length]
-
-        # Get prediction
-        with torch.no_grad():
-            outputs = model(input_values=input_values)
-            logits = outputs["logits"]
-            probs = torch.nn.functional.softmax(logits, dim=1)[0]
-
-        # Get top predictions
-        num_classes = min(3, len(id_to_language))
-        top_probs, top_ids = torch.topk(probs, num_classes)
-
-        # Format results
-        results = {}
-        for i, (prob, pred_id) in enumerate(zip(top_probs, top_ids)):
-            lang = id_to_language.get(pred_id.item(), f"Unknown-{pred_id.item()}")
-            results[lang] = float(prob)
-
-        return results
-
-    except Exception as e:
-        return {"Error": 1.0, "Details": str(e)}
-
-# Create the Gradio interface
 demo = gr.Interface(
-    fn=
-
-
-
-
-    description="Record or upload audio to identify the Indian language being spoken.",
-    examples=[],
-    article="""
-    <div style="text-align: center;">
-        <p>This model identifies various Indian languages from audio input. For best results:</p>
-        <ul style="display: inline-block; text-align: left;">
-            <li>Speak clearly with minimal background noise</li>
-            <li>Recording length of 3-5 seconds is ideal</li>
-            <li>Make sure to speak a full sentence or phrase</li>
-        </ul>
-    </div>
-    """
 )

-# Launch the app
 if __name__ == "__main__":
-    # Initialize model as None to lazy-load on first inference
-    model, config, id_to_language, feature_extractor = None, None, None, None
     demo.launch()
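The replacement file below drops these inline definitions and instead does `from model import AudioLanguageClassifier, AudioLanguageClassifierConfig, AudioFeatureExtractor`, so the Space must also contain a model.py module. A minimal sketch of that module follows, assuming it simply mirrors the classes removed above; the numpy, torch.nn, and transformers imports are spelled out here because they do not appear in the old file as rendered, and anything beyond the deleted code is an assumption, not the Space's actual module.

# model.py -- assumed companion module; mirrors the classes deleted from app.py above
import numpy as np
import torch
import torch.nn as nn
import torchaudio
from transformers import PretrainedConfig, PreTrainedModel


class AudioLanguageClassifierConfig(PretrainedConfig):
    model_type = "audio-language-classifier"

    def __init__(self, num_labels=10, sampling_rate=16000, num_mel_bins=128,
                 feature_size=512, num_transformer_layers=4, num_attention_heads=4,
                 intermediate_size=1024, hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1, **kwargs):
        super().__init__(**kwargs)
        self.num_labels = num_labels
        self.sampling_rate = sampling_rate
        self.num_mel_bins = num_mel_bins
        self.feature_size = feature_size
        self.num_transformer_layers = num_transformer_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob


class AudioFeatureExtractor:
    """Turns a waveform into a normalized log-mel spectrogram."""

    def __init__(self, config):
        self.config = config
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=config.sampling_rate, n_fft=1024, hop_length=512,
            n_mels=config.num_mel_bins)
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

    def __call__(self, audio_data, padding=True, max_length=None, truncation=True, **kwargs):
        if isinstance(audio_data, np.ndarray):
            audio_data = torch.from_numpy(audio_data)
        if audio_data.ndim == 1:
            audio_data = audio_data.unsqueeze(0)  # add channel dimension
        log_mel = self.amplitude_to_db(self.mel_spectrogram(audio_data))
        log_mel = (log_mel - log_mel.mean()) / (log_mel.std() + 1e-10)  # global normalization
        if max_length is not None and truncation and log_mel.shape[-1] > max_length:
            log_mel = log_mel[..., :max_length]
        return {"input_values": log_mel}


class AudioLanguageClassifier(PreTrainedModel):
    config_class = AudioLanguageClassifierConfig

    def __init__(self, config):
        super().__init__(config)
        # CNN front end -> global pooling -> projection -> transformer -> classifier
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2, 2))
        self.global_pool = nn.AdaptiveAvgPool2d((4, 4))
        self.projection = nn.Linear(64 * 4 * 4, config.feature_size)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.feature_size, nhead=config.num_attention_heads,
            dim_feedforward=config.intermediate_size,
            dropout=config.hidden_dropout_prob, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=config.num_transformer_layers)
        self.classifier = nn.Linear(config.feature_size, config.num_labels)

    def forward(self, input_values=None, labels=None, **kwargs):
        batch_size = input_values.size(0)
        x = self.feature_extractor(input_values)      # conv features
        x = self.global_pool(x).view(batch_size, -1)  # pool to a fixed size and flatten
        x = self.projection(x).unsqueeze(1)           # project, add a length-1 sequence dim
        x = self.transformer_encoder(x)[:, 0, :]      # take the single token representation
        logits = self.classifier(x)
        if labels is None:
            return {"logits": logits}
        loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits}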
 import gradio as gr
 import torch
 import torchaudio
 import json
+import os
+
+# Import your model architecture
+from model import AudioLanguageClassifier, AudioLanguageClassifierConfig, AudioFeatureExtractor
+
+MODEL_DIR = "."
+
+# Load config and mappings
+with open(os.path.join(MODEL_DIR, "config.json")) as f:
+    config_dict = json.load(f)
+with open(os.path.join(MODEL_DIR, "language_mappings.json")) as f:
+    mappings = json.load(f)
+id_to_language = {int(k): v for k, v in mappings["id_to_language"].items()}
+
+config = AudioLanguageClassifierConfig(**config_dict)
+model = AudioLanguageClassifier(config)
+model.load_state_dict(torch.load(os.path.join(MODEL_DIR, "model.pt"), map_location="cpu"))
+model.eval()
+
+feature_extractor = AudioFeatureExtractor(config)
+max_length = 256  # Or whatever you used in training
+
+def predict_language(audio):
+    waveform, sample_rate = torchaudio.load(audio)
+    # Resample and mono
+    if sample_rate != 16000:
+        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+    if waveform.shape[0] > 1:
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+    features = feature_extractor(waveform)
+    input_values = features["input_values"]
+    _, height, width = input_values.shape
+    # Pad/truncate
+    if width < max_length:
+        padding = torch.zeros(1, height, max_length - width)
+        input_values = torch.cat([input_values, padding], dim=2)
+    elif width > max_length:
+        input_values = input_values[:, :, :max_length]
+    with torch.no_grad():
+        outputs = model(input_values=input_values)
+        logits = outputs["logits"]
+        probs = torch.nn.functional.softmax(logits, dim=1)[0]
+    top_probs, top_ids = torch.topk(probs, 3)
+    results = []
+    for prob, pred_id in zip(top_probs, top_ids):
+        lang = id_to_language[pred_id.item()]
+        results.append(f"{lang}: {prob.item():.2f}")
+    return "\n".join(results)

 demo = gr.Interface(
+    fn=predict_language,
+    inputs=gr.Audio(source="microphone", type="filepath"),
+    outputs="text",
+    title="Indian Language Identifier",
+    description="Record audio and classify the spoken Indian language."
 )

 if __name__ == "__main__":
     demo.launch()
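Note that the new app.py reads config.json, language_mappings.json, and model.pt from the Space's working directory (MODEL_DIR = "."), so those files have to be committed alongside it. If they only live in the hriteshMaikap/languageClassifier model repo, one option is to fetch them at startup the way the old file did with hf_hub_download; a minimal sketch, assuming the same filenames as above:

# Optional alternative to MODEL_DIR = ".": download the artifacts from the model repo,
# as the deleted load_model() did, then pass these paths to json.load / torch.load.
from huggingface_hub import hf_hub_download

repo_id = "hriteshMaikap/languageClassifier"  # repo the old app.py downloaded from
config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
mappings_path = hf_hub_download(repo_id=repo_id, filename="language_mappings.json")
model_path = hf_hub_download(repo_id=repo_id, filename="model.pt")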