hriteshMaikap committed
Commit 018dafc · verified · Parent: 4c524a0

Create app.py

Files changed (1): app.py +287 -0
app.py ADDED
@@ -0,0 +1,287 @@
import gradio as gr
import torch
import torch.nn as nn
import torchaudio
import json
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import PretrainedConfig, PreTrainedModel

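# Assumed environment (not pinned in this commit): gradio>=4, where gr.Audio
# takes `sources=` rather than the older `source=`, plus torch, torchaudio,
# transformers, huggingface_hub and numpy.
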
# Model architecture (must match the training code so the checkpoint weights load)
class AudioLanguageClassifierConfig(PretrainedConfig):
    model_type = "audio-language-classifier"

    def __init__(
        self,
        num_labels=12,
        sampling_rate=16000,
        num_mel_bins=128,
        feature_size=512,
        num_transformer_layers=4,
        num_attention_heads=4,
        intermediate_size=1024,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.num_labels = num_labels
        self.sampling_rate = sampling_rate
        self.num_mel_bins = num_mel_bins
        self.feature_size = feature_size
        self.num_transformer_layers = num_transformer_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

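# Illustrative sketch: the config round-trips through JSON the same way the
# app later reads config.json from the Hub. The values are this file's
# defaults, not necessarily the published checkpoint's.
#   cfg = AudioLanguageClassifierConfig()
#   cfg2 = AudioLanguageClassifierConfig(**json.loads(cfg.to_json_string()))
#   assert cfg2.num_mel_bins == 128
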
class AudioFeatureExtractor:
    def __init__(self, config):
        self.config = config
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=config.sampling_rate,
            n_fft=1024,
            hop_length=512,
            n_mels=config.num_mel_bins
        )
        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()

    def __call__(self, audio_data, padding=True, max_length=None, truncation=True, **kwargs):
        if isinstance(audio_data, np.ndarray):
            audio_data = torch.from_numpy(audio_data)

        # Ensure a channel dimension: (num_samples,) -> (1, num_samples)
        if audio_data.ndim == 1:
            audio_data = audio_data.unsqueeze(0)

        # Convert to a log-mel spectrogram
        mel_spec = self.mel_spectrogram(audio_data)
        log_mel_spec = self.amplitude_to_db(mel_spec)

        # Normalize to zero mean, unit variance
        mean = log_mel_spec.mean()
        std = log_mel_spec.std()
        log_mel_spec = (log_mel_spec - mean) / (std + 1e-10)

        # Truncate along the time axis if requested
        if max_length is not None and truncation and log_mel_spec.shape[-1] > max_length:
            log_mel_spec = log_mel_spec[..., :max_length]

        return {"input_values": log_mel_spec}

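# Shape check (sketch): with torchaudio's default center=True, the number of
# STFT frames is n_samples // hop_length + 1. One second of 16 kHz mono audio
# (16000 samples) therefore yields 16000 // 512 + 1 = 32 frames, i.e. an
# input_values tensor of shape (1, 128, 32):
#   out = AudioFeatureExtractor(AudioLanguageClassifierConfig())(
#       np.zeros(16000, dtype=np.float32))
#   assert out["input_values"].shape == (1, 128, 32)
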
class AudioLanguageClassifier(PreTrainedModel):
    config_class = AudioLanguageClassifierConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # CNN feature extractor
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )

        # Global average pooling removes the dependency on input length
        self.global_pool = nn.AdaptiveAvgPool2d((4, 4))

        # Fixed size after global pooling: 64 channels x 4 x 4
        self.flattened_size = 64 * 4 * 4

        # Projection layer with fixed input size
        self.projection = nn.Linear(self.flattened_size, config.feature_size)

        # Transformer for sequence modeling
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.feature_size,
            nhead=config.num_attention_heads,
            dim_feedforward=config.intermediate_size,
            dropout=config.hidden_dropout_prob,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=config.num_transformer_layers
        )

        # Classification head; fall back to 12 labels if the config does not define num_labels
        num_labels = getattr(config, "num_labels", 12)
        self.classifier = nn.Linear(config.feature_size, num_labels)

    def forward(self, input_values=None, labels=None, **kwargs):
        batch_size = input_values.size(0)

        # Extract features using the CNN
        x = self.feature_extractor(input_values)

        # Apply global pooling to get a fixed size
        x = self.global_pool(x)

        # Flatten
        x = x.view(batch_size, -1)

        # Project to the transformer dimension
        x = self.projection(x)

        # Add a sequence dimension for the transformer: (batch_size, 1, feature_size)
        x = x.unsqueeze(1)

        # Transformer encoding
        x = self.transformer_encoder(x)

        # Classify from the first (and only) token representation
        x = x[:, 0, :]
        logits = self.classifier(x)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

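# Shape walk-through for a batch of spectrograms of shape (B, 1, 128, 256):
#   conv/pool x2   -> (B, 64, 32, 64)
#   adaptive pool  -> (B, 64, 4, 4)
#   flatten        -> (B, 1024)
#   projection     -> (B, 512)
#   unsqueeze      -> (B, 1, 512), a length-1 "sequence" for the transformer
#   classifier     -> (B, num_labels)
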
# Download and load the model, its configuration, and the language mappings
def load_model():
    repo_id = "hriteshMaikap/languageClassifier"

    try:
        model_path = hf_hub_download(repo_id=repo_id, filename="model.pt")
        config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
        mappings_path = hf_hub_download(repo_id=repo_id, filename="language_mappings.json")

        # Load the config
        with open(config_path, "r") as f:
            config_dict = json.load(f)

        config = AudioLanguageClassifierConfig(**config_dict)

        # Load the model weights
        model = AudioLanguageClassifier(config)
        model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
        model.eval()

        # Load the language mappings
        with open(mappings_path, "r") as f:
            mappings = json.load(f)

        id_to_language = {int(k): v for k, v in mappings["id_to_language"].items()}

        # Prefer id2label from the config when it holds real language names
        # rather than the default LABEL_0, LABEL_1, ... placeholders
        if hasattr(config, "id2label") and config.id2label:
            if not all(v == f"LABEL_{k}" for k, v in config.id2label.items()):
                id_to_language = {int(k): v for k, v in config.id2label.items()}

        return model, config, id_to_language

    except Exception as e:
        raise gr.Error(f"Failed to load the language classification model: {e}") from e

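# Expected repo layout (sketch; the language names shown are illustrative,
# not taken from the actual checkpoint):
#   model.pt                - state_dict as loaded by load_state_dict above
#   config.json             - AudioLanguageClassifierConfig as JSON
#   language_mappings.json  - e.g. {"id_to_language": {"0": "Hindi", "1": "Tamil", ...}}
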
# Prepare the feature extractor and model at startup
try:
    model, config, id_to_language = load_model()
    feature_extractor = AudioFeatureExtractor(config)
    languages = list(id_to_language.values())
except Exception as e:
    model, config, id_to_language = None, None, {}
    languages = []
    print(f"Error initializing model: {e}")

# Process one audio clip and return {language: probability} for gr.Label
def classify_language(audio):
    if model is None or config is None:
        return {"Error": 1.0}

    if audio is None:
        return {"No audio detected": 1.0}

    try:
        # Gradio delivers (sample_rate, numpy_array); stereo arrays arrive
        # with channels last, i.e. shape (num_samples, num_channels)
        sr, waveform = audio

        # Convert to a float torch tensor
        waveform = torch.tensor(waveform).float()

        # Downmix to mono and add a channel dimension: (1, num_samples)
        if waveform.ndim > 1:
            waveform = waveform.mean(dim=1)
        waveform = waveform.unsqueeze(0)

        # Resample to 16 kHz if needed
        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            waveform = resampler(waveform)

        # Extract log-mel features
        features = feature_extractor(waveform, max_length=256)
        input_values = features["input_values"]

        # Pad or truncate the time axis to a fixed 256 frames
        _, height, width = input_values.shape
        max_length = 256
        if width < max_length:
            padding = torch.zeros(1, height, max_length - width)
            input_values = torch.cat([input_values, padding], dim=2)
        elif width > max_length:
            input_values = input_values[:, :, :max_length]

        # Add an explicit batch dimension: (1, 1, n_mels, 256)
        input_values = input_values.unsqueeze(0)

        # Get the prediction
        with torch.no_grad():
            outputs = model(input_values=input_values)
            logits = outputs["logits"]
            probs = torch.nn.functional.softmax(logits, dim=1)[0]

        # Top 3 predictions (or all classes if there are fewer than 3)
        num_classes = min(3, len(id_to_language))
        top_probs, top_ids = torch.topk(probs, num_classes)

        # Format results as {language: probability}
        results = {}
        for prob, pred_id in zip(top_probs, top_ids):
            lang = id_to_language.get(pred_id.item(), f"Unknown-{pred_id.item()}")
            results[lang] = float(prob)

        return results

    except Exception as e:
        gr.Warning(f"Error processing audio: {e}")
        return {"Error processing audio": 1.0}

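# Local smoke test (sketch, assuming the model loaded): one second of silence
# should come back as a dict mapping up to three language names to
# probabilities, which is exactly the format gr.Label expects.
#   print(classify_language((16000, np.zeros(16000, dtype=np.float32))))
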
# Create the Gradio interface. gr.Audio has no "tuple" type; type="numpy"
# yields the (sample_rate, array) tuple that classify_language unpacks.
demo = gr.Interface(
    fn=classify_language,
    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
    outputs=gr.Label(num_top_classes=3),
    title="Indian Language Identification",
    description="Record or upload audio to identify the Indian language being spoken. "
                "Supported languages: "
                + (", ".join(languages) if languages else "error loading language list"),
    examples=[],
    article="""
    <div style="text-align: center;">
        <p>This model identifies various Indian languages from audio input. For best results:</p>
        <ul style="display: inline-block; text-align: left;">
            <li>Speak clearly with minimal background noise</li>
            <li>A recording length of 3-5 seconds is ideal</li>
            <li>Speak a full sentence or phrase</li>
        </ul>
        <p>Model by <a href="https://huggingface.co/hriteshMaikap/languageClassifier" target="_blank">hriteshMaikap</a></p>
    </div>
    """
)

# Launch the app
if __name__ == "__main__":
    demo.launch()
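
# When running locally, demo.launch(share=True) (a standard Gradio option)
# would additionally expose a temporary public URL; on Hugging Face Spaces
# the plain launch above is sufficient.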