Spaces:

hriteshMaikap
/

indic-languages-classifier

Build error

App Files Files Community

hriteshMaikap committed 18 days ago

Commit

6fd65bb

verified ·

1 Parent(s): ba7a495

Update app.py

Browse files

Files changed (1) hide show

app.py +264 -11

app.py CHANGED Viewed

@@ -1,20 +1,273 @@
 import gradio as gr
-from transformers import pipeline
-classifier = pipeline("audio-classification", model="hriteshMaikap/languageClassifier")
-def predict_language(audio):
-    out = classifier(audio)
-    # out is a list of dicts: [{'label': 'Hindi', 'score': 0.98}, ...]
-    return "\n".join([f"{res['label']}: {res['score']:.2f}" for res in out])
 demo = gr.Interface(
-    fn=predict_language,
-    inputs=gr.Audio(source="microphone", type="filepath"),
-    outputs="text",
-    title="Indian Language Identifier",
-    description="Record audio and classify the spoken Indian language."
 )
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+import torch
+import torch.nn as nn
+import torchaudio
+import json
+import numpy as np
+from huggingface_hub import hf_hub_download
+from transformers import PretrainedConfig, PreTrainedModel
+# Define model architecture
+class AudioLanguageClassifierConfig(PretrainedConfig):
+    model_type = "audio-language-classifier"
+    def __init__(
+        self,
+        num_labels=10,  # Changed from 12 to 10 to match the saved model
+        sampling_rate=16000,
+        num_mel_bins=128,
+        feature_size=512,
+        num_transformer_layers=4,
+        num_attention_heads=4,
+        intermediate_size=1024,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.num_labels = num_labels
+        self.sampling_rate = sampling_rate
+        self.num_mel_bins = num_mel_bins
+        self.feature_size = feature_size
+        self.num_transformer_layers = num_transformer_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+class AudioFeatureExtractor:
+    def __init__(self, config):
+        self.config = config
+        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
+            sample_rate=config.sampling_rate,
+            n_fft=1024,
+            hop_length=512,
+            n_mels=config.num_mel_bins
+        )
+        self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
+    def __call__(self, audio_data, padding=True, max_length=None, truncation=True, **kwargs):
+        if isinstance(audio_data, np.ndarray):
+            audio_data = torch.from_numpy(audio_data)
+        # Ensure it's in the expected shape
+        if audio_data.ndim == 1:
+            audio_data = audio_data.unsqueeze(0)  # Add channel dimension
+        # Convert to mel spectrogram
+        mel_spec = self.mel_spectrogram(audio_data)
+        log_mel_spec = self.amplitude_to_db(mel_spec)
+        # Normalization
+        mean = log_mel_spec.mean()
+        std = log_mel_spec.std()
+        log_mel_spec = (log_mel_spec - mean) / (std + 1e-10)
+        # Handle max length/truncation
+        if max_length is not None and truncation and log_mel_spec.shape[-1] > max_length:
+            log_mel_spec = log_mel_spec[..., :max_length]
+        return {"input_values": log_mel_spec}
+class AudioLanguageClassifier(PreTrainedModel):
+    config_class = AudioLanguageClassifierConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        # CNN feature extractor
+        self.feature_extractor = nn.Sequential(
+            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2),
+            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2)
+        )
+        # Global average pooling to eliminate size dependency
+        self.global_pool = nn.AdaptiveAvgPool2d((4, 4))
+        # Fixed size after global pooling
+        self.flattened_size = 64 * 4 * 4
+        # Projection layer with fixed input size
+        self.projection = nn.Linear(self.flattened_size, config.feature_size)
+        # Transformer for sequence modeling
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=config.feature_size,
+            nhead=config.num_attention_heads,
+            dim_feedforward=config.intermediate_size,
+            dropout=config.hidden_dropout_prob,
+            batch_first=True
+        )
+        self.transformer_encoder = nn.TransformerEncoder(
+            encoder_layer,
+            num_layers=config.num_transformer_layers
+        )
+        # Classification head
+        self.classifier = nn.Linear(config.feature_size, config.num_labels)
+    def forward(
+        self,
+        input_values=None,
+        labels=None,
+        **kwargs
+    ):
+        batch_size = input_values.size(0)
+        # Extract features using CNN
+        x = self.feature_extractor(input_values)
+        # Apply global pooling to get fixed size
+        x = self.global_pool(x)
+        # Flatten
+        x = x.view(batch_size, -1)
+        # Project to transformer dimension
+        x = self.projection(x)
+        # Add sequence dimension for transformer
+        x = x.unsqueeze(1)  # [batch_size, 1, feature_size]
+        # Transformer encoding
+        x = self.transformer_encoder(x)
+        # Classification
+        x = x[:, 0, :]  # Take first token representation
+        logits = self.classifier(x)
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(logits, labels)
+        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
+# Function to load the model and its configuration
+def load_model():
+    # Download the model files
+    repo_id = "hriteshMaikap/languageClassifier"
+    try:
+        model_path = hf_hub_download(repo_id=repo_id, filename="model.pt")
+        config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
+        mappings_path = hf_hub_download(repo_id=repo_id, filename="language_mappings.json")
+        # Load the config
+        with open(config_path, "r") as f:
+            config_dict = json.load(f)
+        # IMPORTANT: Override num_labels to 10 since the model was trained with 10 classes
+        config_dict["num_labels"] = 10
+        config = AudioLanguageClassifierConfig(**config_dict)
+        # Load the model
+        model = AudioLanguageClassifier(config)
+        model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+        model.eval()
+        # Load language mappings
+        with open(mappings_path, "r") as f:
+            mappings = json.load(f)
+        id_to_language = {int(k): v for k, v in mappings["id_to_language"].items()}
+        return model, config, id_to_language
+    except Exception as e:
+        gr.Warning(f"Error loading model: {e}")
+        # Return placeholders with error message
+        raise gr.Error(f"Failed to load the language classification model: {e}")
+# Function to process audio and make predictions
+def classify_language(audio):
+    try:
+        # Load model on first inference
+        global model, config, id_to_language, feature_extractor
+        if 'model' not in globals() or model is None:
+            model, config, id_to_language = load_model()
+            feature_extractor = AudioFeatureExtractor(config)
+        # Get audio data
+        sr, waveform = audio
+        # Convert to torch tensor
+        waveform = torch.tensor(waveform).float()
+        # Ensure mono
+        if waveform.ndim > 1 and waveform.shape[0] > 1:
+            waveform = torch.mean(waveform, dim=0, keepdim=True)
+        elif waveform.ndim == 1:
+            waveform = waveform.unsqueeze(0)
+        # Resample to 16kHz if needed
+        if sr != 16000:
+            resampler = torchaudio.transforms.Resample(sr, 16000)
+            waveform = resampler(waveform)
+        # Extract features
+        features = feature_extractor(waveform, max_length=256)
+        input_values = features["input_values"]
+        # Pad or truncate to fixed length
+        _, height, width = input_values.shape
+        max_length = 256
+        if width < max_length:
+            padding = torch.zeros(1, height, max_length - width)
+            input_values = torch.cat([input_values, padding], dim=2)
+        elif width > max_length:
+            input_values = input_values[:, :, :max_length]
+        # Get prediction
+        with torch.no_grad():
+            outputs = model(input_values=input_values)
+            logits = outputs["logits"]
+            probs = torch.nn.functional.softmax(logits, dim=1)[0]
+        # Get top predictions
+        num_classes = min(3, len(id_to_language))
+        top_probs, top_ids = torch.topk(probs, num_classes)
+        # Format results
+        results = {}
+        for i, (prob, pred_id) in enumerate(zip(top_probs, top_ids)):
+            lang = id_to_language.get(pred_id.item(), f"Unknown-{pred_id.item()}")
+            results[lang] = float(prob)
+        return results
+    except Exception as e:
+        return {"Error": 1.0, "Details": str(e)}
+# Create the Gradio interface: a single audio input (microphone or upload)
+# wired to classify_language, with the result shown in a Label component.
 demo = gr.Interface(
+    fn=classify_language,
+    # type="numpy" is used so the callback receives the clip as a
+    # (sample_rate, data) pair, which classify_language unpacks.
+    inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
+    outputs=gr.Label(num_top_classes=3),
+    title="Indian Language Identification",
+    description="Record or upload audio to identify the Indian language being spoken.",
+    examples=[],
+    article="""
+    <div style="text-align: center;">
+        <p>This model identifies various Indian languages from audio input. For best results:</p>
+        <ul style="display: inline-block; text-align: left;">
+            <li>Speak clearly with minimal background noise</li>
+            <li>Recording length of 3-5 seconds is ideal</li>
+            <li>Make sure to speak a full sentence or phrase</li>
+        </ul>
+    </div>
+    """
 )
+# Launch the app when this file is run as a script
 if __name__ == "__main__":
+    # Start with every lazily-loaded global set to None; classify_language
+    # checks `model` and loads the model, config, label map and feature
+    # extractor on the first inference.
+    model, config, id_to_language, feature_extractor = None, None, None, None
     demo.launch()