Spaces:

hriteshMaikap
/

indic-languages-classifier

Build error

App Files Community

hriteshMaikap committed 28 days ago

Commit

32c9322

verified ·

1 Parent(s): 6fd65bb

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -37

app.py CHANGED Viewed

@@ -4,6 +4,9 @@ import torch.nn as nn
 import torchaudio
 import json
 import numpy as np
 from huggingface_hub import hf_hub_download
 from transformers import PretrainedConfig, PreTrainedModel
@@ -13,7 +16,7 @@ class AudioLanguageClassifierConfig(PretrainedConfig):
     def __init__(
         self,
-        num_labels=10,  # Changed from 12 to 10 to match the saved model
         sampling_rate=16000,
         num_mel_bins=128,
         feature_size=512,
@@ -158,42 +161,94 @@ def load_model():
         config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
         mappings_path = hf_hub_download(repo_id=repo_id, filename="language_mappings.json")
-        # Load the config
         with open(config_path, "r") as f:
             config_dict = json.load(f)
-        # IMPORTANT: Override num_labels to 10 since the model was trained with 10 classes
-        config_dict["num_labels"] = 10
         config = AudioLanguageClassifierConfig(**config_dict)
-        # Load the model
         model = AudioLanguageClassifier(config)
-        model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
-        model.eval()
-        # Load language mappings
-        with open(mappings_path, "r") as f:
-            mappings = json.load(f)
-        id_to_language = {int(k): v for k, v in mappings["id_to_language"].items()}
         return model, config, id_to_language
     except Exception as e:
-        gr.Warning(f"Error loading model: {e}")
-        # Return placeholders with error message
-        raise gr.Error(f"Failed to load the language classification model: {e}")
 # Function to process audio and make predictions
 def classify_language(audio):
     try:
-        # Load model on first inference
-        global model, config, id_to_language, feature_extractor
-        if 'model' not in globals() or model is None:
-            model, config, id_to_language = load_model()
-            feature_extractor = AudioFeatureExtractor(config)
         # Get audio data
         sr, waveform = audio
@@ -230,29 +285,52 @@ def classify_language(audio):
             logits = outputs["logits"]
             probs = torch.nn.functional.softmax(logits, dim=1)[0]
-        # Get top predictions
-        num_classes = min(3, len(id_to_language))
-        top_probs, top_ids = torch.topk(probs, num_classes)
-        # Format results
-        results = {}
-        for i, (prob, pred_id) in enumerate(zip(top_probs, top_ids)):
-            lang = id_to_language.get(pred_id.item(), f"Unknown-{pred_id.item()}")
-            results[lang] = float(prob)
-        return results
     except Exception as e:
-        return {"Error": 1.0, "Details": str(e)}
 # Create the Gradio interface
 demo = gr.Interface(
     fn=classify_language,
-    # Changed type from "tuple" to "numpy" to fix the error
     inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
-    outputs=gr.Label(num_top_classes=3),
     title="Indian Language Identification",
-    description="Record or upload audio to identify the Indian language being spoken.",
     examples=[],
     article="""
     <div style="text-align: center;">
@@ -262,12 +340,11 @@ demo = gr.Interface(
             <li>Recording length of 3-5 seconds is ideal</li>
             <li>Make sure to speak a full sentence or phrase</li>
         </ul>
     </div>
     """
 )
 # Launch the app
 if __name__ == "__main__":
-    # Initialize model as None to lazy-load on first inference
-    model, config, id_to_language, feature_extractor = None, None, None, None
     demo.launch()

 import torchaudio
 import json
 import numpy as np
+import matplotlib.pyplot as plt
+from io import BytesIO
+import base64
 from huggingface_hub import hf_hub_download
 from transformers import PretrainedConfig, PreTrainedModel
     def __init__(
         self,
+        num_labels=10,
         sampling_rate=16000,
         num_mel_bins=128,
         feature_size=512,
         config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
         mappings_path = hf_hub_download(repo_id=repo_id, filename="language_mappings.json")
+        # Load language mappings first to get correct number of labels
+        with open(mappings_path, "r") as f:
+            mappings = json.load(f)
+        id_to_language = {int(k): v for k, v in mappings["id_to_language"].items()}
+        num_languages = len(id_to_language)
+        # Load the config with correct number of labels
         with open(config_path, "r") as f:
             config_dict = json.load(f)
+        config_dict["num_labels"] = num_languages
         config = AudioLanguageClassifierConfig(**config_dict)
+        # Create the model
         model = AudioLanguageClassifier(config)
+        # Load and adapt state dict as needed
+        state_dict = torch.load(model_path, map_location=torch.device('cpu'))
+        # Fix classifier weights and biases if needed
+        if 'classifier.weight' in state_dict and state_dict['classifier.weight'].size(0) != config.num_labels:
+            print(f"Adjusting classifier size from {state_dict['classifier.weight'].size(0)} to {config.num_labels}")
+            old_size = state_dict['classifier.weight'].size(0)
+            # Create new classifier layer with correct size
+            new_classifier = nn.Linear(config.feature_size, config.num_labels)
+            # Copy weights and biases for available classes
+            with torch.no_grad():
+                # Copy weights for the classes we have
+                new_classifier.weight.data[:old_size, :] = state_dict['classifier.weight']
+                new_classifier.bias.data[:old_size] = state_dict['classifier.bias']
+            # Update state dict with new weights
+            state_dict['classifier.weight'] = new_classifier.weight.data
+            state_dict['classifier.bias'] = new_classifier.bias.data
+        # Load the updated state dict
+        model.load_state_dict(state_dict)
+        model.eval()
         return model, config, id_to_language
     except Exception as e:
+        print(f"Error loading model: {e}")
+        import traceback
+        traceback.print_exc()
+        raise gr.Error(f"Failed to load the language classification model: {str(e)}")
+# Function to create a bar chart visualization
+def create_confidence_chart(probs, id_to_language):
+    plt.figure(figsize=(10, 5))
+    languages = [id_to_language[i] for i in range(len(id_to_language))]
+    # Sort by confidence score
+    indices = np.argsort(probs)[::-1]
+    sorted_languages = [languages[i] for i in indices]
+    sorted_confidences = [probs[i] for i in indices]
+    # Use a colormap - highest confidence gets different color
+    colors = ['#1f77b4'] * len(sorted_languages)
+    colors[0] = '#ff7f0e'  # Highlight the top prediction
+    plt.bar(sorted_languages, sorted_confidences, color=colors)
+    plt.xticks(rotation=45, ha='right')
+    plt.title('Language Detection Confidence')
+    plt.xlabel('Language')
+    plt.ylabel('Confidence')
+    plt.tight_layout()
+    # Save plot to a bytes buffer
+    buf = BytesIO()
+    plt.savefig(buf, format='png')
+    plt.close()
+    buf.seek(0)
+    # Convert to base64 string for HTML embedding
+    img_str = base64.b64encode(buf.read()).decode('utf-8')
+    return f"<img src='data:image/png;base64,{img_str}' alt='Confidence Chart'>"
 # Function to process audio and make predictions
 def classify_language(audio):
+    if audio is None:
+        return {"No audio detected": 1.0}, "Please record or upload audio to analyze."
     try:
         # Get audio data
         sr, waveform = audio
             logits = outputs["logits"]
             probs = torch.nn.functional.softmax(logits, dim=1)[0]
+            # Only consider valid language indices
+            valid_indices = list(range(len(id_to_language)))
+            valid_probs = probs[valid_indices].cpu().numpy()
+            # Generate the confidence visualization
+            chart_html = create_confidence_chart(valid_probs, id_to_language)
+            # Get top 3 predictions (or all if fewer than 3)
+            num_classes = min(3, len(id_to_language))
+            top_indices = np.argsort(valid_probs)[::-1][:num_classes]
+            # Format results
+            results = {}
+            for idx in top_indices:
+                lang = id_to_language.get(idx, f"Unknown-{idx}")
+                results[lang] = float(valid_probs[idx])
+        return results, chart_html
     except Exception as e:
+        import traceback
+        traceback.print_exc()
+        return {"Error": 1.0}, f"<p>Error processing audio: {str(e)}</p>"
+# Initialize model and feature extractor
+try:
+    model, config, id_to_language = load_model()
+    feature_extractor = AudioFeatureExtractor(config)
+    languages = list(id_to_language.values())
+    print(f"Model loaded successfully. Found {len(languages)} languages: {languages}")
+except Exception as e:
+    print(f"Error initializing model: {e}")
+    model, config, id_to_language, feature_extractor = None, None, None, None
+    languages = []
 # Create the Gradio interface
 demo = gr.Interface(
     fn=classify_language,
     inputs=gr.Audio(sources=["microphone", "upload"], type="numpy"),
+    outputs=[
+        gr.Label(num_top_classes=3),
+        gr.HTML(label="Confidence Chart")
+    ],
     title="Indian Language Identification",
+    description="Record or upload audio to identify the Indian language being spoken. " +
+                f"Supported languages: {', '.join(languages) if languages else 'Error loading language list'}",
     examples=[],
     article="""
     <div style="text-align: center;">
             <li>Recording length of 3-5 seconds is ideal</li>
             <li>Make sure to speak a full sentence or phrase</li>
         </ul>
+        <p>Model by <a href="https://huggingface.co/hriteshMaikap/languageClassifier" target="_blank">hriteshMaikap</a></p>
     </div>
     """
 )
 # Launch the app
 if __name__ == "__main__":
     demo.launch()