aryachakraborty committed on
Commit e8b0736 · verified · 1 Parent(s): 266070f

Upload 6 files

Files changed (6)
  1. Dockerfile +16 -0
  2. README.md +35 -12
  3. main.py +65 -0
  4. requirements.txt +5 -0
  5. templates/index.html +580 -0
  6. web_search_tool.py +5 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ COPY --chown=user . /app
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "main:app"]
README.md CHANGED
@@ -1,12 +1,35 @@
- ---
- title: Voice Assistant SmolLM2
- emoji: 🏒
- colorFrom: pink
- colorTo: blue
- sdk: docker
- pinned: false
- license: apache-2.0
- short_description: This is a voice assistant powered by smolLM2-130M-IT
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # ARIA – AI-Responsive Interactive Assistant
+
+ ARIA is an AI-powered voice assistant that provides intelligent, web-enhanced answers to user queries. Built on a lightweight Hugging Face model, it integrates real-time web search and responds in a professional tone.
+
+ ---
+
+ ## 🚀 Features
+
+ - Uses `SmolLM2-135M-Instruct` for fast, efficient responses.
+ - Integrates web context for better accuracy using a custom search tool.
+ - Hosted with Flask + Gunicorn in a Hugging Face Space (Docker-based).
+ - Clean web UI with voice interaction (frontend via `index.html`).
+
+ ---
+
+ ## 🧱 Project Structure
+
+ ├── Dockerfile            # For Hugging Face Space deployment
+ ├── main.py               # Flask application
+ ├── requirements.txt      # Python dependencies
+ ├── web_search_tool.py    # Web search context integration
+ └── templates/
+     └── index.html        # Web UI
+
+ ---
+
+ ## 🐳 Running Locally with Docker
+
+ ```bash
+ # Build the Docker image
+ docker build -t aria-assistant .
+
+ # Run the container
+ docker run -p 7860:7860 aria-assistant
+ ```
+
+ ## 🤖 Model Used
+
+ SmolLM2-135M-Instruct
+
+ ## 📄 License
+
+ This project is under the MIT License.
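
Once the container is up, the app serves the UI on port 7860. A quick end-to-end check from Python (a sketch; it assumes the container was started as above, and uses `requests`, which is not among this project's dependencies) can POST the same `query` form field the web UI submits:

```python
# Smoke-test sketch against a locally running aria-assistant container.
import requests

resp = requests.post(
    "http://localhost:7860/",
    data={"query": "What is the capital of France?"},  # form field used by main.py
    timeout=120,  # the first request can be slow while the model loads and generates
)
resp.raise_for_status()
print(resp.text[:300])  # rendered HTML; the assistant's reply is embedded in it
```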
main.py ADDED
@@ -0,0 +1,65 @@
+ ### ARIA – AI-Responsive Interactive Assistant
+
+ from flask import Flask, request, render_template
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+ from web_search_tool import web_search
+ import warnings
+ warnings.filterwarnings("ignore", message="Failed to load image Python extension")
+
+
+ app = Flask(__name__)
+
+ # Load model and tokenizer
+ checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
+
+ @app.route("/", methods=["GET", "POST"])
+ def index():
+     response = ""
+     if request.method == "POST":
+         user_query = request.form["query"]
+
+         # Get context from the web
+         try:
+             context = web_search(user_query)
+         except Exception as e:
+             context = "No additional context could be retrieved."
+             print("Web search failed:", e)
+
+         # System prompt setup with context included
+         messages = [
+             {"role": "system", "content": (
+                 "You are a voice assistant that answers in a polite and professional tone. "
+                 "Use the following context to help answer the question:\n"
+                 f"{context}\n"
+                 "If the context is insufficient, still try to give the best possible answer."
+             )},
+             {"role": "user", "content": user_query}
+         ]
+
+         input_text = tokenizer.apply_chat_template(messages, tokenize=False)
+         inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
+
+         # Note: with do_sample=False generation is greedy, so the
+         # temperature/top_p values below have no effect.
+         outputs = model.generate(
+             inputs,
+             max_new_tokens=128,
+             temperature=0.1,
+             top_p=0.9,
+             do_sample=False
+         )
+
+         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         # Extract only the assistant's reply: the decoded text contains the
+         # whole chat transcript, so split on the last "assistant" role marker
+         if "assistant" in response:
+             response = response.split("assistant")[-1].strip(": ").strip()
+         else:
+             response = "Sorry, couldn't understand your query. Can you ask again?"
+
+     return render_template("index.html", response=response)
+
+ if __name__ == "__main__":
+     app.run(debug=True)
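
One subtlety in the prompt-building step above: `apply_chat_template` is called without `add_generation_prompt=True`, so the rendered string ends right after the user turn rather than at the opening of an assistant turn. A standalone sketch (not part of the commit) of the difference:

```python
# Sketch: how add_generation_prompt changes the rendered chat prompt.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")
messages = [
    {"role": "system", "content": "You are a voice assistant."},
    {"role": "user", "content": "Hello!"},
]

# As called in main.py: the rendered prompt stops after the user message.
print(tokenizer.apply_chat_template(messages, tokenize=False))

# With add_generation_prompt=True, the template also appends the opening
# tokens of an assistant turn, which typically steers instruct models to
# answer directly instead of continuing the user's text.
print(tokenizer.apply_chat_template(messages, tokenize=False,
                                    add_generation_prompt=True))
```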
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ duckduckgo-search>7.1.1
+ gunicorn
+ transformers
+ flask
+ torch
templates/index.html ADDED
@@ -0,0 +1,580 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>AI Voice Assistant</title>
+     <script src="https://cdn.tailwindcss.com"></script>
+     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
+     <style>
+         :root {
+             --primary: #00f7ff;
+             --secondary: #7b2cbf;
+             --accent: #ff00e4;
+             --bg-dark: #0f172a;
+             --bg-darker: #0b1120;
+             --text-light: #e2e8f0;
+         }
+
+         body {
+             background-color: var(--bg-dark);
+             color: var(--text-light);
+             transition: all 0.3s ease;
+             font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+         }
+
+         .glass-effect {
+             background: rgba(15, 23, 42, 0.7);
+             backdrop-filter: blur(10px);
+             -webkit-backdrop-filter: blur(10px);
+             border: 1px solid rgba(255, 255, 255, 0.1);
+         }
+
+         .neon-glow {
+             text-shadow: 0 0 8px var(--primary),
+                          0 0 16px var(--primary);
+         }
+
+         .pulse {
+             animation: pulse 2s infinite;
+         }
+
+         @keyframes pulse {
+             0% {
+                 box-shadow: 0 0 0 0 rgba(0, 247, 255, 0.7);
+             }
+             70% {
+                 box-shadow: 0 0 0 15px rgba(0, 247, 255, 0);
+             }
+             100% {
+                 box-shadow: 0 0 0 0 rgba(0, 247, 255, 0);
+             }
+         }
+
+         .wave {
+             position: relative;
+             height: 80px;
+             width: 80px;
+             display: flex;
+             justify-content: center;
+             align-items: center;
+         }
+
+         .wave .dot {
+             display: inline-block;
+             width: 8px;
+             height: 8px;
+             border-radius: 50%;
+             margin-right: 3px;
+             background: var(--primary);
+             animation: wave 1.3s linear infinite;
+         }
+
+         .wave .dot:nth-child(2) {
+             animation-delay: -1.1s;
+         }
+
+         .wave .dot:nth-child(3) {
+             animation-delay: -0.9s;
+         }
+
+         @keyframes wave {
+             0%, 60%, 100% {
+                 transform: initial;
+             }
+             30% {
+                 transform: translateY(-15px);
+             }
+         }
+
+         .voice-btn {
+             transition: all 0.3s ease;
+             box-shadow: 0 0 20px rgba(0, 247, 255, 0.3);
+         }
+
+         .voice-btn.active {
+             background: var(--primary);
+             color: var(--bg-darker);
+             box-shadow: 0 0 30px var(--primary);
+         }
+
+         .response-text {
+             border-left: 3px solid var(--primary);
+             animation: textAppear 0.5s ease-out;
+         }
+
+         @keyframes textAppear {
+             from {
+                 opacity: 0;
+                 transform: translateY(10px);
+             }
+             to {
+                 opacity: 1;
+                 transform: translateY(0);
+             }
+         }
+
+         .floating-orb {
+             position: absolute;
+             width: 200px;
+             height: 200px;
+             border-radius: 50%;
+             background: radial-gradient(circle at 30% 30%, var(--primary), transparent 60%);
+             filter: blur(30px);
+             opacity: 0.3;
+             z-index: -1;
+             animation: float 15s infinite ease-in-out;
+         }
+
+         @keyframes float {
+             0%, 100% {
+                 transform: translate(0, 0);
+             }
+             25% {
+                 transform: translate(50px, 50px);
+             }
+             50% {
+                 transform: translate(0, 100px);
+             }
+             75% {
+                 transform: translate(-50px, 50px);
+             }
+         }
+
+         .typing-cursor {
+             display: inline-block;
+             width: 8px;
+             height: 20px;
+             background: var(--primary);
+             animation: blink 1s infinite;
+         }
+
+         @keyframes blink {
+             0%, 100% {
+                 opacity: 1;
+             }
+             50% {
+                 opacity: 0;
+             }
+         }
+     </style>
+ </head>
+ <body class="overflow-hidden">
+     <!-- Floating orbs for background effect -->
+     <div class="floating-orb" style="top: 20%; left: 10%;"></div>
+     <div class="floating-orb" style="top: 70%; left: 80%; animation-delay: 5s;"></div>
+     <div class="floating-orb" style="top: 30%; left: 60%; animation-delay: 10s; width: 150px; height: 150px;"></div>
+
+     <div class="min-h-screen flex flex-col items-center justify-center p-4">
+         <div class="glass-effect rounded-2xl p-8 w-full max-w-3xl shadow-2xl">
+             <!-- Header -->
+             <div class="text-center mb-8">
+                 <h1 class="text-4xl font-bold mb-2 neon-glow" style="color: var(--primary)">A.R.I.A</h1>
+                 <p class="text-gray-400">Your futuristic voice-controlled companion, powered by smolLM2</p>
+             </div>
+
+             <!-- Status indicator -->
+             <div class="flex justify-center mb-8">
+                 <div class="bg-gray-800 rounded-full px-4 py-2 flex items-center">
+                     <div id="statusIndicator" class="w-3 h-3 rounded-full bg-gray-500 mr-2"></div>
+                     <span id="statusText" class="text-sm">Ready</span>
+                 </div>
+             </div>
+
+             <!-- Voice visualization -->
+             <div class="flex justify-center mb-8">
+                 <div id="voiceVisualization" class="wave hidden">
+                     <div class="dot"></div>
+                     <div class="dot"></div>
+                     <div class="dot"></div>
+                 </div>
+             </div>
+
+             <!-- Response area -->
+             <div id="responseArea" class="glass-effect rounded-xl p-6 mb-8 min-h-32 {{ 'hidden' if not response }}">
+                 <div class="flex items-start">
+                     <div class="flex-shrink-0 h-10 w-10 rounded-full bg-cyan-900 flex items-center justify-center mr-3">
+                         <i class="fas fa-robot text-cyan-300"></i>
+                     </div>
+                     <div class="flex-1">
+                         <p class="font-semibold mb-2" style="color: var(--primary)">AI Response</p>
+                         <div id="responseText" class="response-text pl-4">
+                             <!-- Response will appear here -->
+                             {% if response %}
+                                 {{ response }}
+                             {% endif %}
+                         </div>
+                     </div>
+                 </div>
+             </div>
+
+             <!-- Voice button -->
+             <div class="flex justify-center">
+                 <button id="voiceButton" class="voice-btn w-24 h-24 rounded-full bg-gray-800 border-2 border-cyan-500 flex items-center justify-center text-3xl text-cyan-400 pulse">
+                     <i class="fas fa-microphone"></i>
+                 </button>
+             </div>
+
+             <!-- Instructions -->
+             <div class="mt-8 text-center text-gray-400 text-sm">
+                 <p>Press and hold the microphone button to speak</p>
+                 <p class="mt-1">Release to send your voice command</p>
+             </div>
+
+             <!-- Hidden form for Flask communication -->
+             <form id="queryForm" action="/" method="POST" class="hidden">
+                 <input type="text" id="queryInput" name="query">
+             </form>
+
+             <!-- Fallback text input -->
+             <div id="textInputFallback" class="mt-6">
+                 <div class="glass-effect rounded-xl p-4">
+                     <input type="text" id="textCommand" placeholder="Type your command instead..."
+                            class="w-full bg-gray-800 border border-cyan-700 rounded px-4 py-2 text-white">
+                     <button id="sendText" class="mt-2 bg-cyan-700 hover:bg-cyan-600 text-white py-2 px-4 rounded">
+                         Send
+                     </button>
+                 </div>
+             </div>
+         </div>
+
+         <!-- Footer -->
+         <div class="mt-8 text-center text-gray-500 text-sm">
+             <p>© 2023 AI Voice Assistant | Futuristic Interface</p>
+         </div>
+     </div>
+
+     <script>
+         // DOM Elements
+         const voiceButton = document.getElementById('voiceButton');
+         const responseArea = document.getElementById('responseArea');
+         const responseText = document.getElementById('responseText');
+         const statusIndicator = document.getElementById('statusIndicator');
+         const statusText = document.getElementById('statusText');
+         const voiceVisualization = document.getElementById('voiceVisualization');
+         const queryForm = document.getElementById('queryForm');
+         const queryInput = document.getElementById('queryInput');
+         const textInputFallback = document.getElementById('textInputFallback');
+         const textCommand = document.getElementById('textCommand');
+         const sendText = document.getElementById('sendText');
+
+         // Speech recognition setup
+         let recognition;
+         let isListening = false;
+         let finalTranscript = '';
+         let speechSynthesis = window.speechSynthesis;
+         let recognitionTimeout;
+         let recognitionRetries = 0;
+         const MAX_RETRIES = 3;
+
+         const checkSpeechSupport = () => {
+             return 'SpeechRecognition' in window || 'webkitSpeechRecognition' in window;
+         };
+
+         // Function to initialize speech recognition
+         function initSpeechRecognition() {
+             if (checkSpeechSupport()) {
+                 const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+                 recognition = new SpeechRecognition();
+                 recognition.continuous = false;
+                 recognition.interimResults = true;
+                 recognition.lang = 'en-US'; // Set language explicitly
+
+                 recognition.onstart = () => {
+                     clearTimeout(recognitionTimeout); // Clear timeout when properly started
+                     isListening = true;
+                     voiceButton.classList.add('active');
+                     statusIndicator.classList.remove('bg-gray-500', 'bg-red-500');
+                     statusIndicator.classList.add('bg-green-500');
+                     statusText.textContent = 'Listening...';
+                     voiceVisualization.classList.remove('hidden');
+                     finalTranscript = '';
+                     recognitionRetries = 0; // Reset retries counter
+                 };
+
+                 recognition.onresult = (event) => {
+                     let interimTranscript = '';
+
+                     for (let i = event.resultIndex; i < event.results.length; i++) {
+                         const transcript = event.results[i][0].transcript;
+                         if (event.results[i].isFinal) {
+                             finalTranscript += transcript;
+                         } else {
+                             interimTranscript += transcript;
+                         }
+                     }
+
+                     // Show interim results
+                     if (interimTranscript || finalTranscript) {
+                         responseArea.classList.remove('hidden');
+                         responseText.innerHTML = `<span class="text-gray-400">${interimTranscript || finalTranscript}</span>${interimTranscript ? '<span class="typing-cursor"></span>' : ''}`;
+                     }
+                 };
+
+                 recognition.onerror = (event) => {
+                     console.error('Speech recognition error', event.error);
+                     isListening = false;
+                     voiceButton.classList.remove('active');
+                     voiceVisualization.classList.add('hidden');
+                     statusIndicator.classList.remove('bg-green-500');
+                     statusIndicator.classList.add('bg-red-500');
+
+                     // Special handling for permission errors
+                     if (event.error === 'not-allowed') {
+                         statusText.textContent = 'Microphone permission denied';
+                     } else if (event.error === 'network') {
+                         statusText.textContent = 'Network error. Check your connection.';
+                     } else {
+                         statusText.textContent = 'Error: ' + event.error;
+                     }
+
+                     setTimeout(resetStatus, 3000);
+                 };
+
+                 recognition.onend = () => {
+                     clearTimeout(recognitionTimeout);
+                     isListening = false;
+                     voiceButton.classList.remove('active');
+                     voiceVisualization.classList.add('hidden');
+
+                     if (finalTranscript) {
+                         processVoiceCommand(finalTranscript);
+                     } else if (recognitionRetries < MAX_RETRIES) {
+                         // Recognition ended without results, try again
+                         recognitionRetries++;
+                         statusText.textContent = `No speech detected, retrying (${recognitionRetries}/${MAX_RETRIES})...`;
+                         setTimeout(() => {
+                             try {
+                                 recognition.start();
+                             } catch (err) {
+                                 console.error('Failed to restart recognition:', err);
+                                 resetStatus();
+                             }
+                         }, 1000);
+                     } else {
+                         statusText.textContent = 'No speech detected. Please try again.';
+                         setTimeout(resetStatus, 2000);
+                     }
+                 };
+
+                 // Button event handlers
+                 voiceButton.addEventListener('mousedown', startListening);
+                 voiceButton.addEventListener('touchstart', startListening);
+                 voiceButton.addEventListener('mouseup', stopListening);
+                 voiceButton.addEventListener('touchend', stopListening);
+                 voiceButton.addEventListener('mouseleave', stopListening);
+
+                 return true;
+             } else {
+                 console.error('Speech recognition not supported in this browser');
+                 voiceButton.disabled = true;
+                 voiceButton.innerHTML = '<i class="fas fa-microphone-slash"></i>';
+                 statusIndicator.classList.remove('bg-gray-500');
+                 statusIndicator.classList.add('bg-red-500');
+                 statusText.textContent = 'Voice not supported';
+                 return false;
+             }
+         }
+
+         // Initialize speech recognition when page loads
+         let speechInitialized = false;
+         window.addEventListener('DOMContentLoaded', () => {
+             speechInitialized = initSpeechRecognition();
+
+             // Check for existing response from Flask and speak it
+             const existingResponse = responseText.innerText.trim();
+             if (existingResponse && !existingResponse.startsWith('You said:')) {
+                 speakResponse(existingResponse);
+             }
+
+             // Try to test speech recognition without actually listening
+             if (speechInitialized) {
+                 try {
+                     // Just ping the recognition system to trigger permission requests
+                     const testRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+                     testRecognition.continuous = false;
+                     testRecognition.interimResults = false;
+                     testRecognition.maxAlternatives = 1;
+
+                     let testTimeout = setTimeout(() => {
+                         try { testRecognition.stop(); } catch(e) {}
+                     }, 1000);
+
+                     testRecognition.onstart = () => {
+                         clearTimeout(testTimeout);
+                         setTimeout(() => {
+                             try { testRecognition.stop(); } catch(e) {}
+                         }, 100);
+                     };
+
+                     testRecognition.start();
+                 } catch(e) {
+                     console.warn('Speech recognition test failed:', e);
+                 }
+             }
+         });
+
+         function startListening(e) {
+             e.preventDefault();
+             if (!isListening && recognition) {
+                 try {
+                     recognition.start();
+                     // Set timeout in case recognition doesn't trigger onstart
+                     recognitionTimeout = setTimeout(() => {
+                         if (!isListening) {
+                             console.warn("Recognition didn't start properly, retrying...");
+                             try {
+                                 recognition.stop();
+                                 setTimeout(() => {
+                                     try {
+                                         recognition.start();
+                                     } catch(err) {
+                                         console.error('Failed to restart recognition:', err);
+                                         resetStatus();
+                                     }
+                                 }, 300);
+                             } catch (err) {
+                                 console.error('Failed to stop non-started recognition:', err);
+                                 resetStatus();
+                             }
+                         }
+                     }, 2000);
+                 } catch (err) {
+                     console.error('Recognition error:', err);
+                     statusIndicator.classList.remove('bg-gray-500');
+                     statusIndicator.classList.add('bg-red-500');
+                     statusText.textContent = 'Error starting recognition';
+                     setTimeout(resetStatus, 3000);
+                 }
+             }
+         }
+
+         function stopListening(e) {
+             e.preventDefault();
+             if (recognition) {
+                 try {
+                     recognition.stop();
+                 } catch (err) {
+                     console.error('Error stopping recognition:', err);
+                 }
+             }
+         }
+
+         function resetStatus() {
+             statusIndicator.classList.remove('bg-green-500', 'bg-red-500', 'bg-yellow-500', 'bg-blue-500');
+             statusIndicator.classList.add('bg-gray-500');
+             statusText.textContent = 'Ready';
+         }
+
+         function processVoiceCommand(command) {
+             // Show recognized text
+             responseArea.classList.remove('hidden');
+             responseText.innerHTML = `<span class="text-gray-400">You said: "${command}"</span>`;
+
+             // Set query in form and submit to Flask
+             queryInput.value = command;
+
+             // Simulate AI thinking
+             statusIndicator.classList.remove('bg-green-500');
+             statusIndicator.classList.add('bg-yellow-500');
+             statusText.textContent = 'Processing...';
+
+             // Submit form to Flask backend
+             queryForm.submit();
+         }
+
+         // Text input fallback handlers
+         sendText.addEventListener('click', () => {
+             const command = textCommand.value;
+             if (command.trim()) {
+                 processVoiceCommand(command);
+                 textCommand.value = '';
+             }
+         });
+
+         textCommand.addEventListener('keypress', (e) => {
+             if (e.key === 'Enter') {
+                 const command = textCommand.value;
+                 if (command.trim()) {
+                     processVoiceCommand(command);
+                     textCommand.value = '';
+                 }
+             }
+         });
+
+         function speakResponse(text) {
+             if (speechSynthesis) {
+                 // Cancel any ongoing speech
+                 speechSynthesis.cancel();
+
+                 const utterance = new SpeechSynthesisUtterance(text);
+
+                 // Get available voices
+                 let voices = speechSynthesis.getVoices();
+
+                 // If voices array is empty, wait for the onvoiceschanged event
+                 if (voices.length === 0) {
+                     speechSynthesis.onvoiceschanged = () => {
+                         voices = speechSynthesis.getVoices();
+                         setVoiceAndSpeak();
+                     };
+                 } else {
+                     setVoiceAndSpeak();
+                 }
+
+                 function setVoiceAndSpeak() {
+                     // Find a good English voice
+                     const preferredVoice = voices.find(voice =>
+                         voice.name.includes('Google US English') ||
+                         voice.name.includes('Samantha') ||
+                         voice.name.includes('Google UK English Female') ||
+                         voice.name.includes('en-US')
+                     ) || voices.find(voice => voice.lang.includes('en')) || voices[0];
+
+                     if (preferredVoice) {
+                         utterance.voice = preferredVoice;
+                     }
+
+                     utterance.rate = 1.1;
+                     utterance.pitch = 1.1;
+
+                     // Start speaking
+                     speechSynthesis.speak(utterance);
+
+                     // Visual feedback
+                     statusIndicator.classList.remove('bg-yellow-500');
+                     statusIndicator.classList.add('bg-blue-500');
+                     statusText.textContent = 'Speaking';
+
+                     utterance.onend = () => {
+                         resetStatus();
+                     };
+                 }
+             }
+         }
+     </script>
+     <script>
+         window.addEventListener("DOMContentLoaded", () => {
+             const responseText = {{ response|tojson }};
+
+             if ('speechSynthesis' in window && responseText.trim().length > 0) {
+                 const utterance = new SpeechSynthesisUtterance(responseText);
+                 utterance.lang = 'en-US';
+
+                 // Cancel any ongoing speech
+                 window.speechSynthesis.cancel();
+
+                 // Optional: add debug listeners
+                 utterance.onend = () => console.log("✅ Speech finished");
+                 utterance.onerror = (e) => console.error("❌ Speech error:", e);
+
+                 window.speechSynthesis.speak(utterance);
+             } else {
+                 console.log("⚠️ Speech synthesis not triggered: either not supported or empty response.");
+             }
+         });
+     </script>
+
+ </body>
+ </html>
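
The second script block relies on Jinja's `tojson` filter to inline the Flask `response` string safely inside JavaScript. A small sketch (not part of the commit) of what that filter emits:

```python
# Sketch: Flask's tojson filter JSON-encodes the value and additionally
# escapes <, >, &, and ' so the literal is safe inside a <script> block.
from flask import Flask, render_template_string

app = Flask(__name__)

with app.app_context():
    rendered = render_template_string(
        "const responseText = {{ response|tojson }};",
        response='He said "hi" </script>',
    )
    print(rendered)
    # Roughly: const responseText = "He said \"hi\" \u003c/script\u003e";
```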
web_search_tool.py ADDED
@@ -0,0 +1,5 @@
+ from duckduckgo_search import DDGS
+
+ def web_search(query):
+     # Return the snippet ("body") of the top DuckDuckGo hit as context.
+     # Raises IndexError when there are no results; main.py catches that.
+     results = DDGS().text(query, max_results=1)
+     return results[0]['body']
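
Because `web_search` raises on an empty result set, main.py wraps it in a broad try/except. A slightly more defensive variant (a sketch, not in the commit; `web_search_safe` is a hypothetical name) folds that case into the function itself:

```python
# Sketch: return an empty string instead of raising when there are no hits,
# and optionally join several snippets for richer context.
from duckduckgo_search import DDGS

def web_search_safe(query: str, max_results: int = 3) -> str:
    results = DDGS().text(query, max_results=max_results)
    return " ".join(r["body"] for r in results) if results else ""
```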