Spaces:
Sleeping
Sleeping
Upload 17 files
Browse files- Dockerfile +13 -0
- convert_text_to_speech.py +31 -0
- generate_description.py +46 -0
- main.py +20 -0
- narrate_description.py +67 -0
- requirements.txt +7 -0
- static/assets/android-chrome-192x192.png +0 -0
- static/assets/android-chrome-512x512.png +0 -0
- static/assets/apple-touch-icon.png +0 -0
- static/assets/favicon-16x16.png +0 -0
- static/assets/favicon-32x32.png +0 -0
- static/assets/favicon.ico +0 -0
- static/assets/logo.png +0 -0
- static/assets/site.webmanifest +1 -0
- static/css/style.css +199 -0
- static/js/main.js +248 -0
- templates/main.html +39 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Slim Python base keeps the image small; 3.11 matches the app's runtime.
FROM python:3.11-slim

WORKDIR /code

# Copy requirements first so dependency installation is cached
# independently of application-code changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application source after dependencies for better layer caching.
COPY . /code

# 7860 is the port Hugging Face Spaces expects the app to listen on.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
convert_text_to_speech.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from elevenlabs.client import ElevenLabs
|
2 |
+
import os
|
3 |
+
import httpx
|
4 |
+
import time
|
5 |
+
|
6 |
+
ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
|
7 |
+
|
8 |
+
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
|
9 |
+
|
10 |
+
async def convert_text_to_speech(text, selected_voice_id):
    """Stream MP3 audio for *text* spoken by the given ElevenLabs voice.

    Async generator: yields raw MP3 byte chunks as they arrive from the
    ElevenLabs streaming text-to-speech endpoint.

    Args:
        text: The text to synthesise.
        selected_voice_id: ElevenLabs voice id to use.

    Yields:
        bytes: MP3 audio chunks (44.1 kHz / 128 kbps per the request body).

    On any error the exception is logged and the generator simply stops,
    so callers see an empty or truncated stream rather than a raised error.
    """
    try:
        async with httpx.AsyncClient() as http_client:
            # Use a true streaming request: a plain .post() buffers the
            # entire response body before aiter_bytes() can run, which
            # defeats the purpose of the /stream endpoint. client.stream()
            # yields chunks as they arrive over the wire.
            async with http_client.stream(
                "POST",
                f"https://api.elevenlabs.io/v1/text-to-speech/{selected_voice_id}/stream",
                json={
                    "model_id": "eleven_monolingual_v1",
                    "text": text,
                    "output_format": "mp3_44100_128"
                },
                headers={
                    "Content-Type": "application/json",
                    "xi-api-key": ELEVENLABS_API_KEY
                },
                timeout=None
            ) as response:
                # Surface HTTP errors (bad key, bad voice id) instead of
                # silently yielding an error-JSON body as "audio".
                response.raise_for_status()
                async for chunk in response.aiter_bytes():
                    print(f"Received chunk: {len(chunk)} bytes at {time.time()}")
                    yield chunk
    except Exception as e:
        print(f"Error during text-to-speech conversion: {e}")
generate_description.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import asyncio
|
3 |
+
from anthropic import AsyncAnthropic
|
4 |
+
|
5 |
+
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
|
6 |
+
|
7 |
+
async def generate_description(image_data, selected_voice_name, description_history):
    """Stream a short, humorous description of a base64-encoded JPEG image.

    Async generator yielding text chunks from Claude as they arrive.

    Args:
        image_data: Base64 JPEG payload (no data-URL prefix).
        selected_voice_name: Persona the model is asked to impersonate.
        description_history: Currently unused; kept for interface
            compatibility with callers that pass the running narration
            history.

    Yields:
        str: Text fragments of the description; on failure, the single
        fallback string "Error generating description.".
    """
    client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
    try:
        system_prompt = f"You are {selected_voice_name} and you must describe the image you are given using your unique phrases in a humorous way in 15 words or less. Please use only raw text without any special formatting characters like asterisks."

        print("System prompt:", system_prompt)

        async with client.messages.stream(
            model="claude-3-haiku-20240307",
            max_tokens=100,
            temperature=1,
            system=system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": image_data
                            }
                        },
                        {
                            "type": "text",
                            "text": f"As {selected_voice_name} describe this image in a humorous way in 15 words or less"
                        }
                    ]
                }
            ]
        ) as stream:
            # Note: the original also accumulated chunks into a local
            # `description` string that was never read — dead code, removed.
            async for event in stream.text_stream:
                print(event)
                yield event
    except Exception as e:
        print(f"Error generating description: {e}")
        yield "Error generating description."
|
main.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
load_dotenv()
|
3 |
+
from fastapi import FastAPI, Request
|
4 |
+
from fastapi.responses import FileResponse, HTMLResponse
|
5 |
+
from fastapi.staticfiles import StaticFiles
|
6 |
+
from narrate_description import router as narrate_description_router
|
7 |
+
|
8 |
+
|
9 |
+
app = FastAPI()

# WebSocket endpoint (/narrate) lives in narrate_description.py.
app.include_router(narrate_description_router)

# Serve CSS/JS/favicon assets from ./static.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Templates are served as plain static files (no server-side rendering).
app.mount("/templates", StaticFiles(directory="templates"), name="templates")


@app.get("/", response_class=HTMLResponse)
async def get_root(request: Request):
    """Serve the single-page UI at the site root."""
    return FileResponse('templates/main.html')
|
narrate_description.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter, WebSocket
|
2 |
+
import json
|
3 |
+
from generate_description import generate_description
|
4 |
+
from convert_text_to_speech import convert_text_to_speech
|
5 |
+
import re
|
6 |
+
import asyncio
|
7 |
+
|
8 |
+
router = APIRouter()

# Accumulated finished descriptions; module-level, shared across all
# connections and unbounded for the process lifetime.
description_history = []


# @router.websocket is the current FastAPI decorator; websocket_route is
# the deprecated Starlette-era spelling.
@router.websocket("/narrate")
async def websocket_narrate(websocket: WebSocket):
    """Receive base64 JPEG frames over a WebSocket and stream narration back.

    Protocol: the client sends JSON {image, voiceId, voiceName, pictureCount};
    the server replies with JSON "text_chunk" frames as the description is
    generated, followed by binary MP3 frames of the narration. Sending the
    literal string "close" ends the session.
    """
    await websocket.accept()
    print("WebSocket connection accepted.")
    try:
        while True:
            data = await websocket.receive_text()
            if data == "close":
                print("Closing WebSocket connection.")
                break

            data_json = json.loads(data)
            image_data = data_json.get('image')
            selected_voice_id = data_json.get('voiceId')
            selected_voice_name = data_json.get('voiceName')
            if image_data:
                print(f"Image data received, sending to {selected_voice_name} model for analysis.")
                description_accumulator = ""
                # Despite the name, this matches only "*" — the model is told
                # not to emit asterisks, so in practice the full description
                # usually flushes in the post-loop branch below.
                punctuation_pattern = re.compile(r"[*]")

                async for description_chunk in generate_description(image_data, selected_voice_name, description_history):
                    if description_chunk:
                        # Accumulate the chunk, ensuring not to break on single punctuation marks
                        if not punctuation_pattern.fullmatch(description_chunk.strip()):
                            description_accumulator += description_chunk
                        else:
                            description_accumulator += " " + description_chunk

                        # Send each text chunk to the frontend
                        await websocket.send_text(json.dumps({"type": "text_chunk", "data": description_chunk, "pictureCount": data_json.get('pictureCount'), "voiceName": selected_voice_name}))

                        # If the chunk contains the delimiter, convert and stream it.
                        if punctuation_pattern.search(description_chunk):
                            # Stream audio frames sequentially, in order.
                            # (The original gathered all send_bytes()
                            # coroutines with asyncio.gather, which buffered
                            # the whole clip first and sent concurrently,
                            # risking out-of-order frames.)
                            async for audio_chunk in convert_text_to_speech(description_accumulator.strip(), selected_voice_id):
                                await websocket.send_bytes(audio_chunk)
                            # Append the fully accumulated description to the history
                            description_history.append(description_accumulator.strip())
                            description_accumulator = ""

                # If there is any remaining text after the loop, send it for conversion too
                if description_accumulator:
                    async for audio_chunk in convert_text_to_speech(description_accumulator.strip(), selected_voice_id):
                        await websocket.send_bytes(audio_chunk)
                    # Append the remaining accumulated description to the history
                    description_history.append(description_accumulator.strip())

                print("Finished processing image data.")
            else:
                print("No image data received, sending error message to client.")
                await websocket.send_text("No image data received.")

        print("WebSocket connection closed.")
    except Exception as e:
        print(f"Error during WebSocket communication: {e}")
    finally:
        await websocket.close()
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.110.0
|
2 |
+
uvicorn==0.27.1
|
3 |
+
httpx==0.27.0
|
4 |
+
python-dotenv==1.0.1
|
5 |
+
aiohttp==3.9.3
|
6 |
+
elevenlabs==1.0.0b1
|
7 |
+
anthropic==0.20.0
|
static/assets/android-chrome-192x192.png
ADDED
![]() |
static/assets/android-chrome-512x512.png
ADDED
![]() |
static/assets/apple-touch-icon.png
ADDED
![]() |
static/assets/favicon-16x16.png
ADDED
![]() |
static/assets/favicon-32x32.png
ADDED
![]() |
static/assets/favicon.ico
ADDED
|
static/assets/logo.png
ADDED
![]() |
static/assets/site.webmanifest
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"name":"","short_name":"","icons":[{"src":"/static/assets/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/static/assets/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}
|
static/css/style.css
ADDED
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* ---- Page layout: vertically stacked, centered single-page app ---- */
body {
    font-family: Arial;
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    height: 100vh;
    margin: 0;
    background-color: #f0f0f0;
}

/* Live camera preview (fixed 640x480, black while no stream is attached) */
#camera-feed {
    border: 3px solid #333;
    width: 640px;
    height: 480px;
    background-color: #000;
}

/* ---- Voice picker: wrapping row of buttons ---- */
#voice-selection {
    margin: 20px 0;
    display: flex;
    flex-wrap: wrap;
    justify-content: center;
}

#voice-selection button {
    padding: 10px 20px;
    margin: 5px;
    background-color: #007bff;
    color: white;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s ease;
}

#voice-selection button:hover {
    background-color: #0056b3;
}

#voice-selection button.selected {
    background-color: #0056b3; /* Darker blue to indicate selection */
    color: #fff;
}

#voice-selection .voice-btn {
    font-weight: bold;
}

/* "Single Narrate" action button (green) */
#start-btn {
    padding: 10px 20px;
    font-size: 16px;
    font-weight: bold;
    background-color: #28a745;
    color: white;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s ease;
}

#start-btn:hover {
    background-color: #218838;
}

/* ---- Narration transcript / status area ---- */
#feedback {
    max-width: 1500px;
    margin-top: 20px;
    padding-left: 20px;
    padding-right: 20px;
    text-align: left;
    max-height: 100px;
    overflow-y: auto;
}

#feedback img {
    max-width: 640px;
    max-height: 480px;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 5px;
}

#feedback p {
    margin-top: 10px;
    color: #333;
}

/* ---- Toggle switch (checkbox styled as a sliding on/off control) ---- */
.switch {
    position: relative;
    display: inline-block;
    width: 60px;
    height: 34px;
}

/* Hide the native checkbox; the .slider span is the visible control */
.switch input {
    opacity: 0;
    width: 0;
    height: 0;
}

.slider {
    position: absolute;
    cursor: pointer;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background-color: #ccc;
    -webkit-transition: .4s;
    transition: .4s;
    border-radius: 34px; /* Makes the slider rounded */
}

input:checked + .slider {
    background-color: #4CAF50; /* Green color when enabled */
}

input:not(:checked) + .slider {
    background-color: #f44336; /* Red color when disabled */
}

/* The moving knob inside the switch */
.slider:before {
    position: absolute;
    content: "";
    height: 26px;
    width: 26px;
    left: 4px; /* Initial position */
    bottom: 4px;
    background-color: white;
    -webkit-transition: .4s;
    transition: .4s;
    border-radius: 50%; /* Keeps the circle inside the slider rounded */
}

input:checked + .slider:before {
    -webkit-transform: translateX(26px);
    -ms-transform: translateX(26px);
    transform: translateX(26px); /* Slide to the right */
}

#picture-counter {
    font-size: 18px;
    color: #333;
}

/* ---- Horizontal thumbnail strip of captured frames ---- */
#captured-images {
    display: flex;
    overflow-x: auto;
    margin-top: 20px;
    max-width: 600px; /* Set the maximum width */
    white-space: nowrap; /* Keep images in a single line */
    scroll-snap-type: x mandatory; /* Enable scroll snap along the x-axis and make it mandatory */
}

#captured-images .image-wrapper {
    flex: 0 0 100px; /* Do not grow, do not shrink, base width of 100px */
    margin-right: 5px;
    position: relative;
    display: inline-flex; /* Use inline-flex to keep the wrapper inline */
    scroll-snap-align: start; /* Optional: Enhances the scrolling experience */
}

#captured-images .image-wrapper img {
    max-width: 100%; /* Ensure images do not exceed the width of their wrappers */
    height: auto; /* Maintain aspect ratio */
}

/* Picture-number badge overlaid in the corner of each thumbnail */
#captured-images .image-wrapper::after {
    content: attr(data-picture-number);
    position: absolute;
    bottom: 0;
    right: 0;
    background-color: rgba(0, 0, 0, 0.75); /* Make it darker for better visibility */
    color: white;
    padding: 2px 5px;
    font-size: 12px; /* Adjust font size as needed */
    z-index: 10; /* Increase z-index to ensure it's above the image */
}

.error {
    color: red;
}

#toggle-camera-btn {
    padding: 10px 20px;
    font-size: 16px;
    font-weight: bold;
    background-color: #ff9800; /* Orange color for visibility */
    color: white;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s ease;
}

#toggle-camera-btn:hover {
    background-color: #e68900; /* Darker shade of orange on hover */
}
|
static/js/main.js
ADDED
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// <video> element showing the live camera preview.
const cameraFeedElement = document.getElementById('camera-feed');
// WebSocket to the /narrate backend; assigned by initWebSocket().
let ws;
// Active MediaStream from getUserMedia, or null before the first camera starts.
let currentStream = null;
// Index into allCameras of the camera currently in use.
let currentDeviceIndex = 0;
// All video-input devices discovered by getCameras().
let allCameras = [];
|
6 |
+
|
7 |
+
// Stop every track of the active camera stream (no-op when none is active),
// releasing the device so another camera can be opened.
function stopCurrentVideoStream() {
    if (!currentStream) {
        return;
    }
    for (const track of currentStream.getTracks()) {
        track.stop();
    }
}
|
12 |
+
|
13 |
+
// Enumerate the available video-input devices and, when at least one
// exists, start the first camera via switchCamera().
function getCameras() {
    navigator.mediaDevices.enumerateDevices()
        .then((devices) => {
            const videoInputs = devices.filter((device) => device.kind === 'videoinput');
            allCameras = videoInputs;
            if (allCameras.length > 0) {
                switchCamera(); // Initialize the first camera
            }
        })
        .catch((err) => console.error("Could not get cameras:", err));
}
|
23 |
+
|
24 |
+
// Stop the current stream and open the next camera in allCameras,
// requesting a 640x480 (ideal) feed and attaching it to the preview video.
function switchCamera() {
    stopCurrentVideoStream();
    // Advance to the next device, wrapping around.
    // NOTE(review): because this increments BEFORE use, the very first call
    // (from getCameras) starts with index 1 when multiple cameras exist,
    // skipping camera 0 — confirm whether that is intended.
    currentDeviceIndex = (currentDeviceIndex + 1) % allCameras.length;
    const deviceId = allCameras[currentDeviceIndex].deviceId;
    const constraints = {
        video: {
            deviceId: deviceId,
            width: { ideal: 640 },   // ideal, not exact: browser may deliver another size
            height: { ideal: 480 }
        }
    };

    navigator.mediaDevices.getUserMedia(constraints)
        .then(stream => {
            currentStream = stream;
            cameraFeedElement.srcObject = stream;
        })
        .catch(error => {
            console.error("Could not switch camera:", error);
            console.error("Error name: ", error.name);
            console.error("Error message: ", error.message);
            handleCameraError(error);
        });
}
|
48 |
+
|
49 |
+
// Translate a getUserMedia failure into a user-facing alert; for
// over-constrained requests, retry once with browser-default settings.
function handleCameraError(error) {
    switch (error.name) {
        case 'NotAllowedError':
            alert('Camera access was denied. Please allow camera access for this site.');
            break;
        case 'NotFoundError':
            alert('No camera found. Please ensure a camera is properly connected or integrated.');
            break;
        case 'NotReadableError':
            alert('Camera is currently being used by another application. Please close that application and try again.');
            break;
        case 'OverconstrainedError':
            alert('No camera matches the requested constraints. Trying default settings...');
            fallbackToDefaultCamera();
            break;
        default:
            alert('An unknown error occurred when trying to access the camera.');
    }
}
|
63 |
+
|
64 |
+
// Last-resort camera acquisition: ask for any video stream with the
// browser's default settings and attach it to the preview element.
function fallbackToDefaultCamera() {
    navigator.mediaDevices
        .getUserMedia({ video: true }) // Use default settings
        .then((stream) => {
            currentStream = stream;
            cameraFeedElement.srcObject = stream;
        })
        .catch((err) => {
            console.error("Could not access default camera:", err);
        });
}
|
77 |
+
|
78 |
+
// Kick off camera discovery (and start the first camera) at page load.
getCameras();

// Manual camera cycling for multi-camera devices (e.g. phone front/back).
document.getElementById('toggle-camera-btn').addEventListener('click', switchCamera);
|
81 |
+
|
82 |
+
// FIFO of MP3 Blobs waiting to be played; clips play one at a time.
let audioQueue = [];
// True while a clip is playing, so new arrivals queue instead of overlapping.
let isPlaying = false;

// Enqueue a binary audio frame received from the server and start
// playback if nothing is currently playing.
function playAudio(arrayBuffer) {
    console.log("Attempting to play audio", arrayBuffer);
    const blob = new Blob([arrayBuffer], { type: 'audio/mp3' });
    audioQueue.push(blob);
    if (!isPlaying) {
        playNextAudio();
    }
}

// Play the next queued clip, chaining to the following one when it ends.
function playNextAudio() {
    if (audioQueue.length > 0) {
        isPlaying = true;
        const url = URL.createObjectURL(audioQueue.shift());
        const audio = new Audio(url);
        // Attach 'ended' before play() so the handler cannot be missed,
        // and revoke the object URL once done — the original never revoked,
        // leaking one blob URL per clip for the page's lifetime.
        audio.addEventListener('ended', () => {
            URL.revokeObjectURL(url);
            playNextAudio();
        });
        audio.play().catch(e => {
            console.error("Error playing audio:", e);
            URL.revokeObjectURL(url);
            isPlaying = false;
            playNextAudio();
        });
    } else {
        isPlaying = false;
    }
}
|
110 |
+
|
111 |
+
// Display name of the selected narrator (sent to the backend with each frame).
let selectedVoiceName = "Daniel Attenborough";
// ElevenLabs voice id of the selected narrator; undefined until the user
// picks one. Declared up front — the original declared it AFTER the click
// handlers were wired, so selectVoice assigned a let-binding before its
// declaration line had run (a temporal-dead-zone hazard).
let selectedVoiceId;

// Click handler for a voice button: record the chosen voice, highlight
// the button, and clear the "select a voice" warning if it is showing.
function selectVoice() {
    selectedVoiceId = this.getAttribute('data-voice-id');
    selectedVoiceName = this.getAttribute('data-voice-name');
    document.querySelectorAll('.voice-btn').forEach(btn => btn.classList.remove('selected'));
    this.classList.add('selected');

    // Check if the current feedback is the voice selection warning before clearing
    const feedbackElement = document.getElementById('feedback');
    if (feedbackElement.textContent === 'Please select a voice before narrating.') {
        feedbackElement.textContent = ''; // Clear the warning message
    }
    feedbackElement.classList.remove('error'); // Remove the error class if present
}

document.querySelectorAll('.voice-btn').forEach(btn => {
    btn.addEventListener('click', selectVoice);
});
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
// Grab one frame from the camera, show it in the thumbnail strip, and send
// it (base64 JPEG) to the backend over the WebSocket for narration.
function captureAndAnalyseImage() {
    // Guard: a voice must be chosen first; show an inline warning otherwise.
    if (!selectedVoiceId) {
        const feedbackElement = document.getElementById('feedback');
        feedbackElement.textContent = 'Please select a voice before narrating.';
        feedbackElement.classList.add('error');
        return;
    }

    // Snapshot the current video frame onto an off-screen canvas.
    const canvas = document.createElement('canvas');
    canvas.width = cameraFeedElement.videoWidth;
    canvas.height = cameraFeedElement.videoHeight;
    const ctx = canvas.getContext('2d');
    ctx.drawImage(cameraFeedElement, 0, 0, canvas.width, canvas.height);
    const imageDataUrl = canvas.toDataURL('image/jpeg');

    pictureCount++;
    document.getElementById('picture-counter').textContent = `Pictures taken: ${pictureCount}`;

    const capturedImagesContainer = document.getElementById('captured-images');
    const imgWrapper = document.createElement('div'); // Create a wrapper div for the image
    imgWrapper.classList.add('image-wrapper'); // Add class for styling
    imgWrapper.setAttribute('data-picture-number', `Picture ${pictureCount}`); // Set the picture number

    const imgElement = document.createElement('img');
    imgElement.src = imageDataUrl;
    imgWrapper.appendChild(imgElement); // Append the image to the wrapper
    capturedImagesContainer.appendChild(imgWrapper); // Append the wrapper to the container

    // Scroll to the latest image
    capturedImagesContainer.scrollLeft = capturedImagesContainer.scrollWidth;

    if (ws && ws.readyState === WebSocket.OPEN) {
        // split(',')[1] drops the "data:image/jpeg;base64," prefix so the
        // server receives bare base64.
        ws.send(JSON.stringify({ image: imageDataUrl.split(',')[1], voiceId: selectedVoiceId, voiceName: selectedVoiceName, pictureCount: pictureCount }));
    } else {
        console.error("WebSocket is not open.");
    }
}
|
172 |
+
|
173 |
+
// Initialise WebSocket connection and event handlers
|
174 |
+
// Initialise WebSocket connection and event handlers.
// Text frames carry JSON "text_chunk" messages (transcript); binary frames
// carry MP3 audio handed to playAudio().
function initWebSocket() {
    // Match the page's scheme: ws:// over http, wss:// over https.
    // (The original hard-coded wss://, which fails on plain-http local dev.)
    const wsScheme = window.location.protocol === 'https:' ? 'wss' : 'ws';
    const wsUrl = `${wsScheme}://${window.location.host}/narrate`;
    console.log(wsUrl);
    ws = new WebSocket(wsUrl);
    ws.binaryType = 'arraybuffer'; // Important for audio data

    ws.onopen = () => {
        console.log("WebSocket connection opened.");
        // Now safe to send messages
    };

    ws.onmessage = (event) => {
        if (typeof event.data === "string") {
            const message = JSON.parse(event.data);
            if (message.type === "text_chunk") {
                let feedbackElement = document.getElementById('feedback');
                // One <p> per picture: reuse it if this picture already has one.
                let p = document.querySelector(`p[data-picture-count="${message.pictureCount}"]`);
                if (!p) {
                    p = document.createElement('p');
                    const timestamp = new Date().toLocaleTimeString();
                    p.setAttribute('data-picture-count', message.pictureCount);
                    p.innerHTML = `<strong>[${timestamp}] [Picture ${message.pictureCount}] [${message.voiceName}]</strong> `;
                    feedbackElement.appendChild(p);
                }
                // NOTE(review): model text is inserted via innerHTML — any
                // markup in the description would be interpreted as HTML.
                p.innerHTML += `${message.data}`;
                feedbackElement.scrollTop = feedbackElement.scrollHeight;
            }
        } else {
            playAudio(event.data); // binary frame = MP3 audio chunk
        }
    };

    ws.onerror = (error) => {
        console.error("WebSocket error:", error);
    };

    ws.onclose = () => {
        console.log("WebSocket connection closed.");
    };
}
|
213 |
+
|
214 |
+
// Number of pictures captured so far (displayed in the counter; declared
// before any handler that increments it can run).
let pictureCount = 0;

// Add event listener to the start button for capturing and analysing the image.
// NOTE: the original file registered this listener AND called initWebSocket()
// twice, which captured two frames per click and opened a second, leaked
// WebSocket — each is now done exactly once.
document.getElementById('start-btn').addEventListener('click', captureAndAnalyseImage);

// Initialise WebSocket connection
initWebSocket();

let continuousNarrationInterval; // Holds the interval ID for continuous narration

// Continuous-narration toggle: capture immediately, then every 5 seconds
// while checked; stop the interval when unchecked.
document.getElementById('continuous-narrate-toggle').addEventListener('change', function() {
    if (this.checked) {
        if (!selectedVoiceId) {
            document.getElementById('feedback').textContent = 'Please select a voice before narrating.';
            document.getElementById('feedback').classList.add('error');
            this.checked = false;
            return;
        }
        captureAndAnalyseImage(); // Send the first image immediately
        if (!continuousNarrationInterval) {
            continuousNarrationInterval = setInterval(captureAndAnalyseImage, 5000); // 5-second delay for subsequent images
        }
    } else {
        if (continuousNarrationInterval) {
            clearInterval(continuousNarrationInterval);
            continuousNarrationInterval = null;
        }
    }
});
|
templates/main.html
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>AI Image Narrator</title>
    <link rel="stylesheet" href="/static/css/style.css">
    <link rel="apple-touch-icon" sizes="180x180" href="/static/assets/apple-touch-icon.png">
    <link rel="icon" type="image/png" sizes="32x32" href="/static/assets/favicon-32x32.png">
    <link rel="icon" type="image/png" sizes="16x16" href="/static/assets/favicon-16x16.png">
    <link rel="manifest" href="/static/assets/site.webmanifest">
</head>
<body>
    <!-- Live camera preview; main.js attaches the MediaStream. -->
    <video id="camera-feed" autoplay></video>
    <!-- Horizontal strip of captured thumbnails, filled by main.js. -->
    <div id="captured-images" style="display: flex; overflow-x: auto; margin-top: 20px;"></div>
    <!-- Narrator picker: data-voice-id is the ElevenLabs voice id,
         data-voice-name the persona sent to the model. The visible labels
         are deliberately altered parodies of the data-voice-name values. -->
    <div id="voice-selection">
        <button class="voice-btn" data-voice-id="0SQfBfjRCI4jQdnyrF5B" data-voice-name="Michael Caine">Michael Kaine π©</button>
        <button class="voice-btn" data-voice-id="4c42HvUOZ0L0feAu3r5C" data-voice-name="David Attenborough">Daniel Attenborough π</button>
        <button class="voice-btn" data-voice-id="DFtRVeaAE1d7V4uhxFcF" data-voice-name="Stephen Fry">Stephon Fry π</button>
        <button class="voice-btn" data-voice-id="K8sG6kT7jA4WnERxh8vd" data-voice-name="Morgan Freeman">Morgan Free π€</button>
        <button class="voice-btn" data-voice-id="WiXK0UI5GPQ98IYxy8he" data-voice-name="Joanna Lumley">Johanna Lumly π</button>
        <button class="voice-btn" data-voice-id="bnvSNcvmOz9I0VhuOh58" data-voice-name="John Cleese">Jon Cheese π§</button>
        <button class="voice-btn" data-voice-id="g5Qp5bT7Dm1TIJecJuds" data-voice-name="Judi Dench">Judy Drench π</button>
        <button class="voice-btn" data-voice-id="w642gnqphLNLyM1zH2eI" data-voice-name="Richard Hammond">Richard Hamed π</button>
    </div>
    <div id="picture-counter">Pictures taken: 0</div>
    <!-- Controls row: single capture, continuous toggle, camera switch. -->
    <div style="display: flex; justify-content: center; align-items: center; gap: 10px; margin-top: 10px;">
        <button id="start-btn">Single Narrate</button>
        <label class="switch">
            <input type="checkbox" id="continuous-narrate-toggle">
            <span class="slider round"></span>
        </label>
        <span>Continuously Narrate</span>
        <button id="toggle-camera-btn">Toggle Camera</button>
    </div>
    <!-- Transcript / status area populated by WebSocket messages. -->
    <div id="feedback"></div>
    <script src="/static/js/main.js"></script>
</body>
</html>
|