Mr-Geo commited on
Commit
b15167b
Β·
verified Β·
1 Parent(s): 98f7557

Upload 17 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the FastAPI image-narration app.
FROM python:3.11-slim

# All subsequent paths are relative to /code.
WORKDIR /code

# Copy requirements first so the pip layer is cached until they change.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application source after installing deps for better layer caching.
COPY . /code

# Must match the port uvicorn binds to in CMD below.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
convert_text_to_speech.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from elevenlabs.client import ElevenLabs
import os
import httpx
import time

# Read once at import time; the request below also sends it per-call.
ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')

# NOTE(review): this client is not used in this module (the raw HTTP API is
# called directly below); kept in case other modules import it from here.
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)


async def convert_text_to_speech(text, selected_voice_id):
    """Stream MP3 audio for *text* from the ElevenLabs text-to-speech API.

    Yields raw MP3 byte chunks as they arrive so playback can begin before
    the whole clip has been generated.

    Args:
        text: The text to synthesise.
        selected_voice_id: ElevenLabs voice id to synthesise with.
    """
    try:
        async with httpx.AsyncClient() as http_client:
            # Use a true streaming request: the original .post() call buffered
            # the entire response body before aiter_bytes() yielded anything,
            # defeating the point of the /stream endpoint.
            async with http_client.stream(
                "POST",
                f"https://api.elevenlabs.io/v1/text-to-speech/{selected_voice_id}/stream",
                json={
                    "model_id": "eleven_monolingual_v1",
                    "text": text,
                    "output_format": "mp3_44100_128"
                },
                headers={
                    "Content-Type": "application/json",
                    "xi-api-key": ELEVENLABS_API_KEY
                },
                timeout=None  # generation can be slow; no client-side cutoff
            ) as response:
                # Fail loudly on a bad key / voice id instead of yielding the
                # JSON error body to the client as if it were audio.
                response.raise_for_status()
                async for chunk in response.aiter_bytes():
                    print(f"Received chunk: {len(chunk)} bytes at {time.time()}")
                    yield chunk
    except Exception as e:
        print(f"Error during text-to-speech conversion: {e}")
generate_description.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import asyncio
from anthropic import AsyncAnthropic

ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')


async def generate_description(image_data, selected_voice_name, description_history):
    """Stream a short humorous description of a base64 JPEG frame.

    Text fragments are yielded as Claude produces them. On any failure a
    single fallback error string is yielded instead of raising.

    Args:
        image_data: Base64-encoded JPEG data of the captured frame.
        selected_voice_name: Persona name injected into the prompts.
        description_history: Accepted for interface compatibility; unused here.
    """
    anthropic_client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
    try:
        system_prompt = f"You are {selected_voice_name} and you must describe the image you are given using your unique phrases in a humorous way in 15 words or less. Please use only raw text without any special formatting characters like asterisks."

        print("System prompt:", system_prompt)

        # Build the multimodal user turn: the frame followed by the request.
        image_part = {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": image_data
            }
        }
        text_part = {
            "type": "text",
            "text": f"As {selected_voice_name} describe this image in a humorous way in 15 words or less"
        }

        async with anthropic_client.messages.stream(
            model="claude-3-haiku-20240307",
            max_tokens=100,
            temperature=1,
            system=system_prompt,
            messages=[{"role": "user", "content": [image_part, text_part]}]
        ) as stream:
            full_text = ""
            async for fragment in stream.text_stream:
                print(fragment)
                full_text += fragment
                yield fragment
    except Exception as e:
        print(f"Error generating description: {e}")
        yield "Error generating description."
main.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Load .env before the app modules are imported below: convert_text_to_speech
# and generate_description read their API keys via os.getenv at import time.
from dotenv import load_dotenv
load_dotenv()
from fastapi import FastAPI, Request
from fastapi.responses import FileResponse, HTMLResponse
from fastapi.staticfiles import StaticFiles
from narrate_description import router as narrate_description_router


app = FastAPI()

# /narrate WebSocket endpoint lives in narrate_description.py.
app.include_router(narrate_description_router)

# Static assets (css/js/icons) and templates are served as-is.
app.mount("/static", StaticFiles(directory="static"), name="static")

app.mount("/templates", StaticFiles(directory="templates"), name="templates")


@app.get("/", response_class=HTMLResponse)
async def get_root(request: Request):
    # Serve the single-page UI; `request` is unused but kept in the signature.
    return FileResponse('templates/main.html')
narrate_description.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter, WebSocket
import json
from generate_description import generate_description
from convert_text_to_speech import convert_text_to_speech
import re
import asyncio

router = APIRouter()

# Shared across all connections and never trimmed for the process lifetime.
# NOTE(review): unbounded and not per-client — confirm this is intentional.
description_history = []


# .websocket() is the supported decorator; .websocket_route() is deprecated.
@router.websocket("/narrate")
async def websocket_narrate(websocket: WebSocket):
    """Receive camera frames over a WebSocket and narrate them.

    Protocol: the client sends either the literal string "close" or a JSON
    payload {image, voiceId, voiceName, pictureCount}. For each frame, text
    chunks are streamed back as JSON text messages and synthesised speech is
    streamed back as binary frames.
    """
    await websocket.accept()
    print("WebSocket connection accepted.")
    try:
        while True:
            data = await websocket.receive_text()
            if data == "close":
                print("Closing WebSocket connection.")
                break

            data_json = json.loads(data)
            image_data = data_json.get('image')
            selected_voice_id = data_json.get('voiceId')
            selected_voice_name = data_json.get('voiceName')
            if image_data:
                print(f"Image data received, sending to {selected_voice_name} model for analysis.")
                description_accumulator = ""
                # NOTE(review): despite the name, this matches only '*', so
                # mid-stream speech triggers only on asterisks and everything
                # else is spoken in one batch after the stream — confirm intent.
                punctuation_pattern = re.compile(r"[*]")

                async for description_chunk in generate_description(image_data, selected_voice_name, description_history):
                    if description_chunk:
                        # Accumulate the chunk, ensuring not to break on single punctuation marks
                        if not punctuation_pattern.fullmatch(description_chunk.strip()):
                            description_accumulator += description_chunk
                        else:
                            description_accumulator += " " + description_chunk

                        # Send each text chunk to the frontend
                        await websocket.send_text(json.dumps({"type": "text_chunk", "data": description_chunk, "pictureCount": data_json.get('pictureCount'), "voiceName": selected_voice_name}))

                        # If the chunk contains the trigger character, speak
                        # the accumulated text and reset the accumulator.
                        if punctuation_pattern.search(description_chunk):
                            await _speak_and_record(websocket, description_accumulator.strip(), selected_voice_id)
                            description_accumulator = ""

                # If there is any remaining text after the loop, speak it too.
                if description_accumulator:
                    await _speak_and_record(websocket, description_accumulator.strip(), selected_voice_id)

                print("Finished processing image data.")
            else:
                print("No image data received, sending error message to client.")
                await websocket.send_text("No image data received.")

        print("WebSocket connection closed.")
    except Exception as e:
        print(f"Error during WebSocket communication: {e}")
    finally:
        try:
            await websocket.close()
        except RuntimeError:
            # Socket may already be closed (e.g. client disconnect mid-loop).
            pass


async def _speak_and_record(websocket, text, selected_voice_id):
    """Synthesise *text*, forward the audio to the client in order, log it.

    Chunks are sent sequentially: the original gathered all send_bytes calls
    with asyncio.gather, which first buffered the whole clip (the async
    comprehension drains the generator) and does not guarantee the frames
    arrive in playback order.
    """
    async for chunk in convert_text_to_speech(text, selected_voice_id):
        await websocket.send_bytes(chunk)
    # Append the fully spoken description to the shared history.
    description_history.append(text)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi==0.110.0
2
+ uvicorn==0.27.1
3
+ httpx==0.27.0
4
+ python-dotenv==1.0.1
5
+ aiohttp==3.9.3
6
+ elevenlabs==1.0.0b1
7
+ anthropic==0.20.0
static/assets/android-chrome-192x192.png ADDED
static/assets/android-chrome-512x512.png ADDED
static/assets/apple-touch-icon.png ADDED
static/assets/favicon-16x16.png ADDED
static/assets/favicon-32x32.png ADDED
static/assets/favicon.ico ADDED
static/assets/logo.png ADDED
static/assets/site.webmanifest ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}
static/css/style.css ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* ===== Page layout ===== */
body {
    font-family: Arial;
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    height: 100vh;
    margin: 0;
    background-color: #f0f0f0;
}

/* ===== Live camera preview ===== */
#camera-feed {
    border: 3px solid #333;
    width: 640px;
    height: 480px;
    background-color: #000;
}

/* ===== Voice selection buttons ===== */
#voice-selection {
    margin: 20px 0;
    display: flex;
    flex-wrap: wrap;
    justify-content: center;
}

#voice-selection button {
    padding: 10px 20px;
    margin: 5px;
    background-color: #007bff;
    color: white;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s ease;
}

#voice-selection button:hover {
    background-color: #0056b3;
}

#voice-selection button.selected {
    background-color: #0056b3; /* Darker blue to indicate selection */
    color: #fff;
}

#voice-selection .voice-btn {
    font-weight: bold;
}

/* ===== "Single Narrate" button ===== */
#start-btn {
    padding: 10px 20px;
    font-size: 16px;
    font-weight: bold;
    background-color: #28a745;
    color: white;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s ease;
}

#start-btn:hover {
    background-color: #218838;
}

/* ===== Transcript / feedback area ===== */
#feedback {
    max-width: 1500px;
    margin-top: 20px;
    padding-left: 20px;
    padding-right: 20px;
    text-align: left;
    max-height: 100px;
    overflow-y: auto;
}

#feedback img {
    max-width: 640px;
    max-height: 480px;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 5px;
}

#feedback p {
    margin-top: 10px;
    color: #333;
}

/* ===== Continuous-narration toggle switch ===== */
.switch {
    position: relative;
    display: inline-block;
    width: 60px;
    height: 34px;
}

/* Hide the native checkbox; the .slider spans draw the control. */
.switch input {
    opacity: 0;
    width: 0;
    height: 0;
}

.slider {
    position: absolute;
    cursor: pointer;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background-color: #ccc;
    -webkit-transition: .4s;
    transition: .4s;
    border-radius: 34px; /* Makes the slider rounded */
}

input:checked + .slider {
    background-color: #4CAF50; /* Green color when enabled */
}

input:not(:checked) + .slider {
    background-color: #f44336; /* Red color when disabled */
}

/* Movable knob inside the switch. */
.slider:before {
    position: absolute;
    content: "";
    height: 26px;
    width: 26px;
    left: 4px; /* Initial position */
    bottom: 4px;
    background-color: white;
    -webkit-transition: .4s;
    transition: .4s;
    border-radius: 50%; /* Keeps the circle inside the slider rounded */
}

input:checked + .slider:before {
    -webkit-transform: translateX(26px);
    -ms-transform: translateX(26px);
    transform: translateX(26px); /* Slide to the right */
}

/* ===== Picture counter and captured-image strip ===== */
#picture-counter {
    font-size: 18px;
    color: #333;
}

#captured-images {
    display: flex;
    overflow-x: auto;
    margin-top: 20px;
    max-width: 600px; /* Set the maximum width */
    white-space: nowrap; /* Keep images in a single line */
    scroll-snap-type: x mandatory; /* Enable scroll snap along the x-axis and make it mandatory */
}

#captured-images .image-wrapper {
    flex: 0 0 100px; /* Do not grow, do not shrink, base width of 100px */
    margin-right: 5px;
    position: relative;
    display: inline-flex; /* Use inline-flex to keep the wrapper inline */
    scroll-snap-align: start; /* Optional: Enhances the scrolling experience */
}

#captured-images .image-wrapper img {
    max-width: 100%; /* Ensure images do not exceed the width of their wrappers */
    height: auto; /* Maintain aspect ratio */
}

/* Picture-number badge overlaid on each thumbnail (set via data attribute). */
#captured-images .image-wrapper::after {
    content: attr(data-picture-number);
    position: absolute;
    bottom: 0;
    right: 0;
    background-color: rgba(0, 0, 0, 0.75); /* Make it darker for better visibility */
    color: white;
    padding: 2px 5px;
    font-size: 12px; /* Adjust font size as needed */
    z-index: 10; /* Increase z-index to ensure it's above the image */
}

/* ===== Misc ===== */
.error {
    color: red;
}

#toggle-camera-btn {
    padding: 10px 20px;
    font-size: 16px;
    font-weight: bold;
    background-color: #ff9800; /* Orange color for visibility */
    color: white;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s ease;
}

#toggle-camera-btn:hover {
    background-color: #e68900; /* Darker shade of orange on hover */
}
static/js/main.js ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
const cameraFeedElement = document.getElementById('camera-feed');
let ws;
let currentStream = null;
// Start at -1 so the first switchCamera() call selects camera 0; the original
// started at 0 and skipped the first camera on initialisation.
let currentDeviceIndex = -1;
let allCameras = [];
// Declared before first use (the original hoisted this to the bottom of the file).
let pictureCount = 0;

// Stop all tracks of the active stream so the camera is fully released.
function stopCurrentVideoStream() {
    if (currentStream) {
        currentStream.getTracks().forEach(track => track.stop());
    }
}

// Enumerate video input devices and start the first one.
function getCameras() {
    navigator.mediaDevices.enumerateDevices()
        .then(devices => {
            allCameras = devices.filter(device => device.kind === 'videoinput');
            if (allCameras.length > 0) {
                switchCamera(); // Initialize the first camera
            }
        })
        .catch(error => console.error("Could not get cameras:", error));
}

// Advance to the next available camera and attach its stream to the preview.
function switchCamera() {
    stopCurrentVideoStream();
    currentDeviceIndex = (currentDeviceIndex + 1) % allCameras.length;
    const deviceId = allCameras[currentDeviceIndex].deviceId;
    const constraints = {
        video: {
            deviceId: deviceId,
            width: { ideal: 640 },
            height: { ideal: 480 }
        }
    };

    navigator.mediaDevices.getUserMedia(constraints)
        .then(stream => {
            currentStream = stream;
            cameraFeedElement.srcObject = stream;
        })
        .catch(error => {
            console.error("Could not switch camera:", error);
            console.error("Error name: ", error.name);
            console.error("Error message: ", error.message);
            handleCameraError(error);
        });
}

// Translate getUserMedia failures into user-facing guidance.
function handleCameraError(error) {
    if (error.name === 'NotAllowedError') {
        alert('Camera access was denied. Please allow camera access for this site.');
    } else if (error.name === 'NotFoundError') {
        alert('No camera found. Please ensure a camera is properly connected or integrated.');
    } else if (error.name === 'NotReadableError') {
        alert('Camera is currently being used by another application. Please close that application and try again.');
    } else if (error.name === 'OverconstrainedError') {
        alert('No camera matches the requested constraints. Trying default settings...');
        fallbackToDefaultCamera();
    } else {
        alert('An unknown error occurred when trying to access the camera.');
    }
}

// Retry with browser-default constraints when ours can't be satisfied.
function fallbackToDefaultCamera() {
    const constraints = {
        video: true // Use default settings
    };
    navigator.mediaDevices.getUserMedia(constraints)
        .then(stream => {
            currentStream = stream;
            cameraFeedElement.srcObject = stream;
        })
        .catch(error => {
            console.error("Could not access default camera:", error);
        });
}

getCameras();

document.getElementById('toggle-camera-btn').addEventListener('click', switchCamera);

// FIFO of MP3 blobs so overlapping narrations play back in arrival order.
let audioQueue = [];
let isPlaying = false;

function playAudio(arrayBuffer) {
    console.log("Attempting to play audio", arrayBuffer);
    const blob = new Blob([arrayBuffer], { type: 'audio/mp3' });
    audioQueue.push(blob);
    if (!isPlaying) {
        playNextAudio();
    }
}

function playNextAudio() {
    if (audioQueue.length > 0) {
        isPlaying = true;
        const url = URL.createObjectURL(audioQueue.shift());
        const audio = new Audio(url);
        // Register 'ended' before play() so a very short clip can't finish
        // before the handler is attached; revoke the URL to avoid leaking blobs.
        audio.addEventListener('ended', () => {
            URL.revokeObjectURL(url);
            playNextAudio();
        });
        audio.play().catch(e => {
            console.error("Error playing audio:", e);
            URL.revokeObjectURL(url);
            isPlaying = false;
            playNextAudio();
        });
    } else {
        isPlaying = false;
    }
}

let selectedVoiceName = "Daniel Attenborough";
let selectedVoiceId;

// Mark the clicked voice button as selected and clear any pending warning.
function selectVoice() {
    selectedVoiceId = this.getAttribute('data-voice-id');
    selectedVoiceName = this.getAttribute('data-voice-name');
    document.querySelectorAll('.voice-btn').forEach(btn => btn.classList.remove('selected'));
    this.classList.add('selected');

    // Check if the current feedback is the voice selection warning before clearing
    const feedbackElement = document.getElementById('feedback');
    if (feedbackElement.textContent === 'Please select a voice before narrating.') {
        feedbackElement.textContent = ''; // Clear the warning message
    }
    feedbackElement.classList.remove('error'); // Remove the error class if present
}

document.querySelectorAll('.voice-btn').forEach(btn => {
    btn.addEventListener('click', selectVoice);
});

// Capture the current video frame, show it in the strip, and send it over the
// WebSocket for narration. Requires a voice to have been selected first.
function captureAndAnalyseImage() {
    if (!selectedVoiceId) {
        const feedbackElement = document.getElementById('feedback');
        feedbackElement.textContent = 'Please select a voice before narrating.';
        feedbackElement.classList.add('error');
        return;
    }

    const canvas = document.createElement('canvas');
    canvas.width = cameraFeedElement.videoWidth;
    canvas.height = cameraFeedElement.videoHeight;
    const ctx = canvas.getContext('2d');
    ctx.drawImage(cameraFeedElement, 0, 0, canvas.width, canvas.height);
    const imageDataUrl = canvas.toDataURL('image/jpeg');

    pictureCount++;
    document.getElementById('picture-counter').textContent = `Pictures taken: ${pictureCount}`;

    const capturedImagesContainer = document.getElementById('captured-images');
    const imgWrapper = document.createElement('div'); // Create a wrapper div for the image
    imgWrapper.classList.add('image-wrapper'); // Add class for styling
    imgWrapper.setAttribute('data-picture-number', `Picture ${pictureCount}`); // Set the picture number

    const imgElement = document.createElement('img');
    imgElement.src = imageDataUrl;
    imgWrapper.appendChild(imgElement); // Append the image to the wrapper
    capturedImagesContainer.appendChild(imgWrapper); // Append the wrapper to the container

    // Scroll to the latest image
    capturedImagesContainer.scrollLeft = capturedImagesContainer.scrollWidth;

    if (ws && ws.readyState === WebSocket.OPEN) {
        // Strip the data-URL prefix; the server expects bare base64 JPEG data.
        ws.send(JSON.stringify({ image: imageDataUrl.split(',')[1], voiceId: selectedVoiceId, voiceName: selectedVoiceName, pictureCount: pictureCount }));
    } else {
        console.error("WebSocket is not open.");
    }
}

// Initialise WebSocket connection and event handlers
function initWebSocket() {
    // Match the page's scheme: hard-coding wss:// breaks plain-HTTP (local) use.
    const wsProtocol = window.location.protocol === 'https:' ? 'wss' : 'ws';
    const wsUrl = `${wsProtocol}://${window.location.host}/narrate`;
    console.log(wsUrl);
    ws = new WebSocket(wsUrl);
    ws.binaryType = 'arraybuffer'; // Important for audio data

    ws.onopen = () => {
        console.log("WebSocket connection opened.");
        // Now safe to send messages
    };

    ws.onmessage = (event) => {
        if (typeof event.data === "string") {
            const message = JSON.parse(event.data);
            if (message.type === "text_chunk") {
                let feedbackElement = document.getElementById('feedback');
                // One <p> per picture; append chunks to it as they stream in.
                let p = document.querySelector(`p[data-picture-count="${message.pictureCount}"]`);
                if (!p) {
                    p = document.createElement('p');
                    const timestamp = new Date().toLocaleTimeString();
                    p.setAttribute('data-picture-count', message.pictureCount);
                    p.innerHTML = `<strong>[${timestamp}] [Picture ${message.pictureCount}] [${message.voiceName}]</strong> `;
                    feedbackElement.appendChild(p);
                }
                p.innerHTML += `${message.data}`;
                feedbackElement.scrollTop = feedbackElement.scrollHeight;
            }
        } else {
            // Binary frames are MP3 audio chunks.
            playAudio(event.data);
        }
    };

    ws.onerror = (error) => {
        console.error("WebSocket error:", error);
    };

    ws.onclose = () => {
        console.log("WebSocket connection closed.");
    };
}

// Add event listener to the start button for capturing and analysing the image.
// (The original registered this listener twice, capturing every click twice.)
document.getElementById('start-btn').addEventListener('click', captureAndAnalyseImage);

// Initialise WebSocket connection once (the original called this twice,
// leaking the first connection).
initWebSocket();

let continuousNarrationInterval; // Holds the interval ID for continuous narration

document.getElementById('continuous-narrate-toggle').addEventListener('change', function() {
    if (this.checked) {
        if (!selectedVoiceId) {
            document.getElementById('feedback').textContent = 'Please select a voice before narrating.';
            document.getElementById('feedback').classList.add('error');
            this.checked = false;
            return;
        }
        captureAndAnalyseImage(); // Send the first image immediately
        if (!continuousNarrationInterval) {
            continuousNarrationInterval = setInterval(captureAndAnalyseImage, 5000); // 5-second delay for subsequent images
        }
    } else {
        if (continuousNarrationInterval) {
            clearInterval(continuousNarrationInterval);
            continuousNarrationInterval = null;
        }
    }
});
templates/main.html ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>AI Image Narrator</title>
    <link rel="stylesheet" href="/static/css/style.css">
    <link rel="apple-touch-icon" sizes="180x180" href="/static/assets/apple-touch-icon.png">
    <link rel="icon" type="image/png" sizes="32x32" href="/static/assets/favicon-32x32.png">
    <link rel="icon" type="image/png" sizes="16x16" href="/static/assets/favicon-16x16.png">
    <link rel="manifest" href="/static/assets/site.webmanifest">
</head>
<body>
    <!-- Live camera preview; main.js attaches the getUserMedia stream here. -->
    <video id="camera-feed" autoplay></video>
    <!-- Horizontal strip of captured thumbnails, filled by main.js. -->
    <div id="captured-images" style="display: flex; overflow-x: auto; margin-top: 20px;"></div>
    <!-- Voice buttons: data-voice-id is the ElevenLabs voice, data-voice-name
         the persona sent to the model. Visible labels are deliberate parodies. -->
    <div id="voice-selection">
        <button class="voice-btn" data-voice-id="0SQfBfjRCI4jQdnyrF5B" data-voice-name="Michael Caine">Michael Kaine 🎩</button>
        <button class="voice-btn" data-voice-id="4c42HvUOZ0L0feAu3r5C" data-voice-name="David Attenborough">Daniel Attenborough 🌍</button>
        <button class="voice-btn" data-voice-id="DFtRVeaAE1d7V4uhxFcF" data-voice-name="Stephen Fry">Stephon Fry πŸ“š</button>
        <button class="voice-btn" data-voice-id="K8sG6kT7jA4WnERxh8vd" data-voice-name="Morgan Freeman">Morgan Free 🎀</button>
        <button class="voice-btn" data-voice-id="WiXK0UI5GPQ98IYxy8he" data-voice-name="Joanna Lumley">Johanna Lumly πŸ’„</button>
        <button class="voice-btn" data-voice-id="bnvSNcvmOz9I0VhuOh58" data-voice-name="John Cleese">Jon Cheese πŸ§€</button>
        <button class="voice-btn" data-voice-id="g5Qp5bT7Dm1TIJecJuds" data-voice-name="Judi Dench">Judy Drench 🎭</button>
        <button class="voice-btn" data-voice-id="w642gnqphLNLyM1zH2eI" data-voice-name="Richard Hammond">Richard Hamed πŸš—</button>
    </div>
    <div id="picture-counter">Pictures taken: 0</div>
    <!-- Controls: one-shot narration, continuous-narration toggle, camera switch. -->
    <div style="display: flex; justify-content: center; align-items: center; gap: 10px; margin-top: 10px;">
        <button id="start-btn">Single Narrate</button>
        <label class="switch">
            <input type="checkbox" id="continuous-narrate-toggle">
            <span class="slider round"></span>
        </label>
        <span>Continuously Narrate</span>
        <button id="toggle-camera-btn">Toggle Camera</button>
    </div>
    <!-- Streaming transcript area, appended to by main.js. -->
    <div id="feedback"></div>
    <script src="/static/js/main.js"></script>
</body>
</html>