Spaces:
Sleeping
Sleeping
Upload 17 files
Browse files- Dockerfile +13 -0
- convert_text_to_speech.py +31 -0
- generate_description.py +46 -0
- main.py +20 -0
- narrate_description.py +67 -0
- requirements.txt +7 -0
- static/assets/android-chrome-192x192.png +0 -0
- static/assets/android-chrome-512x512.png +0 -0
- static/assets/apple-touch-icon.png +0 -0
- static/assets/favicon-16x16.png +0 -0
- static/assets/favicon-32x32.png +0 -0
- static/assets/favicon.ico +0 -0
- static/assets/logo.png +0 -0
- static/assets/site.webmanifest +1 -0
- static/css/style.css +199 -0
- static/js/main.js +248 -0
- templates/main.html +39 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Slim Python base keeps the image small; 3.11 matches the app's runtime.
FROM python:3.11-slim

WORKDIR /code

# Copy requirements first so dependency installation is cached
# independently of application-code changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application source after dependencies for better layer caching.
COPY . /code

# 7860 is the port Hugging Face Spaces expects the app to listen on.
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
convert_text_to_speech.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from elevenlabs.client import ElevenLabs
|
2 |
+
import os
|
3 |
+
import httpx
|
4 |
+
import time
|
5 |
+
|
6 |
+
ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
|
7 |
+
|
8 |
+
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
|
9 |
+
|
10 |
+
async def convert_text_to_speech(text, selected_voice_id):
    """Stream MP3 audio for *text* spoken by the given ElevenLabs voice.

    Async generator: yields raw MP3 byte chunks as they arrive from the
    ElevenLabs streaming text-to-speech endpoint.

    Args:
        text: The text to synthesise.
        selected_voice_id: ElevenLabs voice id to use.

    Yields:
        bytes: MP3 audio chunks (44.1 kHz / 128 kbps per the request body).

    On any error the exception is logged and the generator simply stops,
    so callers see an empty or truncated stream rather than a raised error.
    """
    try:
        async with httpx.AsyncClient() as http_client:
            # Use a true streaming request: a plain .post() buffers the
            # entire response body before aiter_bytes() can run, which
            # defeats the purpose of the /stream endpoint. client.stream()
            # yields chunks as they arrive over the wire.
            async with http_client.stream(
                "POST",
                f"https://api.elevenlabs.io/v1/text-to-speech/{selected_voice_id}/stream",
                json={
                    "model_id": "eleven_monolingual_v1",
                    "text": text,
                    "output_format": "mp3_44100_128"
                },
                headers={
                    "Content-Type": "application/json",
                    "xi-api-key": ELEVENLABS_API_KEY
                },
                timeout=None
            ) as response:
                # Surface HTTP errors (bad key, bad voice id) instead of
                # silently yielding an error-JSON body as "audio".
                response.raise_for_status()
                async for chunk in response.aiter_bytes():
                    print(f"Received chunk: {len(chunk)} bytes at {time.time()}")
                    yield chunk
    except Exception as e:
        print(f"Error during text-to-speech conversion: {e}")
generate_description.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import asyncio
|
3 |
+
from anthropic import AsyncAnthropic
|
4 |
+
|
5 |
+
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
|
6 |
+
|
7 |
+
async def generate_description(image_data, selected_voice_name, description_history):
    """Stream a short, humorous description of a base64-encoded JPEG image.

    Async generator yielding text chunks from Claude as they arrive.

    Args:
        image_data: Base64 JPEG payload (no data-URL prefix).
        selected_voice_name: Persona the model is asked to impersonate.
        description_history: Currently unused; kept for interface
            compatibility with callers that pass the running narration
            history.

    Yields:
        str: Text fragments of the description; on failure, the single
        fallback string "Error generating description.".
    """
    client = AsyncAnthropic(api_key=ANTHROPIC_API_KEY)
    try:
        system_prompt = f"You are {selected_voice_name} and you must describe the image you are given using your unique phrases in a humorous way in 15 words or less. Please use only raw text without any special formatting characters like asterisks."

        print("System prompt:", system_prompt)

        async with client.messages.stream(
            model="claude-3-haiku-20240307",
            max_tokens=100,
            temperature=1,
            system=system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": image_data
                            }
                        },
                        {
                            "type": "text",
                            "text": f"As {selected_voice_name} describe this image in a humorous way in 15 words or less"
                        }
                    ]
                }
            ]
        ) as stream:
            # Note: the original also accumulated chunks into a local
            # `description` string that was never read — dead code, removed.
            async for event in stream.text_stream:
                print(event)
                yield event
    except Exception as e:
        print(f"Error generating description: {e}")
        yield "Error generating description."
|
main.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
load_dotenv()
|
3 |
+
from fastapi import FastAPI, Request
|
4 |
+
from fastapi.responses import FileResponse, HTMLResponse
|
5 |
+
from fastapi.staticfiles import StaticFiles
|
6 |
+
from narrate_description import router as narrate_description_router
|
7 |
+
|
8 |
+
|
9 |
+
app = FastAPI()

# WebSocket endpoint (/narrate) lives in narrate_description.py.
app.include_router(narrate_description_router)

# Serve CSS/JS/favicon assets from ./static.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Templates are served as plain static files (no server-side rendering).
app.mount("/templates", StaticFiles(directory="templates"), name="templates")


@app.get("/", response_class=HTMLResponse)
async def get_root(request: Request):
    """Serve the single-page UI at the site root."""
    return FileResponse('templates/main.html')
|
narrate_description.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import APIRouter, WebSocket
|
2 |
+
import json
|
3 |
+
from generate_description import generate_description
|
4 |
+
from convert_text_to_speech import convert_text_to_speech
|
5 |
+
import re
|
6 |
+
import asyncio
|
7 |
+
|
8 |
+
router = APIRouter()

# Accumulated finished descriptions; module-level, shared across all
# connections and unbounded for the process lifetime.
description_history = []


# @router.websocket is the current FastAPI decorator; websocket_route is
# the deprecated Starlette-era spelling.
@router.websocket("/narrate")
async def websocket_narrate(websocket: WebSocket):
    """Receive base64 JPEG frames over a WebSocket and stream narration back.

    Protocol: the client sends JSON {image, voiceId, voiceName, pictureCount};
    the server replies with JSON "text_chunk" frames as the description is
    generated, followed by binary MP3 frames of the narration. Sending the
    literal string "close" ends the session.
    """
    await websocket.accept()
    print("WebSocket connection accepted.")
    try:
        while True:
            data = await websocket.receive_text()
            if data == "close":
                print("Closing WebSocket connection.")
                break

            data_json = json.loads(data)
            image_data = data_json.get('image')
            selected_voice_id = data_json.get('voiceId')
            selected_voice_name = data_json.get('voiceName')
            if image_data:
                print(f"Image data received, sending to {selected_voice_name} model for analysis.")
                description_accumulator = ""
                # Despite the name, this matches only "*" — the model is told
                # not to emit asterisks, so in practice the full description
                # usually flushes in the post-loop branch below.
                punctuation_pattern = re.compile(r"[*]")

                async for description_chunk in generate_description(image_data, selected_voice_name, description_history):
                    if description_chunk:
                        # Accumulate the chunk, ensuring not to break on single punctuation marks
                        if not punctuation_pattern.fullmatch(description_chunk.strip()):
                            description_accumulator += description_chunk
                        else:
                            description_accumulator += " " + description_chunk

                        # Send each text chunk to the frontend
                        await websocket.send_text(json.dumps({"type": "text_chunk", "data": description_chunk, "pictureCount": data_json.get('pictureCount'), "voiceName": selected_voice_name}))

                        # If the chunk contains the delimiter, convert and stream it.
                        if punctuation_pattern.search(description_chunk):
                            # Stream audio frames sequentially, in order.
                            # (The original gathered all send_bytes()
                            # coroutines with asyncio.gather, which buffered
                            # the whole clip first and sent concurrently,
                            # risking out-of-order frames.)
                            async for audio_chunk in convert_text_to_speech(description_accumulator.strip(), selected_voice_id):
                                await websocket.send_bytes(audio_chunk)
                            # Append the fully accumulated description to the history
                            description_history.append(description_accumulator.strip())
                            description_accumulator = ""

                # If there is any remaining text after the loop, send it for conversion too
                if description_accumulator:
                    async for audio_chunk in convert_text_to_speech(description_accumulator.strip(), selected_voice_id):
                        await websocket.send_bytes(audio_chunk)
                    # Append the remaining accumulated description to the history
                    description_history.append(description_accumulator.strip())

                print("Finished processing image data.")
            else:
                print("No image data received, sending error message to client.")
                await websocket.send_text("No image data received.")

        print("WebSocket connection closed.")
    except Exception as e:
        print(f"Error during WebSocket communication: {e}")
    finally:
        await websocket.close()
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.110.0
|
2 |
+
uvicorn==0.27.1
|
3 |
+
httpx==0.27.0
|
4 |
+
python-dotenv==1.0.1
|
5 |
+
aiohttp==3.9.3
|
6 |
+
elevenlabs==1.0.0b1
|
7 |
+
anthropic==0.20.0
|
static/assets/android-chrome-192x192.png
ADDED
![]() |
static/assets/android-chrome-512x512.png
ADDED
![]() |
static/assets/apple-touch-icon.png
ADDED
![]() |
static/assets/favicon-16x16.png
ADDED
![]() |
static/assets/favicon-32x32.png
ADDED
![]() |
static/assets/favicon.ico
ADDED
|
static/assets/logo.png
ADDED
![]() |
static/assets/site.webmanifest
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"name":"","short_name":"","icons":[{"src":"/static/assets/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/static/assets/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}
|
static/css/style.css
ADDED
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* ---- Page layout: vertically stacked, centered single-page app ---- */
body {
    font-family: Arial;
    display: flex;
    flex-direction: column;
    align-items: center;
    justify-content: center;
    height: 100vh;
    margin: 0;
    background-color: #f0f0f0;
}

/* Live camera preview (fixed 640x480, black while no stream is attached) */
#camera-feed {
    border: 3px solid #333;
    width: 640px;
    height: 480px;
    background-color: #000;
}

/* ---- Voice picker: wrapping row of buttons ---- */
#voice-selection {
    margin: 20px 0;
    display: flex;
    flex-wrap: wrap;
    justify-content: center;
}

#voice-selection button {
    padding: 10px 20px;
    margin: 5px;
    background-color: #007bff;
    color: white;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s ease;
}

#voice-selection button:hover {
    background-color: #0056b3;
}

#voice-selection button.selected {
    background-color: #0056b3; /* Darker blue to indicate selection */
    color: #fff;
}

#voice-selection .voice-btn {
    font-weight: bold;
}

/* "Single Narrate" action button (green) */
#start-btn {
    padding: 10px 20px;
    font-size: 16px;
    font-weight: bold;
    background-color: #28a745;
    color: white;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s ease;
}

#start-btn:hover {
    background-color: #218838;
}

/* ---- Narration transcript / status area ---- */
#feedback {
    max-width: 1500px;
    margin-top: 20px;
    padding-left: 20px;
    padding-right: 20px;
    text-align: left;
    max-height: 100px;
    overflow-y: auto;
}

#feedback img {
    max-width: 640px;
    max-height: 480px;
    border: 1px solid #ddd;
    border-radius: 4px;
    padding: 5px;
}

#feedback p {
    margin-top: 10px;
    color: #333;
}

/* ---- Toggle switch (checkbox styled as a sliding on/off control) ---- */
.switch {
    position: relative;
    display: inline-block;
    width: 60px;
    height: 34px;
}

/* Hide the native checkbox; the .slider span is the visible control */
.switch input {
    opacity: 0;
    width: 0;
    height: 0;
}

.slider {
    position: absolute;
    cursor: pointer;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background-color: #ccc;
    -webkit-transition: .4s;
    transition: .4s;
    border-radius: 34px; /* Makes the slider rounded */
}

input:checked + .slider {
    background-color: #4CAF50; /* Green color when enabled */
}

input:not(:checked) + .slider {
    background-color: #f44336; /* Red color when disabled */
}

/* The moving knob inside the switch */
.slider:before {
    position: absolute;
    content: "";
    height: 26px;
    width: 26px;
    left: 4px; /* Initial position */
    bottom: 4px;
    background-color: white;
    -webkit-transition: .4s;
    transition: .4s;
    border-radius: 50%; /* Keeps the circle inside the slider rounded */
}

input:checked + .slider:before {
    -webkit-transform: translateX(26px);
    -ms-transform: translateX(26px);
    transform: translateX(26px); /* Slide to the right */
}

#picture-counter {
    font-size: 18px;
    color: #333;
}

/* ---- Horizontal thumbnail strip of captured frames ---- */
#captured-images {
    display: flex;
    overflow-x: auto;
    margin-top: 20px;
    max-width: 600px; /* Set the maximum width */
    white-space: nowrap; /* Keep images in a single line */
    scroll-snap-type: x mandatory; /* Enable scroll snap along the x-axis and make it mandatory */
}

#captured-images .image-wrapper {
    flex: 0 0 100px; /* Do not grow, do not shrink, base width of 100px */
    margin-right: 5px;
    position: relative;
    display: inline-flex; /* Use inline-flex to keep the wrapper inline */
    scroll-snap-align: start; /* Optional: Enhances the scrolling experience */
}

#captured-images .image-wrapper img {
    max-width: 100%; /* Ensure images do not exceed the width of their wrappers */
    height: auto; /* Maintain aspect ratio */
}

/* Picture-number badge overlaid in the corner of each thumbnail */
#captured-images .image-wrapper::after {
    content: attr(data-picture-number);
    position: absolute;
    bottom: 0;
    right: 0;
    background-color: rgba(0, 0, 0, 0.75); /* Make it darker for better visibility */
    color: white;
    padding: 2px 5px;
    font-size: 12px; /* Adjust font size as needed */
    z-index: 10; /* Increase z-index to ensure it's above the image */
}

.error {
    color: red;
}

#toggle-camera-btn {
    padding: 10px 20px;
    font-size: 16px;
    font-weight: bold;
    background-color: #ff9800; /* Orange color for visibility */
    color: white;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    transition: background-color 0.3s ease;
}

#toggle-camera-btn:hover {
    background-color: #e68900; /* Darker shade of orange on hover */
}
|
static/js/main.js
ADDED
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// <video> element showing the live camera preview.
const cameraFeedElement = document.getElementById('camera-feed');
// WebSocket to the /narrate backend; assigned by initWebSocket().
let ws;
// Active MediaStream from getUserMedia, or null before the first camera starts.
let currentStream = null;
// Index into allCameras of the camera currently in use.
let currentDeviceIndex = 0;
// All video-input devices discovered by getCameras().
let allCameras = [];
|
6 |
+
|
7 |
+
// Stop every track of the active camera stream (no-op when none is active),
// releasing the device so another camera can be opened.
function stopCurrentVideoStream() {
    if (!currentStream) {
        return;
    }
    for (const track of currentStream.getTracks()) {
        track.stop();
    }
}
|
12 |
+
|
13 |
+
// Enumerate the available video-input devices and, when at least one
// exists, start the first camera via switchCamera().
function getCameras() {
    navigator.mediaDevices.enumerateDevices()
        .then((devices) => {
            const videoInputs = devices.filter((device) => device.kind === 'videoinput');
            allCameras = videoInputs;
            if (allCameras.length > 0) {
                switchCamera(); // Initialize the first camera
            }
        })
        .catch((err) => console.error("Could not get cameras:", err));
}
|
23 |
+
|
24 |
+
// Stop the current stream and open the next camera in allCameras,
// requesting a 640x480 (ideal) feed and attaching it to the preview video.
function switchCamera() {
    stopCurrentVideoStream();
    // Advance to the next device, wrapping around.
    // NOTE(review): because this increments BEFORE use, the very first call
    // (from getCameras) starts with index 1 when multiple cameras exist,
    // skipping camera 0 — confirm whether that is intended.
    currentDeviceIndex = (currentDeviceIndex + 1) % allCameras.length;
    const deviceId = allCameras[currentDeviceIndex].deviceId;
    const constraints = {
        video: {
            deviceId: deviceId,
            width: { ideal: 640 },   // ideal, not exact: browser may deliver another size
            height: { ideal: 480 }
        }
    };

    navigator.mediaDevices.getUserMedia(constraints)
        .then(stream => {
            currentStream = stream;
            cameraFeedElement.srcObject = stream;
        })
        .catch(error => {
            console.error("Could not switch camera:", error);
            console.error("Error name: ", error.name);
            console.error("Error message: ", error.message);
            handleCameraError(error);
        });
}
|
48 |
+
|
49 |
+
// Translate a getUserMedia failure into a user-facing alert; for
// over-constrained requests, retry once with browser-default settings.
function handleCameraError(error) {
    switch (error.name) {
        case 'NotAllowedError':
            alert('Camera access was denied. Please allow camera access for this site.');
            break;
        case 'NotFoundError':
            alert('No camera found. Please ensure a camera is properly connected or integrated.');
            break;
        case 'NotReadableError':
            alert('Camera is currently being used by another application. Please close that application and try again.');
            break;
        case 'OverconstrainedError':
            alert('No camera matches the requested constraints. Trying default settings...');
            fallbackToDefaultCamera();
            break;
        default:
            alert('An unknown error occurred when trying to access the camera.');
    }
}
|
63 |
+
|
64 |
+
// Last-resort camera acquisition: ask for any video stream with the
// browser's default settings and attach it to the preview element.
function fallbackToDefaultCamera() {
    navigator.mediaDevices
        .getUserMedia({ video: true }) // Use default settings
        .then((stream) => {
            currentStream = stream;
            cameraFeedElement.srcObject = stream;
        })
        .catch((err) => {
            console.error("Could not access default camera:", err);
        });
}
|
77 |
+
|
78 |
+
// Kick off camera discovery (and start the first camera) at page load.
getCameras();

// Manual camera cycling for multi-camera devices (e.g. phone front/back).
document.getElementById('toggle-camera-btn').addEventListener('click', switchCamera);
|
81 |
+
|
82 |
+
// FIFO of MP3 Blobs waiting to be played; clips play one at a time.
let audioQueue = [];
// True while a clip is playing, so new arrivals queue instead of overlapping.
let isPlaying = false;

// Enqueue a binary audio frame received from the server and start
// playback if nothing is currently playing.
function playAudio(arrayBuffer) {
    console.log("Attempting to play audio", arrayBuffer);
    const blob = new Blob([arrayBuffer], { type: 'audio/mp3' });
    audioQueue.push(blob);
    if (!isPlaying) {
        playNextAudio();
    }
}

// Play the next queued clip, chaining to the following one when it ends.
function playNextAudio() {
    if (audioQueue.length > 0) {
        isPlaying = true;
        const url = URL.createObjectURL(audioQueue.shift());
        const audio = new Audio(url);
        // Attach 'ended' before play() so the handler cannot be missed,
        // and revoke the object URL once done — the original never revoked,
        // leaking one blob URL per clip for the page's lifetime.
        audio.addEventListener('ended', () => {
            URL.revokeObjectURL(url);
            playNextAudio();
        });
        audio.play().catch(e => {
            console.error("Error playing audio:", e);
            URL.revokeObjectURL(url);
            isPlaying = false;
            playNextAudio();
        });
    } else {
        isPlaying = false;
    }
}
|
110 |
+
|
111 |
+
// Display name of the selected narrator (sent to the backend with each frame).
let selectedVoiceName = "Daniel Attenborough";
// ElevenLabs voice id of the selected narrator; undefined until the user
// picks one. Declared up front — the original declared it AFTER the click
// handlers were wired, so selectVoice assigned a let-binding before its
// declaration line had run (a temporal-dead-zone hazard).
let selectedVoiceId;

// Click handler for a voice button: record the chosen voice, highlight
// the button, and clear the "select a voice" warning if it is showing.
function selectVoice() {
    selectedVoiceId = this.getAttribute('data-voice-id');
    selectedVoiceName = this.getAttribute('data-voice-name');
    document.querySelectorAll('.voice-btn').forEach(btn => btn.classList.remove('selected'));
    this.classList.add('selected');

    // Check if the current feedback is the voice selection warning before clearing
    const feedbackElement = document.getElementById('feedback');
    if (feedbackElement.textContent === 'Please select a voice before narrating.') {
        feedbackElement.textContent = ''; // Clear the warning message
    }
    feedbackElement.classList.remove('error'); // Remove the error class if present
}

document.querySelectorAll('.voice-btn').forEach(btn => {
    btn.addEventListener('click', selectVoice);
});
|
132 |
+
|
133 |
+
|
134 |
+
|
135 |
+
// Grab one frame from the camera, show it in the thumbnail strip, and send
// it (base64 JPEG) to the backend over the WebSocket for narration.
function captureAndAnalyseImage() {
    // Guard: a voice must be chosen first; show an inline warning otherwise.
    if (!selectedVoiceId) {
        const feedbackElement = document.getElementById('feedback');
        feedbackElement.textContent = 'Please select a voice before narrating.';
        feedbackElement.classList.add('error');
        return;
    }

    // Snapshot the current video frame onto an off-screen canvas.
    const canvas = document.createElement('canvas');
    canvas.width = cameraFeedElement.videoWidth;
    canvas.height = cameraFeedElement.videoHeight;
    const ctx = canvas.getContext('2d');
    ctx.drawImage(cameraFeedElement, 0, 0, canvas.width, canvas.height);
    const imageDataUrl = canvas.toDataURL('image/jpeg');

    pictureCount++;
    document.getElementById('picture-counter').textContent = `Pictures taken: ${pictureCount}`;

    const capturedImagesContainer = document.getElementById('captured-images');
    const imgWrapper = document.createElement('div'); // Create a wrapper div for the image
    imgWrapper.classList.add('image-wrapper'); // Add class for styling
    imgWrapper.setAttribute('data-picture-number', `Picture ${pictureCount}`); // Set the picture number

    const imgElement = document.createElement('img');
    imgElement.src = imageDataUrl;
    imgWrapper.appendChild(imgElement); // Append the image to the wrapper
    capturedImagesContainer.appendChild(imgWrapper); // Append the wrapper to the container

    // Scroll to the latest image
    capturedImagesContainer.scrollLeft = capturedImagesContainer.scrollWidth;

    if (ws && ws.readyState === WebSocket.OPEN) {
        // split(',')[1] drops the "data:image/jpeg;base64," prefix so the
        // server receives bare base64.
        ws.send(JSON.stringify({ image: imageDataUrl.split(',')[1], voiceId: selectedVoiceId, voiceName: selectedVoiceName, pictureCount: pictureCount }));
    } else {
        console.error("WebSocket is not open.");
    }
}
|
172 |
+
|
173 |
+
// Initialise WebSocket connection and event handlers
|
174 |
+
// Initialise WebSocket connection and event handlers.
// Text frames carry JSON "text_chunk" messages (transcript); binary frames
// carry MP3 audio handed to playAudio().
function initWebSocket() {
    // Match the page's scheme: ws:// over http, wss:// over https.
    // (The original hard-coded wss://, which fails on plain-http local dev.)
    const wsScheme = window.location.protocol === 'https:' ? 'wss' : 'ws';
    const wsUrl = `${wsScheme}://${window.location.host}/narrate`;
    console.log(wsUrl);
    ws = new WebSocket(wsUrl);
    ws.binaryType = 'arraybuffer'; // Important for audio data

    ws.onopen = () => {
        console.log("WebSocket connection opened.");
        // Now safe to send messages
    };

    ws.onmessage = (event) => {
        if (typeof event.data === "string") {
            const message = JSON.parse(event.data);
            if (message.type === "text_chunk") {
                let feedbackElement = document.getElementById('feedback');
                // One <p> per picture: reuse it if this picture already has one.
                let p = document.querySelector(`p[data-picture-count="${message.pictureCount}"]`);
                if (!p) {
                    p = document.createElement('p');
                    const timestamp = new Date().toLocaleTimeString();
                    p.setAttribute('data-picture-count', message.pictureCount);
                    p.innerHTML = `<strong>[${timestamp}] [Picture ${message.pictureCount}] [${message.voiceName}]</strong> `;
                    feedbackElement.appendChild(p);
                }
                // NOTE(review): model text is inserted via innerHTML — any
                // markup in the description would be interpreted as HTML.
                p.innerHTML += `${message.data}`;
                feedbackElement.scrollTop = feedbackElement.scrollHeight;
            }
        } else {
            playAudio(event.data); // binary frame = MP3 audio chunk
        }
    };

    ws.onerror = (error) => {
        console.error("WebSocket error:", error);
    };

    ws.onclose = () => {
        console.log("WebSocket connection closed.");
    };
}
|
213 |
+
|
214 |
+
// Number of pictures captured so far (displayed in the counter; declared
// before any handler that increments it can run).
let pictureCount = 0;

// Add event listener to the start button for capturing and analysing the image.
// NOTE: the original file registered this listener AND called initWebSocket()
// twice, which captured two frames per click and opened a second, leaked
// WebSocket — each is now done exactly once.
document.getElementById('start-btn').addEventListener('click', captureAndAnalyseImage);

// Initialise WebSocket connection
initWebSocket();

let continuousNarrationInterval; // Holds the interval ID for continuous narration

// Continuous-narration toggle: capture immediately, then every 5 seconds
// while checked; stop the interval when unchecked.
document.getElementById('continuous-narrate-toggle').addEventListener('change', function() {
    if (this.checked) {
        if (!selectedVoiceId) {
            document.getElementById('feedback').textContent = 'Please select a voice before narrating.';
            document.getElementById('feedback').classList.add('error');
            this.checked = false;
            return;
        }
        captureAndAnalyseImage(); // Send the first image immediately
        if (!continuousNarrationInterval) {
            continuousNarrationInterval = setInterval(captureAndAnalyseImage, 5000); // 5-second delay for subsequent images
        }
    } else {
        if (continuousNarrationInterval) {
            clearInterval(continuousNarrationInterval);
            continuousNarrationInterval = null;
        }
    }
});
|
templates/main.html
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>AI Image Narrator</title>
    <link rel="stylesheet" href="/static/css/style.css">
    <link rel="apple-touch-icon" sizes="180x180" href="/static/assets/apple-touch-icon.png">
    <link rel="icon" type="image/png" sizes="32x32" href="/static/assets/favicon-32x32.png">
    <link rel="icon" type="image/png" sizes="16x16" href="/static/assets/favicon-16x16.png">
    <link rel="manifest" href="/static/assets/site.webmanifest">
</head>
<body>
    <!-- Live camera preview; main.js attaches the MediaStream. -->
    <video id="camera-feed" autoplay></video>
    <!-- Horizontal strip of captured thumbnails, filled by main.js. -->
    <div id="captured-images" style="display: flex; overflow-x: auto; margin-top: 20px;"></div>
    <!-- Narrator picker: data-voice-id is the ElevenLabs voice id,
         data-voice-name the persona sent to the model. The visible labels
         are deliberately altered parodies of the data-voice-name values. -->
    <div id="voice-selection">
        <button class="voice-btn" data-voice-id="0SQfBfjRCI4jQdnyrF5B" data-voice-name="Michael Caine">Michael Kaine π©</button>
        <button class="voice-btn" data-voice-id="4c42HvUOZ0L0feAu3r5C" data-voice-name="David Attenborough">Daniel Attenborough π</button>
        <button class="voice-btn" data-voice-id="DFtRVeaAE1d7V4uhxFcF" data-voice-name="Stephen Fry">Stephon Fry π</button>
        <button class="voice-btn" data-voice-id="K8sG6kT7jA4WnERxh8vd" data-voice-name="Morgan Freeman">Morgan Free π€</button>
        <button class="voice-btn" data-voice-id="WiXK0UI5GPQ98IYxy8he" data-voice-name="Joanna Lumley">Johanna Lumly π</button>
        <button class="voice-btn" data-voice-id="bnvSNcvmOz9I0VhuOh58" data-voice-name="John Cleese">Jon Cheese π§</button>
        <button class="voice-btn" data-voice-id="g5Qp5bT7Dm1TIJecJuds" data-voice-name="Judi Dench">Judy Drench π</button>
        <button class="voice-btn" data-voice-id="w642gnqphLNLyM1zH2eI" data-voice-name="Richard Hammond">Richard Hamed π</button>
    </div>
    <div id="picture-counter">Pictures taken: 0</div>
    <!-- Controls row: single capture, continuous toggle, camera switch. -->
    <div style="display: flex; justify-content: center; align-items: center; gap: 10px; margin-top: 10px;">
        <button id="start-btn">Single Narrate</button>
        <label class="switch">
            <input type="checkbox" id="continuous-narrate-toggle">
            <span class="slider round"></span>
        </label>
        <span>Continuously Narrate</span>
        <button id="toggle-camera-btn">Toggle Camera</button>
    </div>
    <!-- Transcript / status area populated by WebSocket messages. -->
    <div id="feedback"></div>
    <script src="/static/js/main.js"></script>
</body>
</html>
|