Spaces:
Running
Running
Update utils.py
Browse files
utils.py
CHANGED
@@ -16,7 +16,6 @@ from groq import Groq
|
|
16 |
import numpy as np
|
17 |
import torch
|
18 |
import random
|
19 |
-
from num2words import num2words # For robust number-to-words conversion
|
20 |
|
21 |
class DialogueItem(BaseModel):
|
22 |
speaker: Literal["Jane", "John"] # TTS voice
|
@@ -266,36 +265,56 @@ def fetch_article_text(link: str) -> str:
|
|
266 |
print(f"[ERROR] Error fetching article text: {e}")
|
267 |
return ""
|
268 |
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
print("[LOG] Generating script with tone:", tone, "and length:", target_length)
|
274 |
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
275 |
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
"10-20 Mins": (1500, 3000)
|
282 |
-
}
|
283 |
-
min_words, max_words = length_mapping.get(target_length, (200, 450))
|
284 |
|
285 |
-
|
|
|
|
|
|
|
286 |
"Humorous": "funny and exciting, makes people chuckle",
|
287 |
"Formal": "business-like, well-structured, professional",
|
288 |
"Casual": "like a conversation between close friends, relaxed and informal",
|
289 |
"Youthful": "like how teenagers might chat, energetic and lively"
|
290 |
}
|
291 |
-
chosen_tone =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
293 |
-
# Construct prompt
|
294 |
prompt = (
|
295 |
f"{system_prompt}\n"
|
296 |
f"TONE: {chosen_tone}\n"
|
297 |
-
f"TARGET LENGTH: {target_length} ({min_words}-{max_words} words)\n"
|
298 |
f"INPUT TEXT: {input_text}\n\n"
|
|
|
299 |
"Please provide the output in the following JSON format without any additional text:\n\n"
|
300 |
"{\n"
|
301 |
' "dialogue": [\n'
|
@@ -325,33 +344,46 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
|
|
325 |
raise ValueError(f"Error communicating with Groq API: {str(e)}")
|
326 |
|
327 |
raw_content = response.choices[0].message.content.strip()
|
328 |
-
# Attempt to parse JSON
|
329 |
start_index = raw_content.find('{')
|
330 |
end_index = raw_content.rfind('}')
|
331 |
if start_index == -1 or end_index == -1:
|
332 |
raise ValueError("Failed to parse dialogue: No JSON found.")
|
333 |
|
334 |
json_str = raw_content[start_index:end_index+1].strip()
|
|
|
335 |
try:
|
336 |
data = json.loads(json_str)
|
337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
338 |
except Exception as e:
|
339 |
print("[ERROR] JSON decoding failed:", e)
|
340 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
341 |
|
342 |
-
|
343 |
-
# REPLACE the YTDLP-based approach with the RapidAPI approach
|
344 |
-
# ----------------------------------------------------------------------
|
345 |
def transcribe_youtube_video(video_url: str) -> str:
|
346 |
-
"""
|
347 |
-
Transcribe the given YouTube video by calling the RapidAPI 'youtube-transcriptor' endpoint.
|
348 |
-
1) Extract the 11-char video ID from the YouTube URL.
|
349 |
-
2) Call the RapidAPI endpoint (lang=en).
|
350 |
-
3) Parse and extract 'transcriptionAsText' from the response.
|
351 |
-
4) Return that transcript as a string.
|
352 |
-
"""
|
353 |
print("[LOG] Transcribing YouTube video via RapidAPI:", video_url)
|
354 |
-
# Extract video ID
|
355 |
video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
|
356 |
if not video_id_match:
|
357 |
raise ValueError(f"Invalid YouTube URL: {video_url}, cannot extract video ID.")
|
@@ -372,7 +404,7 @@ def transcribe_youtube_video(video_url: str) -> str:
|
|
372 |
try:
|
373 |
response = requests.get(base_url, headers=headers, params=params, timeout=30)
|
374 |
print("[LOG] RapidAPI Response Status Code:", response.status_code)
|
375 |
-
print("[LOG] RapidAPI Response Body:", response.text)
|
376 |
|
377 |
if response.status_code != 200:
|
378 |
raise ValueError(f"RapidAPI transcription error: {response.status_code}, {response.text}")
|
@@ -381,19 +413,13 @@ def transcribe_youtube_video(video_url: str) -> str:
|
|
381 |
if not isinstance(data, list) or not data:
|
382 |
raise ValueError(f"Unexpected transcript format or empty transcript: {data}")
|
383 |
|
384 |
-
# Extract 'transcriptionAsText'
|
385 |
transcript_as_text = data[0].get('transcriptionAsText', '').strip()
|
386 |
if not transcript_as_text:
|
387 |
raise ValueError("transcriptionAsText field is missing or empty.")
|
388 |
|
389 |
print("[LOG] Transcript retrieval successful.")
|
390 |
print(f"[DEBUG] Transcript Length: {len(transcript_as_text)} characters.")
|
391 |
-
|
392 |
-
# Optionally, print a snippet of the transcript
|
393 |
-
if len(transcript_as_text) > 200:
|
394 |
-
snippet = transcript_as_text[:200] + "..."
|
395 |
-
else:
|
396 |
-
snippet = transcript_as_text
|
397 |
print(f"[DEBUG] Transcript Snippet: {snippet}")
|
398 |
|
399 |
return transcript_as_text
|
@@ -405,17 +431,16 @@ def transcribe_youtube_video(video_url: str) -> str:
|
|
405 |
def generate_audio_mp3(text: str, speaker: str) -> str:
|
406 |
"""
|
407 |
Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
|
408 |
-
We also do some pre-processing for punctuation, abbreviations,
|
|
|
409 |
"""
|
410 |
try:
|
411 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
412 |
-
|
413 |
-
# Preprocess text with speaker context
|
414 |
processed_text = _preprocess_text_for_tts(text, speaker)
|
415 |
|
416 |
deepgram_api_url = "https://api.deepgram.com/v1/speak"
|
417 |
params = {
|
418 |
-
"model": "aura-asteria-en", # default
|
419 |
}
|
420 |
if speaker == "John":
|
421 |
params["model"] = "aura-zeus-en"
|
@@ -459,94 +484,73 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
|
|
459 |
raise ValueError(f"Error generating audio: {str(e)}")
|
460 |
|
461 |
def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
|
462 |
-
"""
|
463 |
-
Original ytdlp-based approach for local transcription.
|
464 |
-
No longer used, but kept for reference.
|
465 |
-
"""
|
466 |
pass
|
467 |
|
468 |
-
# ---------------------------------------------------------------------
|
469 |
-
# TEXT PRE-PROCESSING FOR NATURAL TTS (punctuation, abbreviations, etc.)
|
470 |
-
# ---------------------------------------------------------------------
|
471 |
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
472 |
"""
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
"""
|
477 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
478 |
text = re.sub(r"-", " ", text)
|
479 |
|
480 |
-
#
|
481 |
-
def convert_decimal(m):
|
482 |
-
number_str = m.group()
|
483 |
-
parts = number_str.split('.')
|
484 |
-
whole_part = _spell_digits(parts[0])
|
485 |
-
decimal_part = " ".join(_spell_digits(d) for d in parts[1])
|
486 |
-
return f"{whole_part} point {decimal_part}"
|
487 |
-
|
488 |
-
text = re.sub(r"\d+\.\d+", convert_decimal, text)
|
489 |
-
|
490 |
-
# 3) Abbreviations (e.g., NASA -> N A S A, MPs -> M Peas)
|
491 |
-
def expand_abbreviations(match):
|
492 |
-
abbrev = match.group()
|
493 |
-
# Check if it's a plural abbreviation
|
494 |
-
if abbrev.endswith('s') and abbrev[:-1].isupper():
|
495 |
-
singular = abbrev[:-1]
|
496 |
-
expanded = " ".join(list(singular)) + "s" # Append 's' to the expanded form
|
497 |
-
# Handle specific plural forms
|
498 |
-
specific_plural = {
|
499 |
-
"MPs": "M Peas",
|
500 |
-
"TMTs": "T M Tees",
|
501 |
-
"ARJs": "A R Jays",
|
502 |
-
# Add more as needed
|
503 |
-
}
|
504 |
-
return specific_plural.get(abbrev, expanded)
|
505 |
-
else:
|
506 |
-
return " ".join(list(abbrev))
|
507 |
-
|
508 |
-
# Regex to match abbreviations (all uppercase letters, possibly ending with 's')
|
509 |
-
text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
|
510 |
|
511 |
-
#
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
# text = re.sub(r"\?(\s|$)", r"?...\1", text)
|
516 |
|
517 |
-
#
|
518 |
if speaker != "Jane":
|
519 |
def insert_thinking_pause(m):
|
520 |
word = m.group(1)
|
521 |
-
|
522 |
-
if random.random() < 0.3: # 30% chance
|
523 |
filler = random.choice(['hmm,', 'well,', 'let me see,'])
|
524 |
return f"{word}..., {filler}"
|
525 |
else:
|
526 |
return f"{word}...,"
|
527 |
-
|
528 |
keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
|
529 |
text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
|
530 |
|
531 |
-
|
532 |
-
|
533 |
-
conjunctions_pattern = r"\b(and|but|so|because|however)\b"
|
534 |
-
text = re.sub(conjunctions_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
|
535 |
|
536 |
-
#
|
537 |
text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
|
538 |
|
539 |
-
#
|
540 |
-
def capitalize_match(
|
541 |
-
return
|
542 |
-
|
543 |
text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
|
544 |
|
545 |
return text.strip()
|
546 |
|
547 |
def _spell_digits(d: str) -> str:
|
548 |
"""
|
549 |
-
Convert digits '3' -> 'three'
|
550 |
"""
|
551 |
digit_map = {
|
552 |
'0': 'zero',
|
@@ -562,23 +566,25 @@ def _spell_digits(d: str) -> str:
|
|
562 |
}
|
563 |
return " ".join(digit_map[ch] for ch in d if ch in digit_map)
|
564 |
|
565 |
-
def mix_with_bg_music(spoken: AudioSegment) -> AudioSegment:
|
566 |
"""
|
567 |
-
Mixes 'spoken' with bg_music.mp3
|
568 |
1) Start with 2 seconds of music alone before speech begins.
|
569 |
2) Loop the music if it's shorter than the final audio length.
|
570 |
-
3) Lower
|
571 |
"""
|
572 |
-
|
|
|
|
|
|
|
573 |
|
574 |
try:
|
575 |
-
bg_music = AudioSegment.from_file(
|
576 |
except Exception as e:
|
577 |
print("[ERROR] Failed to load background music:", e)
|
578 |
return spoken
|
579 |
|
580 |
-
|
581 |
-
bg_music = bg_music - 18.0 # Lower volume (e.g. -18 dB)
|
582 |
|
583 |
total_length_ms = len(spoken) + 2000
|
584 |
looped_music = AudioSegment.empty()
|
@@ -586,8 +592,28 @@ def mix_with_bg_music(spoken: AudioSegment) -> AudioSegment:
|
|
586 |
looped_music += bg_music
|
587 |
|
588 |
looped_music = looped_music[:total_length_ms]
|
589 |
-
|
590 |
-
# Overlay spoken at 2000ms so we get 2s of music first
|
591 |
final_mix = looped_music.overlay(spoken, position=2000)
|
592 |
-
|
593 |
return final_mix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
import numpy as np
|
17 |
import torch
|
18 |
import random
|
|
|
19 |
|
20 |
class DialogueItem(BaseModel):
|
21 |
speaker: Literal["Jane", "John"] # TTS voice
|
|
|
265 |
print(f"[ERROR] Error fetching article text: {e}")
|
266 |
return ""
|
267 |
|
268 |
+
def generate_script(
|
269 |
+
system_prompt: str,
|
270 |
+
input_text: str,
|
271 |
+
tone: str,
|
272 |
+
target_length: str,
|
273 |
+
host_name: str = "Jane",
|
274 |
+
guest_name: str = "John",
|
275 |
+
sponsor_style: str = "Separate Break"
|
276 |
+
):
|
277 |
+
"""
|
278 |
+
Sends the system_prompt plus input_text to the Groq LLM to generate a
|
279 |
+
multi-speaker Dialogue in JSON, returning a Dialogue object.
|
280 |
+
"""
|
281 |
print("[LOG] Generating script with tone:", tone, "and length:", target_length)
|
282 |
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
283 |
|
284 |
+
words_per_minute = 150
|
285 |
+
numeric_minutes = 3
|
286 |
+
match = re.search(r"(\d+)", target_length)
|
287 |
+
if match:
|
288 |
+
numeric_minutes = int(match.group(1))
|
|
|
|
|
|
|
289 |
|
290 |
+
min_words = max(50, numeric_minutes * 100)
|
291 |
+
max_words = numeric_minutes * words_per_minute
|
292 |
+
|
293 |
+
tone_map = {
|
294 |
"Humorous": "funny and exciting, makes people chuckle",
|
295 |
"Formal": "business-like, well-structured, professional",
|
296 |
"Casual": "like a conversation between close friends, relaxed and informal",
|
297 |
"Youthful": "like how teenagers might chat, energetic and lively"
|
298 |
}
|
299 |
+
chosen_tone = tone_map.get(tone, "casual")
|
300 |
+
|
301 |
+
if sponsor_style == "Separate Break":
|
302 |
+
sponsor_instructions = (
|
303 |
+
"If sponsor content is provided, include it in a separate ad break (~30 seconds). "
|
304 |
+
"Use phrasing like 'Now a word from our sponsor...' and end with 'Back to the show' or similar."
|
305 |
+
)
|
306 |
+
else:
|
307 |
+
sponsor_instructions = (
|
308 |
+
"If sponsor content is provided, blend it naturally (~30 seconds) into the conversation. "
|
309 |
+
"Avoid abrupt transitions."
|
310 |
+
)
|
311 |
|
|
|
312 |
prompt = (
|
313 |
f"{system_prompt}\n"
|
314 |
f"TONE: {chosen_tone}\n"
|
315 |
+
f"TARGET LENGTH: {target_length} (~{min_words}-{max_words} words)\n"
|
316 |
f"INPUT TEXT: {input_text}\n\n"
|
317 |
+
f"# Sponsor Style Instruction:\n{sponsor_instructions}\n\n"
|
318 |
"Please provide the output in the following JSON format without any additional text:\n\n"
|
319 |
"{\n"
|
320 |
' "dialogue": [\n'
|
|
|
344 |
raise ValueError(f"Error communicating with Groq API: {str(e)}")
|
345 |
|
346 |
raw_content = response.choices[0].message.content.strip()
|
|
|
347 |
start_index = raw_content.find('{')
|
348 |
end_index = raw_content.rfind('}')
|
349 |
if start_index == -1 or end_index == -1:
|
350 |
raise ValueError("Failed to parse dialogue: No JSON found.")
|
351 |
|
352 |
json_str = raw_content[start_index:end_index+1].strip()
|
353 |
+
|
354 |
try:
|
355 |
data = json.loads(json_str)
|
356 |
+
dialogue_list = data.get("dialogue", [])
|
357 |
+
|
358 |
+
for d in dialogue_list:
|
359 |
+
raw_speaker = d.get("speaker", "Jane")
|
360 |
+
if raw_speaker.lower() == host_name.lower():
|
361 |
+
d["speaker"] = "Jane"
|
362 |
+
d["display_speaker"] = host_name
|
363 |
+
elif raw_speaker.lower() == guest_name.lower():
|
364 |
+
d["speaker"] = "John"
|
365 |
+
d["display_speaker"] = guest_name
|
366 |
+
else:
|
367 |
+
d["speaker"] = "Jane"
|
368 |
+
d["display_speaker"] = raw_speaker
|
369 |
+
|
370 |
+
new_dialogue_items = []
|
371 |
+
for d in dialogue_list:
|
372 |
+
if "display_speaker" not in d:
|
373 |
+
d["display_speaker"] = d["speaker"]
|
374 |
+
new_dialogue_items.append(DialogueItem(**d))
|
375 |
+
|
376 |
+
return Dialogue(dialogue=new_dialogue_items)
|
377 |
+
except json.JSONDecodeError as e:
|
378 |
+
print("[ERROR] JSON decoding (format) failed:", e)
|
379 |
+
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
380 |
except Exception as e:
|
381 |
print("[ERROR] JSON decoding failed:", e)
|
382 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
383 |
|
384 |
+
|
|
|
|
|
385 |
def transcribe_youtube_video(video_url: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
386 |
print("[LOG] Transcribing YouTube video via RapidAPI:", video_url)
|
|
|
387 |
video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
|
388 |
if not video_id_match:
|
389 |
raise ValueError(f"Invalid YouTube URL: {video_url}, cannot extract video ID.")
|
|
|
404 |
try:
|
405 |
response = requests.get(base_url, headers=headers, params=params, timeout=30)
|
406 |
print("[LOG] RapidAPI Response Status Code:", response.status_code)
|
407 |
+
print("[LOG] RapidAPI Response Body:", response.text)
|
408 |
|
409 |
if response.status_code != 200:
|
410 |
raise ValueError(f"RapidAPI transcription error: {response.status_code}, {response.text}")
|
|
|
413 |
if not isinstance(data, list) or not data:
|
414 |
raise ValueError(f"Unexpected transcript format or empty transcript: {data}")
|
415 |
|
|
|
416 |
transcript_as_text = data[0].get('transcriptionAsText', '').strip()
|
417 |
if not transcript_as_text:
|
418 |
raise ValueError("transcriptionAsText field is missing or empty.")
|
419 |
|
420 |
print("[LOG] Transcript retrieval successful.")
|
421 |
print(f"[DEBUG] Transcript Length: {len(transcript_as_text)} characters.")
|
422 |
+
snippet = transcript_as_text[:200] + "..." if len(transcript_as_text) > 200 else transcript_as_text
|
|
|
|
|
|
|
|
|
|
|
423 |
print(f"[DEBUG] Transcript Snippet: {snippet}")
|
424 |
|
425 |
return transcript_as_text
|
|
|
431 |
def generate_audio_mp3(text: str, speaker: str) -> str:
|
432 |
"""
|
433 |
Calls Deepgram TTS with the text, returning a path to a temp MP3 file.
|
434 |
+
We also do some pre-processing for punctuation, abbreviations, numeric expansions,
|
435 |
+
plus emotive expressions (ha, sigh, etc.).
|
436 |
"""
|
437 |
try:
|
438 |
print(f"[LOG] Generating audio for speaker: {speaker}")
|
|
|
|
|
439 |
processed_text = _preprocess_text_for_tts(text, speaker)
|
440 |
|
441 |
deepgram_api_url = "https://api.deepgram.com/v1/speak"
|
442 |
params = {
|
443 |
+
"model": "aura-asteria-en", # female by default
|
444 |
}
|
445 |
if speaker == "John":
|
446 |
params["model"] = "aura-zeus-en"
|
|
|
484 |
raise ValueError(f"Error generating audio: {str(e)}")
|
485 |
|
486 |
def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
    # Deprecated stub: the original yt-dlp-based local transcription approach.
    # Superseded by transcribe_youtube_video(), which calls the RapidAPI
    # 'youtube-transcriptor' endpoint; kept only so old references don't break.
    pass
|
488 |
|
|
|
|
|
|
|
489 |
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
490 |
"""
|
491 |
+
1) "SaaS" => "sass"
|
492 |
+
2) Insert periods for uppercase abbreviations -> remove for TTS
|
493 |
+
3) Convert decimals like "3.14" -> "three point one four"
|
494 |
+
4) Convert pure integer numbers like "20" -> "twenty"
|
495 |
+
5) Expand leftover all-caps
|
496 |
+
6) Emotive placeholders for 'ha', 'haha', 'sigh', 'groan', etc.
|
497 |
+
7) If speaker != Jane, insert filler words
|
498 |
+
8) Remove random fillers
|
499 |
+
9) Capitalize sentence starts
|
500 |
+
"""
|
501 |
+
# 1) "SaaS" => "sass"
|
502 |
+
text = re.sub(r"\b(?i)SaaS\b", "sass", text)
|
503 |
+
|
504 |
+
# 2) Insert periods in uppercase abbreviations (>=2 chars), then remove them
|
505 |
+
def insert_periods_for_abbrev(m):
|
506 |
+
abbr = m.group(0)
|
507 |
+
parted = ".".join(list(abbr)) + "."
|
508 |
+
return parted
|
509 |
+
text = re.sub(r"\b([A-Z0-9]{2,})\b", insert_periods_for_abbrev, text)
|
510 |
+
text = re.sub(r"\.\.", ".", text)
|
511 |
+
def remove_periods_for_tts(m):
|
512 |
+
chunk = m.group(0)
|
513 |
+
return chunk.replace(".", " ").strip()
|
514 |
+
text = re.sub(r"[A-Z0-9]\.[A-Z0-9](?:\.[A-Z0-9])*\.", remove_periods_for_tts, text)
|
515 |
+
|
516 |
+
# 3) Hyphens -> spaces
|
517 |
text = re.sub(r"-", " ", text)
|
518 |
|
519 |
+
# Removed numeric conversions to let TTS handle numbers naturally.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
520 |
|
521 |
+
# 6) Emotive placeholders
|
522 |
+
text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
|
523 |
+
text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
|
524 |
+
text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
|
|
|
525 |
|
526 |
+
# 7) Insert filler words if speaker != "Jane"
|
527 |
if speaker != "Jane":
|
528 |
def insert_thinking_pause(m):
|
529 |
word = m.group(1)
|
530 |
+
if random.random() < 0.3:
|
|
|
531 |
filler = random.choice(['hmm,', 'well,', 'let me see,'])
|
532 |
return f"{word}..., {filler}"
|
533 |
else:
|
534 |
return f"{word}...,"
|
|
|
535 |
keywords_pattern = r"\b(important|significant|crucial|point|topic)\b"
|
536 |
text = re.sub(keywords_pattern, insert_thinking_pause, text, flags=re.IGNORECASE)
|
537 |
|
538 |
+
conj_pattern = r"\b(and|but|so|because|however)\b"
|
539 |
+
text = re.sub(conj_pattern, lambda m: f"{m.group()}...", text, flags=re.IGNORECASE)
|
|
|
|
|
540 |
|
541 |
+
# 8) Remove random fillers
|
542 |
text = re.sub(r"\b(uh|um|ah)\b", "", text, flags=re.IGNORECASE)
|
543 |
|
544 |
+
# 9) Capitalize sentence starts
|
545 |
+
def capitalize_match(m):
|
546 |
+
return m.group().upper()
|
|
|
547 |
text = re.sub(r'(^\s*\w)|([.!?]\s*\w)', capitalize_match, text)
|
548 |
|
549 |
return text.strip()
|
550 |
|
551 |
def _spell_digits(d: str) -> str:
|
552 |
"""
|
553 |
+
Convert individual digits '3' -> 'three'.
|
554 |
"""
|
555 |
digit_map = {
|
556 |
'0': 'zero',
|
|
|
566 |
}
|
567 |
return " ".join(digit_map[ch] for ch in d if ch in digit_map)
|
568 |
|
569 |
+
def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
|
570 |
"""
|
571 |
+
Mixes 'spoken' with a default bg_music.mp3 or user-provided custom music:
|
572 |
1) Start with 2 seconds of music alone before speech begins.
|
573 |
2) Loop the music if it's shorter than the final audio length.
|
574 |
+
3) Lower music volume so the speech is clear.
|
575 |
"""
|
576 |
+
if custom_music_path:
|
577 |
+
music_path = custom_music_path
|
578 |
+
else:
|
579 |
+
music_path = "bg_music.mp3"
|
580 |
|
581 |
try:
|
582 |
+
bg_music = AudioSegment.from_file(music_path, format="mp3")
|
583 |
except Exception as e:
|
584 |
print("[ERROR] Failed to load background music:", e)
|
585 |
return spoken
|
586 |
|
587 |
+
bg_music = bg_music - 18.0
|
|
|
588 |
|
589 |
total_length_ms = len(spoken) + 2000
|
590 |
looped_music = AudioSegment.empty()
|
|
|
592 |
looped_music += bg_music
|
593 |
|
594 |
looped_music = looped_music[:total_length_ms]
|
|
|
|
|
595 |
final_mix = looped_music.overlay(spoken, position=2000)
|
|
|
596 |
return final_mix
|
597 |
+
|
598 |
+
# New helper for short Q&A calls.
def call_groq_api_for_qa(system_prompt: str) -> str:
    """
    Minimal LLM call used for short Q&A turns.

    Sends `system_prompt` to the Groq chat-completions endpoint and returns
    the raw response text, which must be a JSON string such as:
        {"speaker": "John", "text": "Short answer here"}
    On any API failure, a canned fallback answer is returned as JSON instead
    of raising.
    """
    client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        completion = client.chat.completions.create(
            messages=[{"role": "system", "content": system_prompt}],
            model="llama-3.3-70b-versatile",
            max_tokens=512,
            temperature=0.7
        )
    except Exception as exc:
        print("[ERROR] Groq API error:", exc)
        fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
        return json.dumps(fallback)

    return completion.choices[0].message.content.strip()
|