Spaces:
Running
Running
Update utils.py
Browse files
utils.py
CHANGED
@@ -165,7 +165,7 @@ def research_topic(topic: str) -> str:
|
|
165 |
print("[DEBUG] Aggregated info from primary sources:")
|
166 |
print(aggregated_info)
|
167 |
|
168 |
-
#
|
169 |
if not is_sufficient(aggregated_info):
|
170 |
print("[LOG] Insufficient info from primary sources. Fallback to LLM.")
|
171 |
additional_info = query_llm_for_additional_info(topic, aggregated_info)
|
@@ -277,9 +277,6 @@ def generate_script(
|
|
277 |
"""
|
278 |
Sends the system_prompt plus input_text to the Groq LLM to generate a
|
279 |
multi-speaker Dialogue in JSON, returning a Dialogue object.
|
280 |
-
|
281 |
-
sponsor_style can be "Separate Break" or "Blended".
|
282 |
-
We add instructions telling the model how to integrate the sponsor content.
|
283 |
"""
|
284 |
print("[LOG] Generating script with tone:", tone, "and length:", target_length)
|
285 |
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
@@ -306,7 +303,7 @@ def generate_script(
|
|
306 |
"If sponsor content is provided, include it in a separate ad break (~30 seconds). "
|
307 |
"Use phrasing like 'Now a word from our sponsor...' and end with 'Back to the show' or similar."
|
308 |
)
|
309 |
-
else:
|
310 |
sponsor_instructions = (
|
311 |
"If sponsor content is provided, blend it naturally (~30 seconds) into the conversation. "
|
312 |
"Avoid abrupt transitions."
|
@@ -377,7 +374,6 @@ def generate_script(
|
|
377 |
new_dialogue_items.append(DialogueItem(**d))
|
378 |
|
379 |
return Dialogue(dialogue=new_dialogue_items)
|
380 |
-
|
381 |
except json.JSONDecodeError as e:
|
382 |
print("[ERROR] JSON decoding (format) failed:", e)
|
383 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
@@ -472,7 +468,7 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
|
|
472 |
mp3_file.write(chunk)
|
473 |
mp3_path = mp3_file.name
|
474 |
|
475 |
-
# Normalize
|
476 |
audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
|
477 |
audio_seg = effects.normalize(audio_seg)
|
478 |
|
@@ -493,7 +489,7 @@ def transcribe_youtube_video_OLD_YTDLP(video_url: str) -> str:
|
|
493 |
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
494 |
"""
|
495 |
1) "SaaS" => "sass"
|
496 |
-
2) Insert periods
|
497 |
3) Convert decimals like "3.14" -> "three point one four"
|
498 |
4) Convert pure integer numbers like "20" -> "twenty"
|
499 |
5) Expand leftover all-caps
|
@@ -505,7 +501,7 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
|
505 |
# 1) "SaaS" => "sass"
|
506 |
text = re.sub(r"\b(?i)SaaS\b", "sass", text)
|
507 |
|
508 |
-
# 2) Insert periods
|
509 |
def insert_periods_for_abbrev(m):
|
510 |
abbr = m.group(0)
|
511 |
parted = ".".join(list(abbr)) + "."
|
@@ -552,14 +548,11 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
|
552 |
text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
|
553 |
|
554 |
# 7) Emotive placeholders
|
555 |
-
# "haha", "ha", "heh", "lol" => "(* laughs *)"
|
556 |
text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
|
557 |
-
# "sigh" => "(* sighs *)"
|
558 |
text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
|
559 |
-
# "groan", "moan" => "(* groans *)"
|
560 |
text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
|
561 |
|
562 |
-
# 8) Insert filler words if speaker != Jane
|
563 |
if speaker != "Jane":
|
564 |
def insert_thinking_pause(m):
|
565 |
word = m.group(1)
|
@@ -584,9 +577,27 @@ def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
|
584 |
|
585 |
return text.strip()
|
586 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
587 |
def number_to_words(n: int) -> str:
|
588 |
"""
|
589 |
-
Basic integer-to-words up to ~99999.
|
590 |
For a robust approach, consider the 'num2words' library.
|
591 |
"""
|
592 |
if n == 0:
|
@@ -637,8 +648,8 @@ def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegm
|
|
637 |
"""
|
638 |
Mixes 'spoken' with a default bg_music.mp3 or user-provided custom music:
|
639 |
1) Start with 2 seconds of music alone before speech begins.
|
640 |
-
2) Loop music if shorter than final audio length.
|
641 |
-
3) Lower music volume so speech is clear.
|
642 |
"""
|
643 |
if custom_music_path:
|
644 |
music_path = custom_music_path
|
@@ -661,3 +672,26 @@ def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegm
|
|
661 |
looped_music = looped_music[:total_length_ms]
|
662 |
final_mix = looped_music.overlay(spoken, position=2000)
|
663 |
return final_mix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
print("[DEBUG] Aggregated info from primary sources:")
|
166 |
print(aggregated_info)
|
167 |
|
168 |
+
# If not enough data, fallback to LLM
|
169 |
if not is_sufficient(aggregated_info):
|
170 |
print("[LOG] Insufficient info from primary sources. Fallback to LLM.")
|
171 |
additional_info = query_llm_for_additional_info(topic, aggregated_info)
|
|
|
277 |
"""
|
278 |
Sends the system_prompt plus input_text to the Groq LLM to generate a
|
279 |
multi-speaker Dialogue in JSON, returning a Dialogue object.
|
|
|
|
|
|
|
280 |
"""
|
281 |
print("[LOG] Generating script with tone:", tone, "and length:", target_length)
|
282 |
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
|
|
|
303 |
"If sponsor content is provided, include it in a separate ad break (~30 seconds). "
|
304 |
"Use phrasing like 'Now a word from our sponsor...' and end with 'Back to the show' or similar."
|
305 |
)
|
306 |
+
else:
|
307 |
sponsor_instructions = (
|
308 |
"If sponsor content is provided, blend it naturally (~30 seconds) into the conversation. "
|
309 |
"Avoid abrupt transitions."
|
|
|
374 |
new_dialogue_items.append(DialogueItem(**d))
|
375 |
|
376 |
return Dialogue(dialogue=new_dialogue_items)
|
|
|
377 |
except json.JSONDecodeError as e:
|
378 |
print("[ERROR] JSON decoding (format) failed:", e)
|
379 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
|
|
468 |
mp3_file.write(chunk)
|
469 |
mp3_path = mp3_file.name
|
470 |
|
471 |
+
# Normalize volume
|
472 |
audio_seg = AudioSegment.from_file(mp3_path, format="mp3")
|
473 |
audio_seg = effects.normalize(audio_seg)
|
474 |
|
|
|
489 |
def _preprocess_text_for_tts(text: str, speaker: str) -> str:
|
490 |
"""
|
491 |
1) "SaaS" => "sass"
|
492 |
+
2) Insert periods for uppercase abbreviations -> remove for TTS
|
493 |
3) Convert decimals like "3.14" -> "three point one four"
|
494 |
4) Convert pure integer numbers like "20" -> "twenty"
|
495 |
5) Expand leftover all-caps
|
|
|
501 |
# 1) "SaaS" => "sass"
|
502 |
text = re.sub(r"\b(?i)SaaS\b", "sass", text)
|
503 |
|
504 |
+
# 2) Insert periods in uppercase abbreviations (>=2 chars), then remove them
|
505 |
def insert_periods_for_abbrev(m):
|
506 |
abbr = m.group(0)
|
507 |
parted = ".".join(list(abbr)) + "."
|
|
|
548 |
text = re.sub(r"\b[A-Z]{2,}s?\b", expand_abbreviations, text)
|
549 |
|
550 |
# 7) Emotive placeholders
|
|
|
551 |
text = re.sub(r"\b(ha(ha)?|heh|lol)\b", "(* laughs *)", text, flags=re.IGNORECASE)
|
|
|
552 |
text = re.sub(r"\bsigh\b", "(* sighs *)", text, flags=re.IGNORECASE)
|
|
|
553 |
text = re.sub(r"\b(groan|moan)\b", "(* groans *)", text, flags=re.IGNORECASE)
|
554 |
|
555 |
+
# 8) Insert filler words if speaker != "Jane"
|
556 |
if speaker != "Jane":
|
557 |
def insert_thinking_pause(m):
|
558 |
word = m.group(1)
|
|
|
577 |
|
578 |
return text.strip()
|
579 |
|
580 |
+
def _spell_digits(d: str) -> str:
|
581 |
+
"""
|
582 |
+
Convert individual digits '3' -> 'three'.
|
583 |
+
"""
|
584 |
+
digit_map = {
|
585 |
+
'0': 'zero',
|
586 |
+
'1': 'one',
|
587 |
+
'2': 'two',
|
588 |
+
'3': 'three',
|
589 |
+
'4': 'four',
|
590 |
+
'5': 'five',
|
591 |
+
'6': 'six',
|
592 |
+
'7': 'seven',
|
593 |
+
'8': 'eight',
|
594 |
+
'9': 'nine'
|
595 |
+
}
|
596 |
+
return " ".join(digit_map[ch] for ch in d if ch in digit_map)
|
597 |
+
|
598 |
def number_to_words(n: int) -> str:
|
599 |
"""
|
600 |
+
Basic integer-to-words up to ~99999.
|
601 |
For a robust approach, consider the 'num2words' library.
|
602 |
"""
|
603 |
if n == 0:
|
|
|
648 |
"""
|
649 |
Mixes 'spoken' with a default bg_music.mp3 or user-provided custom music:
|
650 |
1) Start with 2 seconds of music alone before speech begins.
|
651 |
+
2) Loop the music if it's shorter than the final audio length.
|
652 |
+
3) Lower music volume so the speech is clear.
|
653 |
"""
|
654 |
if custom_music_path:
|
655 |
music_path = custom_music_path
|
|
|
672 |
looped_music = looped_music[:total_length_ms]
|
673 |
final_mix = looped_music.overlay(spoken, position=2000)
|
674 |
return final_mix
|
675 |
+
|
676 |
+
# This function is new for short Q&A calls
|
677 |
+
def call_groq_api_for_qa(system_prompt: str) -> str:
    """
    A minimal placeholder for your short Q&A LLM call.

    Sends ``system_prompt`` as a single system message to the Groq chat
    completions API and returns the text of the first choice with
    surrounding whitespace stripped. The reply is expected to be a JSON
    string, e.g.:
        {"speaker": "John", "text": "Short answer here"}
    The reply is returned as-is; it is not parsed or validated here.

    If the API call raises, or the response carries no choices or no text
    content, a fixed apology attributed to "John" is returned as a JSON
    string instead of raising.

    Args:
        system_prompt: Full prompt text sent as the system message.

    Returns:
        The model's raw reply text, or the fallback JSON string.
    """
    fallback = json.dumps(
        {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
    )

    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "system", "content": system_prompt}],
            model="llama-3.3-70b-versatile",
            max_tokens=512,
            temperature=0.7
        )
    except Exception as e:
        # Best-effort boundary: any API failure degrades to the canned reply.
        print("[ERROR] Groq API error:", e)
        return fallback

    # A successful call can still come back with no choices or with
    # content=None; guard so callers always receive a string.
    choices = response.choices
    raw_content = choices[0].message.content if choices else None
    if raw_content is None:
        print("[ERROR] Groq API returned no content.")
        return fallback

    return raw_content.strip()
|