Update utils.py
utils.py (CHANGED)
@@ -15,6 +15,7 @@ import tiktoken
 from groq import Groq  # Ensure Groq client is imported
 import numpy as np
 import torch  # Added to check CUDA availability
+import random

 class DialogueItem(BaseModel):
     speaker: Literal["Jane", "John"]
@@ -143,7 +144,6 @@ def research_topic(topic: str) -> str:
             if article_text:
                 summary_parts.append(f"From {name}: {article_text}")
             else:
-                # If no main text extracted, use title/desc
                 summary_parts.append(f"From {name}: {title} - {desc}")
         except Exception as e:
             print(f"[ERROR] Error fetching from {name} RSS feed:", e)
@@ -162,7 +162,6 @@ def research_topic(topic: str) -> str:
         print("[ERROR] Failed to retrieve additional information from LLM.")

     if not aggregated_info:
-        # No info found at all
         print("[LOG] No information found for the topic.")
         return f"Sorry, I couldn't find recent information on '{topic}'."

@@ -201,7 +200,6 @@ def fetch_rss_feed(feed_url: str) -> list:
         if resp.status_code != 200:
             print(f"[ERROR] Failed to fetch RSS feed: {feed_url} with status code {resp.status_code}")
             return []
-        # Use html.parser instead of xml to avoid needing lxml or other parsers.
         soup = BeautifulSoup(resp.content, "html.parser")
         items = soup.find_all("item")
         print(f"[LOG] Number of items fetched from {feed_url}: {len(items)}")
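As the removed comment noted, html.parser is used here so that no separate XML parser (such as lxml) is required, and it still locates <item> elements. A minimal standalone sketch of that behaviour, using a made-up feed string:

    from bs4 import BeautifulSoup

    # Invented RSS snippet, purely for illustration.
    rss = (
        "<rss><channel>"
        "<item><title>First story</title></item>"
        "<item><title>Second story</title></item>"
        "</channel></rss>"
    )
    items = BeautifulSoup(rss, "html.parser").find_all("item")
    print(len(items))            # 2
    print(items[0].get_text())   # First story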
@@ -246,10 +244,8 @@ def fetch_article_text(link: str) -> str:
             print(f"[ERROR] Failed to fetch article from link: {link} with status code {resp.status_code}")
             return ""
         soup = BeautifulSoup(resp.text, 'html.parser')
-        # This is site-specific. We'll try a generic approach:
-        # Just take all paragraphs:
         paragraphs = soup.find_all("p")
-        text = " ".join(p.get_text() for p in paragraphs[:5])
+        text = " ".join(p.get_text() for p in paragraphs[:5])
         print("[LOG] Article text fetched successfully.")
         return text.strip()
     except Exception as e:
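The paragraph-joining line flagged above is deliberately generic rather than site-specific; in isolation it behaves like this (the HTML snippet is invented for illustration):

    from bs4 import BeautifulSoup

    html = "<article><p>One.</p><p>Two.</p><p>Three.</p></article>"
    paragraphs = BeautifulSoup(html, "html.parser").find_all("p")
    text = " ".join(p.get_text() for p in paragraphs[:5])  # at most the first five paragraphs
    print(text)  # One. Two. Three.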
@@ -260,7 +256,6 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
     groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

-    # Map target_length to word ranges
     length_mapping = {
         "1-3 Mins": (200, 450),
         "3-5 Mins": (450, 750),
@@ -269,7 +264,6 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
     }
     min_words, max_words = length_mapping.get(target_length, (200, 450))

-    # Adjust tone description for clarity in prompt
     tone_description = {
         "Humorous": "funny and exciting, makes people chuckle",
         "Formal": "business-like, well-structured, professional",
@@ -279,7 +273,6 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt

     chosen_tone = tone_description.get(tone, "casual")

-    # Construct the prompt with clear instructions for JSON output
     prompt = (
         f"{system_prompt}\n"
         f"TONE: {chosen_tone}\n"
@@ -300,7 +293,7 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
         "}"
     )
     print("[LOG] Sending prompt to Groq:")
-    print(prompt)
+    print(prompt)

     try:
         response = groq_client.chat.completions.create(
@@ -313,14 +306,11 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
         print("[ERROR] Groq API error:", e)
         raise ValueError(f"Error communicating with Groq API: {str(e)}")

-    # Log the raw response content for debugging
     raw_content = response.choices[0].message.content.strip()
     print("[DEBUG] Raw API response content:")
     print(raw_content)

-    # Attempt to extract JSON from the response
     content = raw_content.replace('```json', '').replace('```', '').strip()
-
     start_index = content.find('{')
     end_index = content.rfind('}')

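The fence-stripping and brace-slicing above is what turns a chatty model reply into parseable JSON; a small standalone sketch of the same steps (the reply string and its schema are invented for illustration, not taken from generate_script's actual prompt):

    import json

    # Hypothetical raw LLM reply: chatter plus a fenced JSON block.
    raw_content = (
        "Sure! Here is the script:\n"
        "```json\n"
        '{"dialogue": [{"speaker": "Jane", "text": "Hi!"}]}\n'
        "```\n"
        "Hope that helps."
    )

    # Same recovery steps as above: drop the fences, then keep only the
    # span between the first '{' and the last '}'.
    content = raw_content.replace('```json', '').replace('```', '').strip()
    start_index = content.find('{')
    end_index = content.rfind('}')
    parsed = json.loads(content[start_index:end_index + 1])
    print(parsed["dialogue"][0]["speaker"])  # Jane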
@@ -345,55 +335,120 @@ def generate_script(system_prompt: str, input_text: str, tone: str, target_lengt
         print(content)
         raise ValueError(f"Failed to parse dialogue: {str(e)}")

+# ----------------------------------------------------------------------
+# We ONLY modify the generate_audio_mp3 flow below to insert random filler words
+# and modify punctuation (.,!?) for more natural TTS pauses and intonation.
+# ----------------------------------------------------------------------
+
+def _make_text_sound_more_human(text: str) -> str:
+    """
+    Inserts small filler words and adds extra punctuation to encourage
+    natural-sounding pauses at commas, periods, exclamations, and question marks.
+    """
+
+    # Filler words or short phrases
+    fillers = ["uh", "um", "ah", "hmm", "you know", "well", "I mean", "like"]
+
+    # 1) Split text by punctuation but keep the punctuation in the result.
+    #    We'll handle ".", "?", "!", and commas:
+    pattern = r'([.,?!])'
+    parts = re.split(pattern, text)
+
+    # 2) Process each chunk, occasionally inserting filler words or extra punctuation
+    processed_chunks = []
+    for i in range(len(parts)):
+        chunk = parts[i].strip()
+
+        # If the chunk is punctuation, keep it
+        if chunk in [".", ",", "?", "!"]:
+            # Possibly turn "." into "..." or add "..." after "?"
+            if chunk == "." and random.random() < 0.5:
+                chunk = "..."
+            elif chunk == "?" and random.random() < 0.3:
+                # Sometimes add "?!"
+                chunk = "?!"
+            elif chunk == "!" and random.random() < 0.3:
+                # Sometimes add "!!" for more emphasis
+                chunk = "!!"
+            processed_chunks.append(chunk)
+            continue
+
+        # Sometimes insert a filler at the start or mid-chunk
+        if chunk and random.random() < 0.3:
+            filler = random.choice(fillers)
+            # Insert at the beginning or in the middle
+            if random.random() < 0.5:
+                chunk = f"{filler}, {chunk}"
+            else:
+                # Insert near the middle
+                words = chunk.split()
+                mid = len(words) // 2
+                chunk = " ".join(words[:mid] + [f"{filler},"] + words[mid:])
+
+        processed_chunks.append(chunk)
+
+    # 3) Rejoin them carefully with a space or nothing.
+    #    We'll add a small space after punctuation, so TTS sees them as separate tokens
+    out_text = []
+    for i in range(len(processed_chunks)):
+        if i == 0:
+            out_text.append(processed_chunks[i])
+        else:
+            # If the previous chunk was punctuation or the current chunk is punctuation
+            if processed_chunks[i] in [".", "...", "?", "?!", "!", "!!", ","]:
+                out_text.append(processed_chunks[i])
+            else:
+                out_text.append(" " + processed_chunks[i])
+
+    final_text = "".join(out_text)
+    return final_text.strip()
+
 def generate_audio_mp3(text: str, speaker: str) -> str:
     try:
         print(f"[LOG] Generating audio for speaker: {speaker}")
-
+
+        # Make text more "human-like"
+        text = _make_text_sound_more_human(text)
+
         # Define Deepgram API endpoint
         deepgram_api_url = "https://api.deepgram.com/v1/speak"

         # Prepare query parameters
         params = {
             "model": "aura-asteria-en",  # Default model; adjust if needed
-            # You can add more parameters here as needed, e.g., bit_rate, sample_rate, etc.
         }

         # Override model if needed based on speaker
         if speaker == "Jane":
-            params["model"] = "aura-asteria-en"
+            params["model"] = "aura-asteria-en"
         elif speaker == "John":
-            params["model"] = "aura-perseus-en"
+            params["model"] = "aura-perseus-en"
         else:
             raise ValueError(f"Unknown speaker: {speaker}")

-        # Prepare headers
         headers = {
-            "Accept": "audio/mpeg",
+            "Accept": "audio/mpeg",
             "Content-Type": "application/json",
             "Authorization": f"Token {os.environ.get('DEEPGRAM_API_KEY')}"
         }

-        # Prepare body
         body = {
             "text": text
         }

         print("[LOG] Sending TTS request to Deepgram...")
-        # Make the POST request to Deepgram's TTS API
         response = requests.post(deepgram_api_url, params=params, headers=headers, json=body, stream=True)

         if response.status_code != 200:
             print(f"[ERROR] Deepgram TTS API returned status code {response.status_code}: {response.text}")
             raise ValueError(f"Deepgram TTS API error: {response.status_code} - {response.text}")

-        # Verify Content-Type
         content_type = response.headers.get('Content-Type', '')
         if 'audio/mpeg' not in content_type:
             print("[ERROR] Unexpected Content-Type received from Deepgram:", content_type)
             print("[ERROR] Response content:", response.text)
             raise ValueError("Unexpected Content-Type received from Deepgram.")

-        # Save the streamed audio to a temporary MP3 file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as mp3_file:
             for chunk in response.iter_content(chunk_size=8192):
                 if chunk:
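The new _make_text_sound_more_human helper relies on re.split with a capturing group, which keeps the punctuation tokens in the output list; a quick standalone check of that behaviour (the sentence is arbitrary):

    import re

    parts = re.split(r'([.,?!])', "Welcome back. Today we talk about AI, okay?")
    print(parts)
    # ['Welcome back', '.', ' Today we talk about AI', ',', ' okay', '?', '']

    # The filler insertion itself uses random.random()/random.choice, so the helper's
    # output varies from run to run; calling random.seed(...) first would make a run repeatable.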
@@ -405,19 +460,10 @@ def generate_audio_mp3(text: str, speaker: str) -> str:
         audio_seg = AudioSegment.from_file(mp3_temp_path, format="mp3")
         audio_seg = effects.normalize(audio_seg)

-        # Removed pitch shifting for male voice
-        # Previously:
-        # if speaker == "John":
-        #     semitones = -5  # Shift down by 5 semitones for a deeper voice
-        #     audio_seg = pitch_shift(audio_seg, semitones=semitones)
-        #     print(f"[LOG] Applied pitch shift to John's voice by {semitones} semitones.")
-
-        # Export the final audio as MP3
         final_mp3_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
         audio_seg.export(final_mp3_path, format="mp3")
         print("[LOG] Audio post-processed and saved at:", final_mp3_path)

-        # Clean up the initial MP3 file
         if os.path.exists(mp3_temp_path):
             os.remove(mp3_temp_path)
             print(f"[LOG] Removed temporary MP3 file: {mp3_temp_path}")
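Taken together, the changed flow can be exercised with a call like the sketch below. It assumes DEEPGRAM_API_KEY is set to a valid key and that this file is importable as utils; both are assumptions of the sketch, not part of the diff, and the return value is expected to be the path of the post-processed MP3 (the function is annotated -> str):

    import os
    import utils  # assumed import name for this utils.py

    os.environ["DEEPGRAM_API_KEY"] = "<your-deepgram-api-key>"  # placeholder, replace with a real key

    # "Jane" maps to aura-asteria-en and "John" to aura-perseus-en, per the diff above.
    mp3_path = utils.generate_audio_mp3("Welcome back to the show. Today we talk about AI!", "John")
    print("Post-processed MP3 saved at:", mp3_path)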
@@ -462,7 +508,6 @@ def transcribe_youtube_video(video_url: str) -> str:
         print("[ERROR] ASR transcription error:", e)
         raise ValueError(f"Error transcribing YouTube video: {str(e)}")
     finally:
-        # Clean up the downloaded audio file
         if os.path.exists(audio_file):
             os.remove(audio_file)
             print(f"[LOG] Removed temporary audio file: {audio_file}")