Create utils.py

utils.py
import os
import re
import json
import requests
import tempfile
from bs4 import BeautifulSoup
from typing import List, Literal
from pydantic import BaseModel
from pydub import AudioSegment, effects
from transformers import pipeline
import yt_dlp
import tiktoken
from groq import Groq  # Retained for other functions if needed
import numpy as np
import torch
import random

import base64
from io import BytesIO
import altair as alt
import pdfkit
import altair_saver  # For PNG export with Altair

###############################################################################
# Pydantic Models
###############################################################################
class DialogueItem(BaseModel):
    speaker: Literal["Jane", "John"]
    display_speaker: str = "Jane"
    text: str

class Dialogue(BaseModel):
    dialogue: List[DialogueItem]

###############################################################################
# ASR Pipeline (Whisper tiny)
###############################################################################
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny.en",
    device=0 if torch.cuda.is_available() else -1
)

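# Rough usage sketch (the pipeline is not called elsewhere in this module): a
# Hugging Face ASR pipeline accepts an audio file path and returns a dict with
# the transcription under "text"; "clip.wav" is a hypothetical local file.
#   result = asr_pipeline("clip.wav")
#   transcript = result["text"]
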
###############################################################################
# Helper: Truncate text if it exceeds token limit
###############################################################################
def truncate_text(text, max_tokens=2048):
    print("[LOG] Truncating text if needed.")
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    if len(tokens) > max_tokens:
        print("[LOG] Text too long, truncating.")
        return tokenizer.decode(tokens[:max_tokens])
    return text

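# Example sketch: cap a long article at ~1k tokens before sending it to an LLM;
# `article` is a hypothetical string loaded elsewhere:
#   short_article = truncate_text(article, max_tokens=1024)
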
###############################################################################
# Extract text from a URL
###############################################################################
def extract_text_from_url(url):
    print("[LOG] Extracting text from URL:", url)
    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/115.0.0.0 Safari/537.36"
            )
        }
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
            return ""
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator=' ')
        print("[LOG] Text extraction from URL successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Exception during text extraction from URL: {e}")
        return ""

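# Example sketch (needs outbound network access from the Space):
#   page_text = extract_text_from_url("https://en.wikipedia.org/wiki/Podcast")
#   if page_text:
#       page_text = truncate_text(page_text)
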
###############################################################################
# Optional pitch-shift (unused)
###############################################################################
def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
    print(f"[LOG] Shifting pitch by {semitones} semitones.")
    new_sample_rate = int(audio.frame_rate * (2.0 ** (semitones / 12.0)))
    shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
    return shifted_audio.set_frame_rate(audio.frame_rate)

###############################################################################
# Check if text is sufficient
###############################################################################
def is_sufficient(text: str, min_word_count: int = 500) -> bool:
    word_count = len(text.split())
    print(f"[DEBUG] Aggregated word count: {word_count}")
    return word_count >= min_word_count

###############################################################################
# LLM fallback if insufficient data
###############################################################################
def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
    print("[LOG] Querying LLM for additional info.")
    system_prompt = (
        "You are an AI assistant with extensive knowledge up to 2023-10. "
        "Provide additional relevant information on the following topic based on your knowledge base.\n\n"
        f"Topic: {topic}\n\n"
        f"Existing Information: {existing_text}\n\n"
        "Please add more insightful details, facts, and perspectives."
    )
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "system", "content": system_prompt}],
            model="llama-3.3-70b-versatile",
            max_tokens=1024,
            temperature=0.7
        )
    except Exception as e:
        print("[ERROR] Groq API error during fallback:", e)
        return ""
    info = response.choices[0].message.content.strip()
    print("[DEBUG] Additional info from LLM:")
    print(info)
    return info

###############################################################################
# Rewrite text in professional style
###############################################################################
def rewrite_in_professional_style(topic: str, raw_text: str) -> str:
    if not raw_text.strip():
        return ""

    system_prompt = (
        "You are a professional writing assistant. Rewrite the provided text:\n"
        "1) Use clear, fluent, professional English.\n"
        "2) Keep it on-topic about {topic}, removing disclaimers or non-English filler.\n"
        "3) Summarize if too long, but keep important data/facts.\n"
        "4) Organize in paragraphs/bullet points.\n"
        "5) Avoid referencing any rewriting.\n"
    ).format(topic=topic)

    user_prompt = f"Please rewrite this text:\n\n{raw_text}"
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        response = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model="llama-3.3-70b-versatile",
            max_tokens=1024,
            temperature=0.7
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print("[ERROR] rewriting text via LLM:", e)
        return raw_text

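# Example sketch (assumes GROQ_API_KEY is set; returns the raw text unchanged on
# API errors); `raw_scraped_text` is a hypothetical string:
#   cleaned = rewrite_in_professional_style("electric vehicles", raw_scraped_text)
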
###############################################################################
# Legacy research: RSS + Wikipedia
###############################################################################
def research_topic(topic: str) -> str:
    sources = {
        "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition.rss",
        "Associated Press": "https://apnews.com/apf-topnews",
        "NDTV": "https://www.ndtv.com/rss/top-stories",
        "Times of India": "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",
        "The Hindu": "https://www.thehindu.com/news/national/kerala/rssfeed.xml",
        "Economic Times": "https://economictimes.indiatimes.com/rssfeeds/1977021501.cms",
        "Google News - Custom": f"https://news.google.com/rss/search?q={requests.utils.quote(topic)}&hl=en-IN&gl=IN&ceid=IN:en",
    }

    summary_parts = []
    wiki_summary = fetch_wikipedia_summary(topic)
    if wiki_summary:
        summary_parts.append(f"From Wikipedia: {wiki_summary}")

    for name, feed_url in sources.items():
        try:
            items = fetch_rss_feed(feed_url)
            if not items:
                continue
            title, desc, link = find_relevant_article(items, topic, min_match=2)
            if link:
                article_text = fetch_article_text(link)
                if article_text:
                    summary_parts.append(f"From {name}: {article_text}")
                else:
                    summary_parts.append(f"From {name}: {title} - {desc}")
        except Exception as e:
            print(f"[ERROR] Error fetching from {name} RSS feed:", e)
            continue

    aggregated_info = " ".join(summary_parts)
    print("[DEBUG] Aggregated info from primary sources:")
    print(aggregated_info)

    if not is_sufficient(aggregated_info):
        print("[LOG] Not enough info. LLM fallback.")
        extra_info = query_llm_for_additional_info(topic, aggregated_info)
        if extra_info:
            aggregated_info += " " + extra_info
        else:
            print("[ERROR] LLM fallback gave nothing.")
    if not aggregated_info:
        return f"Sorry, no info on '{topic}'."
    return aggregated_info

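# Example sketch: aggregate Wikipedia plus the RSS feeds above, with the LLM
# fallback kicking in when fewer than ~500 words are collected:
#   background = research_topic("heatwave preparedness in Indian cities")
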
def fetch_wikipedia_summary(topic: str) -> str:
    print("[LOG] Fetching Wikipedia summary for:", topic)
    try:
        search_url = (
            f"https://en.wikipedia.org/w/api.php?action=opensearch&search={requests.utils.quote(topic)}"
            "&limit=1&namespace=0&format=json"
        )
        resp = requests.get(search_url)
        if resp.status_code != 200:
            print(f"[ERROR] Wikipedia fetch fail for {topic}")
            return ""
        data = resp.json()
        if len(data) > 1 and data[1]:
            title = data[1][0]
            summary_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}"
            s_resp = requests.get(summary_url)
            if s_resp.status_code == 200:
                s_data = s_resp.json()
                if "extract" in s_data:
                    print("[LOG] Wikipedia summary found.")
                    return s_data["extract"]
        return ""
    except Exception as e:
        print(f"[ERROR] Wikipedia summary error: {e}")
        return ""

def fetch_rss_feed(feed_url: str) -> list:
    print("[LOG] RSS feed:", feed_url)
    try:
        resp = requests.get(feed_url)
        if resp.status_code != 200:
            print(f"[ERROR] RSS feed fail: {feed_url}")
            return []
        soup = BeautifulSoup(resp.content, "xml")
        return soup.find_all("item")
    except Exception as e:
        print(f"[ERROR] RSS error: {e}")
        return []

def find_relevant_article(items, topic: str, min_match=2) -> tuple:
    print("[LOG] Searching relevant article...")
    keywords = re.findall(r'\w+', topic.lower())
    for item in items:
        title = item.find("title").get_text().strip() if item.find("title") else ""
        description = item.find("description").get_text().strip() if item.find("description") else ""
        text = (title + " " + description).lower()
        matches = sum(1 for kw in keywords if kw in text)
        if matches >= min_match:
            link = item.find("link").get_text().strip() if item.find("link") else ""
            print(f"[LOG] Relevant article found: {title}")
            return title, description, link
    return None, None, None

def fetch_article_text(link: str) -> str:
    print("[LOG] Fetching article text from:", link)
    if not link:
        print("[LOG] No link.")
        return ""
    try:
        r = requests.get(link)
        if r.status_code != 200:
            print(f"[ERROR] Article fetch fail: {link}")
            return ""
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all("p")
        text = " ".join(p.get_text() for p in paragraphs[:5])
        print("[LOG] Article text fetched.")
        return text.strip()
    except Exception as e:
        print(f"[ERROR] fetch_article_text error: {e}")
        return ""

###############################################################################
# Script generation for podcasts
###############################################################################
def generate_script(
    system_prompt: str,
    input_text: str,
    tone: str,
    target_length: str,
    host_name: str = "Jane",
    guest_name: str = "John",
    sponsor_style: str = "Separate Break",
    sponsor_provided=None
):
    import streamlit as st
    print("[LOG] Generating script. Tone:", tone, "Length:", target_length)

    if (host_name == "Jane" or not host_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
        host_name = "Isha"
    if (guest_name == "John" or not guest_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
        guest_name = "Aarav"

    words_per_minute = 150
    numeric_minutes = 3
    match = re.search(r"(\d+)", target_length)
    if match:
        numeric_minutes = int(match.group(1))

    min_words = max(50, numeric_minutes * 100)
    max_words = numeric_minutes * words_per_minute

    tone_map = {
        "Humorous": "funny and exciting",
        "Formal": "business-like, well-structured, professional",
        "Casual": "like a conversation between close friends",
        "Youthful": "energetic and lively"
    }
    chosen_tone = tone_map.get(tone, "casual")

    if sponsor_provided:
        if sponsor_style == "Separate Break":
            sponsor_instructions = "If sponsor content is provided, place in a separate ad break (~30s)."
        else:
            sponsor_instructions = "If sponsor content is provided, blend (~30s) into the conversation."
    else:
        sponsor_instructions = ""

    prompt = (
        f"{system_prompt}\n"
        f"TONE: {chosen_tone}\n"
        f"TARGET LENGTH: {target_length} (~{min_words}-{max_words} words)\n"
        f"INPUT TEXT: {input_text}\n\n"
        f"# Sponsor Style Instruction:\n{sponsor_instructions}\n\n"
        "Output must be JSON:\n"
        "{\n"
        '  "dialogue": [\n'
        '    {"speaker": "Jane", "text": "..."},\n'
        '    {"speaker": "John", "text": "..."}\n'
        "  ]\n"
        "}"
    )
    print("[LOG] Prompt to LLM:", prompt)

    if st.session_state.get("language_selection") == "Hinglish":
        prompt += "\n\nPlease generate the script in Romanized Hindi.\n"
    elif st.session_state.get("language_selection") == "Hindi":
        prompt += "\n\nPlease generate the script exclusively in Hindi.\n"

    try:
        headers = {
            "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "deepseek/deepseek-r1",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 2048,
            "temperature": 0.7
        }
        r = requests.post("https://openrouter.ai/api/v1/chat/completions", headers=headers, data=json.dumps(data))
        r.raise_for_status()
        raw_content = r.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print("[ERROR] LLM error:", e)
        raise ValueError(f"Error with LLM call: {e}")

    start_index = raw_content.find('{')
    end_index = raw_content.rfind('}')
    if start_index == -1 or end_index == -1:
        raise ValueError("No JSON found in LLM response.")

    json_str = raw_content[start_index:end_index+1].strip()
    try:
        data_js = json.loads(json_str)
        diag_list = data_js.get("dialogue", [])
        for d in diag_list:
            raw_speaker = d.get("speaker", "Jane")
            if raw_speaker.lower() == host_name.lower():
                d["speaker"] = "Jane"
                d["display_speaker"] = host_name
            elif raw_speaker.lower() == guest_name.lower():
                d["speaker"] = "John"
                d["display_speaker"] = guest_name
            else:
                d["speaker"] = "Jane"
                d["display_speaker"] = raw_speaker

        final_items = []
        for d in diag_list:
            if "display_speaker" not in d:
                d["display_speaker"] = d["speaker"]
            final_items.append(DialogueItem(**d))
        return Dialogue(dialogue=final_items)
    except Exception as e:
        print("[ERROR] JSON parse error:", e)
        raise ValueError(f"Failed to parse JSON from LLM: {e}")

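# Example sketch (assumes an OpenRouter DEEPSEEK_API_KEY and a running Streamlit
# session for the language selection; names and text are illustrative):
#   script = generate_script(
#       system_prompt="You write lively two-host podcast scripts.",
#       input_text=research_text,          # hypothetical string from research_topic()
#       tone="Casual",
#       target_length="5 Mins",
#       host_name="Priya",
#       guest_name="Rahul",
#   )
#   for item in script.dialogue:
#       print(item.display_speaker, ":", item.text)
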
###############################################################################
# Transcribe YouTube (RapidAPI)
###############################################################################
def transcribe_youtube_video(video_url: str) -> str:
    print("[LOG] Transcribing YouTube via RapidAPI:", video_url)
    vid_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", video_url)
    if not vid_match:
        raise ValueError("Invalid YouTube URL, cannot find video ID.")
    video_id = vid_match.group(1)

    base_url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
    params = {"video_id": video_id, "lang": "en"}
    headers = {
        "x-rapidapi-host": "youtube-transcriptor.p.rapidapi.com",
        "x-rapidapi-key": os.environ.get("RAPIDAPI_KEY")
    }
    try:
        r = requests.get(base_url, headers=headers, params=params, timeout=30)
        r.raise_for_status()
        data = r.json()
        if not isinstance(data, list) or not data:
            raise ValueError("No transcript data returned.")
        text = data[0].get('transcriptionAsText', '').strip()
        if not text:
            raise ValueError("Transcript is empty.")
        return text
    except Exception as e:
        print("[ERROR] RapidAPI transcription error:", e)
        raise ValueError(f"Error transcribing YouTube: {e}")

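# Example sketch (assumes RAPIDAPI_KEY is set; raises ValueError on any failure):
#   transcript = transcribe_youtube_video("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
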
###############################################################################
# TTS => mp3 file path
###############################################################################
def generate_audio_mp3(text: str, speaker: str) -> str:
    import streamlit as st
    print(f"[LOG] Generating audio for speaker: {speaker}")
    language = st.session_state.get("language_selection", "English (American)")

    if language == "English (American)":
        # DEEPGRAM approach
        ...
    else:
        # MURF approach
        ...
    return "...some_mp3_file_path..."

def transcribe_youtube_video_OLD_YTDLP(video_url: str):
    pass

def _preprocess_text_for_tts(text: str, speaker: str) -> str:
    return text

def _spell_digits(d: str) -> str:
    return ""

###############################################################################
# Mix with BG music
###############################################################################
def mix_with_bg_music(spoken: AudioSegment, custom_music_path=None) -> AudioSegment:
    if custom_music_path:
        music_path = custom_music_path
    else:
        music_path = "bg_music.mp3"
    try:
        bg_music = AudioSegment.from_file(music_path, format="mp3")
    except Exception as e:
        print("[ERROR] Could not load bg music:", e)
        return spoken
    bg_music = bg_music - 18.0
    total_len = len(spoken) + 2000
    looped = AudioSegment.empty()
    while len(looped) < total_len:
        looped += bg_music
    looped = looped[:total_len]
    final_mix = looped.overlay(spoken, position=2000)
    return final_mix

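# Example sketch: duck the bundled bg_music.mp3 under finished narration with a
# 2-second lead-in; "episode_voice.mp3" is a hypothetical file:
#   spoken = AudioSegment.from_file("episode_voice.mp3", format="mp3")
#   mix_with_bg_music(spoken).export("episode_final.mp3", format="mp3")
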
###############################################################################
# Q&A
###############################################################################
def call_groq_api_for_qa(system_prompt: str) -> str:
    try:
        headers = {
            "Authorization": f"Bearer {os.environ.get('DEEPSEEK_API_KEY')}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        data = {
            "model": "deepseek/deepseek-r1",
            "messages": [{"role": "user", "content": system_prompt}],
            "max_tokens": 512,
            "temperature": 0.7
        }
        r = requests.post("https://openrouter.ai/api/v1/chat/completions",
                          headers=headers, data=json.dumps(data))
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print("[ERROR] QA call failed:", e)
        fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering now."}
        return json.dumps(fallback)

###############################################################################
# Bing multi-search
###############################################################################
def fetch_bing_results(query: str, count: int = 12) -> list:
    """
    Query the Bing Web Search API. The key is stripped first, since secrets
    pasted into the Space can carry a trailing newline.
    """
    bing_api_key = os.environ.get("BING_API_KEY")
    if bing_api_key:
        bing_api_key = bing_api_key.strip()  # remove trailing newline if any
    else:
        return []

    print("[LOG] Attempting Bing Web Search for:", query)
    url = "https://api.bing.microsoft.com/v7.0/search"
    headers = {"Ocp-Apim-Subscription-Key": bing_api_key}
    params = {"q": query, "count": count}
    try:
        resp = requests.get(url, headers=headers, params=params, timeout=15)
        if resp.status_code != 200:
            print("[ERROR] Bing search code:", resp.status_code)
            print("[DEBUG] Bing search body:", resp.text)
        resp.raise_for_status()
        data = resp.json()
        web_pages = data.get("webPages", {}).get("value", [])
        results = []
        for wp in web_pages:
            results.append({
                "title": wp.get("name", ""),
                "link": wp.get("url", ""),
                "snippet": wp.get("snippet", "")
            })
        return results
    except Exception as e:
        print("[ERROR] Bing search failed:", e)
        return []

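# Example sketch (assumes BING_API_KEY is set; returns [] when the key is
# missing or the request fails):
#   hits = fetch_bing_results("small modular reactors", count=5)
#   for h in hits:
#       print(h["title"], "->", h["link"])
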
###############################################################################
# Combine all cleaned sources
###############################################################################
def _gather_cleaned_sources(topic: str, sources_list: list) -> str:
    combined_body = []
    for s in sources_list:
        snippet = (
            f"**Title**: {s['title']}\n"
            f"**Link**: {s['link']}\n\n"
            f"{s['cleaned_text']}\n\n"
        )
        combined_body.append(snippet)
    return "\n".join(combined_body)

###############################################################################
# Reorganize final text into multi-section "Professional Research Report"
# with dynamic headings
###############################################################################
def rewrite_into_pro_outline(topic: str, combined_body: str) -> str:
    prompt = f"""
You are a professional research writer. Please produce a final research report with this structure:

# Professional Research Report
## {{Topic}}

### Executive Summary
(Write a concise summary of the key insights.)

Then automatically determine 3-5 relevant section headings for this content, giving each a descriptive title.
Follow with a 'Conclusion' section.
Finally add 'References & Footnotes', then '(End of Professional Report)'.

Adapt headings to the actual content. Avoid headings that don't make sense for the user topic.

Topic: {topic}

Raw Combined Text:
{combined_body}
"""
    groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    try:
        resp = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a professional research writer."},
                {"role": "user", "content": prompt}
            ],
            model="llama-3.3-70b-versatile",
            max_tokens=4096,
            temperature=0.7
        )
        return resp.choices[0].message.content.strip()
    except Exception as e:
        print("[ERROR] rewriting into pro outline failed:", e)
        return combined_body

###############################################################################
# The main function that queries Google & Bing
###############################################################################
def perform_deep_research(topic: str) -> str:
    # HF SPACES UPGRADE: remove trailing newlines from env secrets
    google_cse_id = os.environ.get("GOOGLE_CSE_ID")
    google_api_key = os.environ.get("GOOGLE_API_KEY")
    if google_cse_id:
        google_cse_id = google_cse_id.strip()
    if google_api_key:
        google_api_key = google_api_key.strip()

    all_sources = []

    # GOOGLE
    google_results = []
    if google_cse_id and google_api_key:
        try:
            print("[LOG] Attempting Google CSE for:", topic)
            url = "https://customsearch.googleapis.com/customsearch/v1"
            params = {
                "q": topic,
                "cx": google_cse_id,
                "key": google_api_key,
                "num": 12
            }
            # We'll do an explicit check/log
            resp = requests.get(url, params=params, timeout=15)
            if resp.status_code != 200:
                print("[ERROR] Google CSE status code:", resp.status_code)
                print("[DEBUG] Google CSE response body:", resp.text)
            resp.raise_for_status()
            data = resp.json()
            items = data.get("items", [])
            for it in items:
                google_results.append({
                    "title": it.get("title", ""),
                    "link": it.get("link", ""),
                    "snippet": it.get("snippet", "")
                })
        except requests.HTTPError as e:
            print("[ERROR] Google approach failed (HTTPError):", e)
        except Exception as e:
            print("[ERROR] Google approach failed (other error):", e)

    # BING
    bing_results = fetch_bing_results(topic, count=12)
    combined_raw = google_results + bing_results

    if not combined_raw:
        print("[LOG] No direct search results, fallback to older approach.")
        fallback_info = research_topic(topic)
        cleaned_fb = rewrite_in_professional_style(topic, fallback_info)
        all_sources = [{
            "index": 1,
            "title": "Fallback Info",
            "link": "N/A",
            "cleaned_text": cleaned_fb
        }]
    else:
        idx = 0
        for res in combined_raw:
            idx += 1
            link = res["link"]
            snippet = res["snippet"] or ""
            title = res["title"] or ""
            article_text = fetch_article_text(link)
            if not article_text.strip():
                article_text = snippet
            cleaned = rewrite_in_professional_style(topic, article_text)
            if cleaned.strip():
                item = {
                    "index": idx,
                    "title": title,
                    "link": link,
                    "cleaned_text": cleaned
                }
                all_sources.append(item)

    if not all_sources:
        print("[LOG] None found after rewriting, fallback anyway.")
        fb_info = research_topic(topic)
        cleaned_fb = rewrite_in_professional_style(topic, fb_info)
        all_sources = [{
            "index": 1,
            "title": "Fallback Info",
            "link": "N/A",
            "cleaned_text": cleaned_fb
        }]

    combined_body = _gather_cleaned_sources(topic, all_sources)
    final_report = rewrite_into_pro_outline(topic, combined_body)
    return final_report

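# Example sketch of the end-to-end research flow (Google CSE + Bing, with the
# legacy RSS/Wikipedia path as fallback); assumes the API keys above are set:
#   report_md = perform_deep_research("green hydrogen adoption in India")
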
###############################################################################
# Chart Generation -> base64 PNG
###############################################################################
def generate_simple_chart(data_list: list) -> str:
    if not data_list:
        return ""
    import pandas as pd
    df = pd.DataFrame({"value": data_list, "index": range(len(data_list))})
    chart = alt.Chart(df).mark_bar().encode(
        x="index:O",
        y="value:Q"
    ).properties(title="Sample Chart")
    try:
        png_bytes = altair_saver.save(chart, fp=None, fmt="png")
        b64_img = base64.b64encode(png_bytes).decode("utf-8")
        # Embed the PNG inline as a markdown image (data URI).
        return f"![Chart](data:image/png;base64,{b64_img})"
    except Exception as e:
        print("[ERROR] Chart export error:", e)
        return "*(Chart could not be generated in PNG form.)*"

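# Example sketch: render a small list of numbers as a bar chart and get back a
# markdown image string ("" for empty input, an apology note if export fails):
#   chart_md = generate_simple_chart([3, 7, 2, 9])
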
###############################################################################
# Markdown -> PDF
###############################################################################
def generate_pdf_from_markdown(md_content: str) -> bytes:
    import markdown
    html_content = markdown.markdown(md_content, extensions=["extra", "tables", "toc"])
    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as tmp_html:
        tmp_html.write(html_content.encode("utf-8"))
        tmp_html_path = tmp_html.name

    tmp_pdf_path = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
    try:
        pdfkit.from_file(tmp_html_path, tmp_pdf_path)
        with open(tmp_pdf_path, "rb") as f:
            pdf_bytes = f.read()
    finally:
        if os.path.exists(tmp_html_path):
            os.remove(tmp_html_path)
        if os.path.exists(tmp_pdf_path):
            os.remove(tmp_pdf_path)

    return pdf_bytes
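
# Example sketch (pdfkit shells out to wkhtmltopdf, which must be installed in
# the Space image); `report_md` is a hypothetical markdown string:
#   pdf_bytes = generate_pdf_from_markdown(report_md)
#   with open("research_report.pdf", "wb") as f:
#       f.write(pdf_bytes)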