Spaces:
Running
Running
Update utils.py
Browse files
utils.py
CHANGED
@@ -4,18 +4,15 @@ import json
|
|
4 |
import requests
|
5 |
import tempfile
|
6 |
from bs4 import BeautifulSoup
|
7 |
-
from typing import List, Literal
|
8 |
from pydantic import BaseModel
|
9 |
from pydub import AudioSegment, effects
|
10 |
from transformers import pipeline
|
11 |
-
import yt_dlp
|
12 |
import tiktoken
|
13 |
from groq import Groq
|
14 |
-
import
|
15 |
-
import
|
16 |
-
import random
|
17 |
from tavily import TavilyClient
|
18 |
-
from report_structure import generate_report
|
19 |
|
20 |
|
21 |
|
@@ -42,28 +39,6 @@ def truncate_text(text, max_tokens=2048):
|
|
42 |
return tokenizer.decode(tokens[:max_tokens])
|
43 |
return text
|
44 |
|
45 |
-
def extract_text_from_url(url):
|
46 |
-
# Kept for potential user-provided URLs, but not primary.
|
47 |
-
print("[LOG] Extracting text from URL (fallback method):", url)
|
48 |
-
try:
|
49 |
-
headers = {
|
50 |
-
"User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
51 |
-
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
52 |
-
"Chrome/115.0.0.0 Safari/537.36")
|
53 |
-
}
|
54 |
-
response = requests.get(url, headers=headers)
|
55 |
-
if response.status_code != 200:
|
56 |
-
print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
|
57 |
-
return ""
|
58 |
-
soup = BeautifulSoup(response.text, 'html.parser')
|
59 |
-
for script in soup(["script", "style"]):
|
60 |
-
script.decompose()
|
61 |
-
text = soup.get_text(separator=' ')
|
62 |
-
print("[LOG] Text extraction from URL (fallback) successful.")
|
63 |
-
return text
|
64 |
-
except Exception as e:
|
65 |
-
print(f"[ERROR] Exception during text extraction from URL (fallback): {e}")
|
66 |
-
return ""
|
67 |
|
68 |
def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
|
69 |
print(f"[LOG] Shifting pitch by {semitones} semitones.")
|
@@ -71,33 +46,8 @@ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
|
|
71 |
shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
|
72 |
return shifted_audio.set_frame_rate(audio.frame_rate)
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
print(f"[DEBUG] Aggregated word count: {word_count}")
|
77 |
-
return word_count >= min_word_count
|
78 |
-
|
79 |
-
def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
|
80 |
-
# No longer needed
|
81 |
-
pass
|
82 |
-
def research_topic(topic: str) -> str:
|
83 |
-
# No longer needed
|
84 |
-
pass
|
85 |
-
|
86 |
-
def fetch_wikipedia_summary(topic: str) -> str:
|
87 |
-
# No longer needed
|
88 |
-
pass
|
89 |
-
|
90 |
-
def fetch_rss_feed(feed_url: str) -> list:
|
91 |
-
# No longer needed
|
92 |
-
pass
|
93 |
-
|
94 |
-
def find_relevant_article(items, topic: str, min_match=2) -> tuple:
|
95 |
-
# No longer needed
|
96 |
-
pass
|
97 |
-
|
98 |
-
def fetch_article_text(link: str) -> str:
|
99 |
-
# No longer needed
|
100 |
-
pass
|
101 |
def generate_script(
|
102 |
system_prompt: str,
|
103 |
input_text: str,
|
@@ -223,7 +173,7 @@ def generate_script(
|
|
223 |
d["display_speaker"] = d["speaker"]
|
224 |
new_dialogue_items.append(DialogueItem(**d))
|
225 |
|
226 |
-
|
227 |
except json.JSONDecodeError as e:
|
228 |
print("[ERROR] JSON decoding (format) failed:", e)
|
229 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
@@ -461,69 +411,75 @@ def generate_script(
|
|
461 |
print("[ERROR] Groq API error:", e)
|
462 |
fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
|
463 |
return json.dumps(fallback)
|
464 |
-
|
465 |
-
# --- Agent and Tavily Integration ---
|
466 |
def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
if not search_results:
|
476 |
-
return "No relevant search results found."
|
477 |
-
print(f"[DEBUG] Tavily results: {search_results}")
|
478 |
-
combined_content = ""
|
479 |
-
for result in search_results:
|
480 |
-
url = result.url # Directly access 'url' attribute
|
481 |
-
print(f"[LOG] Scraping URL with Firecrawl: {url}")
|
482 |
-
headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
|
483 |
-
payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
|
484 |
-
try:
|
485 |
-
response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
|
486 |
-
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
|
487 |
-
data = response.json()
|
488 |
-
print(f"[DEBUG] Firecrawl response: {data}")
|
489 |
-
if data.get('success') and 'markdown' in data.get('data', {}):
|
490 |
-
combined_content += data['data']['markdown'] + "\n\n"
|
491 |
-
else:
|
492 |
-
print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
|
493 |
-
except requests.RequestException as e:
|
494 |
-
print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
|
495 |
-
continue # Continue to the next URL
|
496 |
-
if not combined_content:
|
497 |
-
return "Could not retrieve content from any of the search results."
|
498 |
|
499 |
-
|
|
|
500 |
|
501 |
-
|
502 |
|
503 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
504 |
|
505 |
-
|
|
|
|
|
|
|
|
|
506 |
|
507 |
-
|
|
|
|
|
|
|
508 |
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import requests
|
5 |
import tempfile
|
6 |
from bs4 import BeautifulSoup
|
7 |
+
from typing import List, Literal
|
8 |
from pydantic import BaseModel
|
9 |
from pydub import AudioSegment, effects
|
10 |
from transformers import pipeline
|
|
|
11 |
import tiktoken
|
12 |
from groq import Groq
|
13 |
+
import time
|
14 |
+
from report_structure import generate_report # Import report structure
|
|
|
15 |
from tavily import TavilyClient
|
|
|
16 |
|
17 |
|
18 |
|
|
|
39 |
return tokenizer.decode(tokens[:max_tokens])
|
40 |
return text
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
|
44 |
print(f"[LOG] Shifting pitch by {semitones} semitones.")
|
|
|
46 |
shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
|
47 |
return shifted_audio.set_frame_rate(audio.frame_rate)
|
48 |
|
49 |
+
|
50 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def generate_script(
|
52 |
system_prompt: str,
|
53 |
input_text: str,
|
|
|
173 |
d["display_speaker"] = d["speaker"]
|
174 |
new_dialogue_items.append(DialogueItem(**d))
|
175 |
|
176 |
+
return Dialogue(dialogue=new_dialogue_items)
|
177 |
except json.JSONDecodeError as e:
|
178 |
print("[ERROR] JSON decoding (format) failed:", e)
|
179 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
|
|
411 |
print("[ERROR] Groq API error:", e)
|
412 |
fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
|
413 |
return json.dumps(fallback)
|
414 |
+
|
|
|
415 |
def _result_url(result):
    """Return the URL of one search hit, or ``None`` when it carries none.

    Accepts both mapping-style hits (``hit["url"]``) and attribute-style
    hits (``hit.url``) so the caller does not depend on which shape the
    search client hands back.
    """
    if isinstance(result, dict):
        return result.get("url")
    return getattr(result, "url", None)


def _scrape_markdown(url: str, headers: dict, timeout: float = 60) -> str:
    """Fetch ``url`` through the Firecrawl scrape endpoint.

    Args:
        url: Page to scrape.
        headers: HTTP headers for the request (carries the bearer token).
        timeout: Seconds to wait before giving up on the request.

    Returns:
        The markdown text reported by Firecrawl, or ``""`` when the request
        fails, the body is not valid JSON, or no markdown is present.
    """
    payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
    try:
        response = requests.post(
            "https://api.firecrawl.dev/v1/scrape",
            headers=headers,
            json=payload,
            timeout=timeout,  # without a timeout one stalled scrape hangs the whole run
        )
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
        return ""

    if not isinstance(data, dict):
        print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {None}")
        return ""

    # "data" may be present but null; normalise so the lookup cannot raise.
    scraped = data.get("data") or {}
    markdown = scraped.get("markdown") if isinstance(scraped, dict) else None
    if data.get("success") and isinstance(markdown, str) and markdown:
        return markdown

    print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
    return ""


def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
    """
    Runs the research agent to generate a research report.

    Searches for ``topic`` with Tavily, scrapes each hit with Firecrawl,
    asks a Groq-hosted LLM to synthesise the scraped markdown into a report,
    and passes the LLM output through ``generate_report``.

    Args:
        topic: Subject of the report; used as the search query and in the prompt.
        report_type: Accepted for interface compatibility; not used by this
            implementation.
        max_results: Maximum number of search results requested from Tavily.

    Returns:
        Whatever ``generate_report`` returns for the LLM text on success.
        Otherwise a plain message string: no search results, no scrapable
        content, or a "Sorry, ..." message describing the error. This
        function does not raise; every exception is reported via the
        returned string.
    """
    print(f"[LOG] Starting research agent for topic: {topic}")
    try:
        tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
        search_response = tavily_client.search(query=topic, max_results=max_results)

        # The Tavily SDK returns a plain dict ({"results": [...]}); attribute
        # access on it raised AttributeError. Keep attribute access as a
        # fallback for object-style responses.
        if isinstance(search_response, dict):
            search_results = search_response.get("results")
        else:
            search_results = getattr(search_response, "results", None)

        if not search_results:
            return "No relevant search results found."

        print(f"[DEBUG] Tavily results: {search_results}")

        # Use Firecrawl to scrape the content of each URL
        headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
        pages = []
        for result in search_results:
            url = _result_url(result)
            if not url:
                print(f"[WARNING] Search result without a URL skipped: {result}")
                continue
            print(f"[LOG] Scraping URL with Firecrawl: {url}")
            markdown = _scrape_markdown(url, headers)
            if markdown:
                pages.append(markdown)

        if not pages:
            return "Could not retrieve content from any of the search results."

        # Same layout as before: every page is followed by a blank line.
        combined_content = "".join(f"{page}\n\n" for page in pages)

        # Use Groq LLM to generate the report
        prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:

{topic}

Use the following pieces of information, gathered from various web sources, to construct your report:

{combined_content}

Compile and synthesize the information to create a well-structured and informative research report. Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context.
"""

        groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
        response = groq_client.chat.completions.create(
            messages=[
                {"role": "user", "content": prompt}
            ],
            model="deepseek-r1-distill-llama-70b",
            temperature=0.2,
        )
        report_text = response.choices[0].message.content

        structured_report = generate_report(report_text)  # Use your report structuring function
        return structured_report

    except Exception as e:
        # Top-level boundary: callers expect a string, never an exception.
        print(f"[ERROR] Error in research agent: {e}")
        return f"Sorry, I encountered an error during research: {e}"
|