Spaces:
Running
Running
Update utils.py
Browse files
utils.py
CHANGED
@@ -4,16 +4,37 @@ import json
|
|
4 |
import requests
|
5 |
import tempfile
|
6 |
from bs4 import BeautifulSoup
|
7 |
-
from typing import List, Literal
|
8 |
from pydantic import BaseModel
|
9 |
from pydub import AudioSegment, effects
|
10 |
-
|
11 |
import tiktoken
|
12 |
from groq import Groq
|
13 |
-
import
|
|
|
|
|
|
|
|
|
14 |
from report_structure import generate_report # Import report structure
|
15 |
-
from tavily import TavilyClient
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
|
19 |
class DialogueItem(BaseModel):
|
@@ -39,6 +60,28 @@ def truncate_text(text, max_tokens=2048):
|
|
39 |
return tokenizer.decode(tokens[:max_tokens])
|
40 |
return text
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
|
44 |
print(f"[LOG] Shifting pitch by {semitones} semitones.")
|
@@ -46,7 +89,34 @@ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
|
|
46 |
shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
|
47 |
return shifted_audio.set_frame_rate(audio.frame_rate)
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
def generate_script(
|
52 |
system_prompt: str,
|
@@ -173,7 +243,7 @@ def generate_script(
|
|
173 |
d["display_speaker"] = d["speaker"]
|
174 |
new_dialogue_items.append(DialogueItem(**d))
|
175 |
|
176 |
-
|
177 |
except json.JSONDecodeError as e:
|
178 |
print("[ERROR] JSON decoding (format) failed:", e)
|
179 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
@@ -411,75 +481,75 @@ def generate_script(
|
|
411 |
print("[ERROR] Groq API error:", e)
|
412 |
fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
|
413 |
return json.dumps(fallback)
|
414 |
-
|
415 |
-
def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
|
416 |
-
"""
|
417 |
-
Runs the new research agent to generate a research report. This version uses
|
418 |
-
Tavily for search and Firecrawl for content extraction.
|
419 |
-
"""
|
420 |
-
print(f"[LOG] Starting research agent for topic: {topic}")
|
421 |
-
try:
|
422 |
-
tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
|
423 |
-
search_results = tavily_client.search(query=topic, max_results=max_results).results
|
424 |
-
|
425 |
-
if not search_results:
|
426 |
-
return "No relevant search results found."
|
427 |
-
|
428 |
-
print(f"[DEBUG] Tavily results: {search_results}")
|
429 |
-
|
430 |
-
# Use Firecrawl to scrape the content of each URL
|
431 |
-
combined_content = ""
|
432 |
-
for result in search_results:
|
433 |
-
url = result.url # Use dot notation to access attributes
|
434 |
-
print(f"[LOG] Scraping URL with Firecrawl: {url}")
|
435 |
-
headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
|
436 |
-
payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
|
437 |
-
|
438 |
-
try:
|
439 |
-
response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
|
440 |
-
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
|
441 |
-
data = response.json()
|
442 |
-
#print(f"[DEBUG] Firecrawl response: {data}") #Uncomment for debugging
|
443 |
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import requests
|
5 |
import tempfile
|
6 |
from bs4 import BeautifulSoup
|
7 |
+
from typing import List, Literal, Optional
|
8 |
from pydantic import BaseModel
|
9 |
from pydub import AudioSegment, effects
|
10 |
+
import yt_dlp
|
11 |
import tiktoken
|
12 |
from groq import Groq
|
13 |
+
import numpy as np
|
14 |
+
import torch # Moved to the top
|
15 |
+
from transformers import pipeline # Moved to the top, since it's used before other things
|
16 |
+
import random
|
17 |
+
from tavily import TavilyClient #Moved
|
18 |
from report_structure import generate_report # Import report structure
|
|
|
19 |
|
20 |
+
# --- Make the locally cloned open_deep_research checkout importable ---
# The clone location is fixed; it is pushed to the front of sys.path so the
# local checkout is found before any other module of the same name.
repo_path = os.path.join('/home', 'user', 'open_deep_research')
print(f"DEBUG: repo_path = {repo_path}")  # Debug print - keep this for now
if repo_path in sys.path:
    print("DEBUG: repo_path already in sys.path")  # Debug print - keep this for now
else:
    print("DEBUG: Adding repo_path to sys.path")  # Debug print - keep this
    sys.path.insert(0, repo_path)
print(f"DEBUG: sys.path = {sys.path}")  # Debug print - keep this for now

# --- Import from the local clone; log the outcome and fail loudly if missing ---
try:
    from open_deep_research.agent import OpenDeepResearchAgent
except ImportError as e:
    print(f"DEBUG: Import failed: {e}")
    raise
else:
    print("DEBUG: Import successful!")
|
37 |
+
from report_structure import generate_report
|
38 |
|
39 |
|
40 |
class DialogueItem(BaseModel):
|
|
|
60 |
return tokenizer.decode(tokens[:max_tokens])
|
61 |
return text
|
62 |
|
63 |
+
def extract_text_from_url(url, timeout=10):
    """
    Fetch a web page and return its visible text (fallback extraction path).

    The page is downloaded with a desktop-browser User-Agent, <script> and
    <style> elements are dropped, and the remaining text nodes are joined
    with single spaces.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The extracted text, or "" when the response status is not 200 or any
        exception is raised (this helper is best-effort and never raises).
    """
    # This function is retained for potential edge cases.
    print("[LOG] Extracting text from URL (fallback method):", url)
    try:
        headers = {
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/115.0.0.0 Safari/537.36")
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
            return ""
        soup = BeautifulSoup(response.text, 'html.parser')
        # Script and style bodies are not readable content; remove them
        # before flattening the document to text.
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator=' ')
        print("[LOG] Text extraction from URL (fallback) successful.")
        return text
    except Exception as e:
        # Deliberately broad: callers treat "" as "nothing extracted".
        print(f"[ERROR] Exception during text extraction from URL (fallback): {e}")
        return ""
|
85 |
|
86 |
def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
|
87 |
print(f"[LOG] Shifting pitch by {semitones} semitones.")
|
|
|
89 |
shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
|
90 |
return shifted_audio.set_frame_rate(audio.frame_rate)
|
91 |
|
92 |
+
def is_sufficient(text: str, min_word_count: int = 500) -> bool:
    """
    Tell whether `text` contains at least `min_word_count` words.

    Words are the whitespace-separated tokens of `text`; the count is also
    written to stdout as a debug line.
    """
    # This function's role is reduced; the agent decides.
    words = text.split()
    total = len(words)
    print(f"[DEBUG] Aggregated word count: {total}")
    return min_word_count <= total
|
97 |
+
|
98 |
+
def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
    """
    No-op placeholder: marked as no longer needed.

    Both arguments are ignored and the function returns None, regardless of
    the `str` return annotation.
    """
    return None
|
101 |
+
def research_topic(topic: str) -> str:
    """
    No-op placeholder: marked as no longer needed.

    `topic` is ignored and the function returns None, regardless of the
    `str` return annotation.
    """
    return None
|
104 |
+
|
105 |
+
def fetch_wikipedia_summary(topic: str) -> str:
    """
    No-op placeholder: marked as no longer needed.

    `topic` is ignored and the function returns None, regardless of the
    `str` return annotation.
    """
    return None
|
108 |
+
|
109 |
+
def fetch_rss_feed(feed_url: str) -> list:
    """
    No-op placeholder: marked as no longer needed.

    `feed_url` is ignored and the function returns None, regardless of the
    `list` return annotation.
    """
    return None
|
112 |
|
113 |
+
def find_relevant_article(items, topic: str, min_match=2) -> tuple:
    """
    No-op placeholder: marked as no longer needed.

    All arguments are ignored and the function returns None, regardless of
    the `tuple` return annotation.
    """
    return None
|
116 |
+
|
117 |
+
def fetch_article_text(link: str) -> str:
    """
    No-op placeholder: marked as no longer needed.

    `link` is ignored and the function returns None, regardless of the
    `str` return annotation.
    """
    return None
|
120 |
|
121 |
def generate_script(
|
122 |
system_prompt: str,
|
|
|
243 |
d["display_speaker"] = d["speaker"]
|
244 |
new_dialogue_items.append(DialogueItem(**d))
|
245 |
|
246 |
+
return Dialogue(dialogue=new_dialogue_items)
|
247 |
except json.JSONDecodeError as e:
|
248 |
print("[ERROR] JSON decoding (format) failed:", e)
|
249 |
raise ValueError(f"Failed to parse dialogue: {str(e)}")
|
|
|
481 |
print("[ERROR] Groq API error:", e)
|
482 |
fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
|
483 |
return json.dumps(fallback)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
484 |
|
485 |
+
# --- Agent and Tavily Integration ---
|
486 |
+
def _lookup(container, key, default=None):
    """Read `key` from a mapping, falling back to attribute access on objects."""
    if isinstance(container, dict):
        return container.get(key, default)
    return getattr(container, key, default)


def _scrape_markdown(url, api_key, timeout=60):
    """Return Firecrawl's markdown for `url`, or "" when the scrape fails."""
    print(f"[LOG] Scraping URL with Firecrawl: {url}")
    headers = {'Authorization': f'Bearer {api_key}'}
    payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
    try:
        response = requests.post(
            "https://api.firecrawl.dev/v1/scrape",
            headers=headers,
            json=payload,
            timeout=timeout,  # never block the whole run on one stalled scrape
        )
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
        return ""

    if not isinstance(data, dict):
        data = {}
    # `data` may be present but null; normalise so the lookup cannot raise.
    page = data.get('data') or {}
    markdown = page.get('markdown') if isinstance(page, dict) else None
    if data.get('success') and markdown:
        return markdown
    print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
    return ""


def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 20) -> str:
    """
    Research `topic` on the web and return an LLM-written report.

    Pipeline: Tavily search -> Firecrawl scrape of each result URL (markdown)
    -> Groq chat completion over the combined pages -> `generate_report`.

    Args:
        topic: Subject to research; used as the search query and in the prompt.
        report_type: Accepted for backward compatibility; this implementation
            does not use it.
        max_results: Maximum number of search results requested from Tavily.

    Returns:
        The output of `generate_report` on success. Otherwise a plain-text
        message: no search results, no scrapeable content, or a description
        of the error. The function does not raise.

    Reads TAVILY_API_KEY, FIRECRAWL_API_KEY and GROQ_API_KEY from the
    environment.
    """
    print(f"[LOG] Starting research agent for topic: {topic}")
    try:
        tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
        search_response = tavily_client.search(query=topic, max_results=max_results)
        # The Tavily SDK documents a dict response with a "results" list of
        # dicts; _lookup also tolerates attribute-style objects.
        search_results = _lookup(search_response, "results") or []

        if not search_results:
            return "No relevant search results found."

        print(f"[DEBUG] Tavily results: {search_results}")

        # Use Firecrawl to scrape the content of each URL
        firecrawl_key = os.environ.get("FIRECRAWL_API_KEY")
        pages = []
        for result in search_results:
            url = _lookup(result, "url")
            if not url:
                print(f"[WARNING] Search result without a URL skipped: {result}")
                continue
            markdown = _scrape_markdown(url, firecrawl_key)
            if markdown:
                pages.append(markdown)

        if not pages:
            return "Could not retrieve content from any of the search results."

        combined_content = "".join(f"{page}\n\n" for page in pages)

        # Use Groq LLM to generate the report
        # NOTE(review): combined_content is not size-limited here; very large
        # scrapes may exceed the model's context window - confirm.
        prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:

{topic}

Use the following pieces of information, gathered from various web sources, to construct your report:

{combined_content}

Compile and synthesize the information to create a well-structured and informative research report. Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context. Do not hallucinate. Do not make anything up.
"""

        groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
        response = groq_client.chat.completions.create(
            messages=[
                {"role": "user", "content": prompt}
            ],
            model="deepseek-r1-distill-llama-70b",
            temperature=0.2,
        )
        report_text = response.choices[0].message.content

        structured_report = generate_report(report_text)  # Use your report structuring function
        return structured_report

    except Exception as e:
        # Top-level boundary: callers always get a string back.
        print(f"[ERROR] Error in research agent: {e}")
        return f"Sorry, I encountered an error during research: {e}"
|