siddhartharyaai committed on
Commit
de89d5c
·
verified ·
1 Parent(s): 7e05530

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +146 -76
utils.py CHANGED
@@ -4,16 +4,37 @@ import json
4
  import requests
5
  import tempfile
6
  from bs4 import BeautifulSoup
7
- from typing import List, Literal
8
  from pydantic import BaseModel
9
  from pydub import AudioSegment, effects
10
- from transformers import pipeline
11
  import tiktoken
12
  from groq import Groq
13
- import time
 
 
 
 
14
  from report_structure import generate_report # Import report structure
15
- from tavily import TavilyClient
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
  class DialogueItem(BaseModel):
@@ -39,6 +60,28 @@ def truncate_text(text, max_tokens=2048):
39
  return tokenizer.decode(tokens[:max_tokens])
40
  return text
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
44
  print(f"[LOG] Shifting pitch by {semitones} semitones.")
@@ -46,7 +89,34 @@ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
46
  shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
47
  return shifted_audio.set_frame_rate(audio.frame_rate)
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
 
 
 
 
 
 
50
 
51
  def generate_script(
52
  system_prompt: str,
@@ -173,7 +243,7 @@ def generate_script(
173
  d["display_speaker"] = d["speaker"]
174
  new_dialogue_items.append(DialogueItem(**d))
175
 
176
- return Dialogue(dialogue=new_dialogue_items)
177
  except json.JSONDecodeError as e:
178
  print("[ERROR] JSON decoding (format) failed:", e)
179
  raise ValueError(f"Failed to parse dialogue: {str(e)}")
@@ -411,75 +481,75 @@ def generate_script(
411
  print("[ERROR] Groq API error:", e)
412
  fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
413
  return json.dumps(fallback)
414
-
415
- def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
416
- """
417
- Runs the new research agent to generate a research report. This version uses
418
- Tavily for search and Firecrawl for content extraction.
419
- """
420
- print(f"[LOG] Starting research agent for topic: {topic}")
421
- try:
422
- tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
423
- search_results = tavily_client.search(query=topic, max_results=max_results).results
424
-
425
- if not search_results:
426
- return "No relevant search results found."
427
-
428
- print(f"[DEBUG] Tavily results: {search_results}")
429
-
430
- # Use Firecrawl to scrape the content of each URL
431
- combined_content = ""
432
- for result in search_results:
433
- url = result.url # Use dot notation to access attributes
434
- print(f"[LOG] Scraping URL with Firecrawl: {url}")
435
- headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
436
- payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
437
-
438
- try:
439
- response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
440
- response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
441
- data = response.json()
442
- #print(f"[DEBUG] Firecrawl response: {data}") #Uncomment for debugging
443
 
444
- if data.get('success') and 'markdown' in data.get('data', {}):
445
- combined_content += data['data']['markdown'] + "\n\n"
446
- else:
447
- print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
448
-
449
- except requests.RequestException as e:
450
- print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
451
- continue # Continue to the next URL
452
-
453
- if not combined_content:
454
- return "Could not retrieve content from any of the search results."
455
-
456
- # Use Groq LLM to generate the report
457
- prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:
458
-
459
- {topic}
460
-
461
- Use the following pieces of information, gathered from various web sources, to construct your report:
462
-
463
- {combined_content}
464
-
465
- Compile and synthesize the information to create a well-structured and informative research report. Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context.
466
- """
467
-
468
- groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
469
- response = groq_client.chat.completions.create(
470
- messages=[
471
- {"role": "user", "content": prompt}
472
- ],
473
- model="deepseek-r1-distill-llama-70b",
474
- temperature = 0.2
475
- )
476
- report_text = response.choices[0].message.content
477
- #print(f"[DEBUG] Raw report from LLM:\n{report_text}") #Keep this commented out unless you have a very specific reason
478
-
479
- structured_report = generate_report(report_text) # Use your report structuring function
480
- return structured_report
481
-
482
-
483
- except Exception as e:
484
- print(f"[ERROR] Error in research agent: {e}")
485
- return f"Sorry, I encountered an error during research: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import requests
5
  import tempfile
6
  from bs4 import BeautifulSoup
7
+ from typing import List, Literal, Optional
8
  from pydantic import BaseModel
9
  from pydub import AudioSegment, effects
10
+ import yt_dlp
11
  import tiktoken
12
  from groq import Groq
13
+ import numpy as np
14
+ import torch # Moved to the top
15
+ from transformers import pipeline # Moved to the top, since it's used before other things
16
+ import random
17
+ from tavily import TavilyClient #Moved
18
  from report_structure import generate_report # Import report structure
 
19
 
20
# --- Add the cloned repository to the Python path ---
# NOTE(review): this relies on `sys` being imported in the top-of-file import
# block (not visible in this hunk) — confirm `import sys` is present.
repo_path = os.path.join('/home', 'user', 'open_deep_research')
print(f"DEBUG: repo_path = {repo_path}")  # Debug print - keep this for now
if repo_path not in sys.path:
    print("DEBUG: Adding repo_path to sys.path")  # Debug print - keep this
    sys.path.insert(0, repo_path)
else:
    print("DEBUG: repo_path already in sys.path")  # Debug print - keep this for now
print(f"DEBUG: sys.path = {sys.path}")  # Debug print - keep this for now

# --- CORRECT IMPORT (for local cloned repo) ---
try:
    from open_deep_research.agent import OpenDeepResearchAgent
    print("DEBUG: Import successful!")
except ImportError as e:
    print(f"DEBUG: Import failed: {e}")
    raise
# Removed the duplicate `from report_structure import generate_report` —
# generate_report is already imported in the module's import block above.
38
 
39
 
40
  class DialogueItem(BaseModel):
 
60
  return tokenizer.decode(tokens[:max_tokens])
61
  return text
62
 
63
def extract_text_from_url(url, timeout=15):
    """Fetch *url* and return its visible text content (fallback scraper).

    This function is retained for potential edge cases where the primary
    extraction path is unavailable.  It strips <script>/<style> tags and
    returns the remaining page text joined by spaces.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the HTTP response.  New, backward-compatible
            parameter — previously the request had no timeout and could hang
            indefinitely on an unresponsive host.

    Returns:
        Extracted text, or "" on any failure (deliberate best-effort contract).
    """
    print("[LOG] Extracting text from URL (fallback method):", url)
    try:
        headers = {
            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/115.0.0.0 Safari/537.36")
        }
        # Bug fix: requests waits forever by default; bound the wait.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
            return ""
        soup = BeautifulSoup(response.text, 'html.parser')
        # Drop non-visible content before extracting text.
        for script in soup(["script", "style"]):
            script.decompose()
        text = soup.get_text(separator=' ')
        print("[LOG] Text extraction from URL (fallback) successful.")
        return text
    except Exception as e:
        # Broad catch is intentional: this fallback must degrade to "" rather than raise.
        print(f"[ERROR] Exception during text extraction from URL (fallback): {e}")
        return ""
85
 
86
  def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
87
  print(f"[LOG] Shifting pitch by {semitones} semitones.")
 
89
  shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
90
  return shifted_audio.set_frame_rate(audio.frame_rate)
91
 
92
def is_sufficient(text: str, min_word_count: int = 500) -> bool:
    """Return True when *text* holds at least *min_word_count* whitespace-separated words.

    This function's role is reduced; the agent decides.
    """
    n_words = len(text.split())
    print(f"[DEBUG] Aggregated word count: {n_words}")
    return n_words >= min_word_count
97
+
98
# ---------------------------------------------------------------------------
# Deprecated helpers — the research agent now owns these responsibilities.
# Each stub is retained so stale callers fail softly (get None back) rather
# than hitting a NameError; none performs any work.
# ---------------------------------------------------------------------------

def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
    """Deprecated: no longer needed; does nothing and returns None."""

def research_topic(topic: str) -> str:
    """Deprecated: no longer needed; does nothing and returns None."""

def fetch_wikipedia_summary(topic: str) -> str:
    """Deprecated: no longer needed; does nothing and returns None."""

def fetch_rss_feed(feed_url: str) -> list:
    """Deprecated: no longer needed; does nothing and returns None."""

def find_relevant_article(items, topic: str, min_match=2) -> tuple:
    """Deprecated: no longer needed; does nothing and returns None."""

def fetch_article_text(link: str) -> str:
    """Deprecated: no longer needed; does nothing and returns None."""
120
 
121
  def generate_script(
122
  system_prompt: str,
 
243
  d["display_speaker"] = d["speaker"]
244
  new_dialogue_items.append(DialogueItem(**d))
245
 
246
+ return Dialogue(dialogue=new_dialogue_items)
247
  except json.JSONDecodeError as e:
248
  print("[ERROR] JSON decoding (format) failed:", e)
249
  raise ValueError(f"Failed to parse dialogue: {str(e)}")
 
481
  print("[ERROR] Groq API error:", e)
482
  fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
483
  return json.dumps(fallback)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
484
 
485
# --- Agent and Tavily Integration ---
def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 20) -> str:
    """
    Runs the new research agent to generate a research report. This version uses
    Tavily for search and Firecrawl for content extraction.

    Args:
        topic: Subject to research.
        report_type: Currently unused by this implementation; kept for
            interface compatibility with callers.
        max_results: Maximum number of Tavily search results to scrape.

    Returns:
        The structured report text on success, otherwise a human-readable
        error/status message (this function never raises to the caller).
    """
    print(f"[LOG] Starting research agent for topic: {topic}")
    try:
        tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
        search_results = tavily_client.search(query=topic, max_results=max_results).results

        if not search_results:
            return "No relevant search results found."

        print(f"[DEBUG] Tavily results: {search_results}")

        # Use Firecrawl to scrape the content of each URL
        combined_content = ""
        for result in search_results:
            url = result.url  # Use dot notation to access attributes
            print(f"[LOG] Scraping URL with Firecrawl: {url}")
            headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
            payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
            try:
                # Bug fix: bound the request so one unresponsive target cannot
                # hang the whole agent (requests has no default timeout).
                response = requests.post("https://api.firecrawl.dev/v1/scrape",
                                         headers=headers, json=payload, timeout=60)
                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
                data = response.json()
                # print(f"[DEBUG] Firecrawl response: {data}") #keep commented out

                if data.get('success') and 'markdown' in data.get('data', {}):
                    combined_content += data['data']['markdown'] + "\n\n"
                else:
                    print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")

            except requests.RequestException as e:
                print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
                continue  # Continue to the next URL

        if not combined_content:
            return "Could not retrieve content from any of the search results."

        # Use Groq LLM to generate the report
        prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:

{topic}

Use the following pieces of information, gathered from various web sources, to construct your report:

{combined_content}

Compile and synthesize the information to create a well-structured and informative research report. Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context. Do not hallucinate. Do not make anything up.
"""

        groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
        response = groq_client.chat.completions.create(
            messages=[
                {"role": "user", "content": prompt}
            ],
            model="deepseek-r1-distill-llama-70b",
            temperature=0.2
        )
        report_text = response.choices[0].message.content
        # print(f"[DEBUG] Raw report from LLM:\n{report_text}") #keep this commented out.

        structured_report = generate_report(report_text)  # Use your report structuring function
        return structured_report

    except Exception as e:
        # Top-level boundary: log and return a user-facing message instead of raising.
        print(f"[ERROR] Error in research agent: {e}")
        return f"Sorry, I encountered an error during research: {e}"