siddhartharyaai committed on
Commit
2ae64da
·
verified ·
1 Parent(s): 22de709

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +71 -115
utils.py CHANGED
@@ -4,18 +4,15 @@ import json
4
  import requests
5
  import tempfile
6
  from bs4 import BeautifulSoup
7
- from typing import List, Literal, Optional
8
  from pydantic import BaseModel
9
  from pydub import AudioSegment, effects
10
  from transformers import pipeline
11
- import yt_dlp
12
  import tiktoken
13
  from groq import Groq
14
- import numpy as np
15
- import torch
16
- import random
17
  from tavily import TavilyClient
18
- from report_structure import generate_report
19
 
20
 
21
 
@@ -42,28 +39,6 @@ def truncate_text(text, max_tokens=2048):
42
  return tokenizer.decode(tokens[:max_tokens])
43
  return text
44
 
45
- def extract_text_from_url(url):
46
- # Kept for potential user-provided URLs, but not primary.
47
- print("[LOG] Extracting text from URL (fallback method):", url)
48
- try:
49
- headers = {
50
- "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
51
- "AppleWebKit/537.36 (KHTML, like Gecko) "
52
- "Chrome/115.0.0.0 Safari/537.36")
53
- }
54
- response = requests.get(url, headers=headers)
55
- if response.status_code != 200:
56
- print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
57
- return ""
58
- soup = BeautifulSoup(response.text, 'html.parser')
59
- for script in soup(["script", "style"]):
60
- script.decompose()
61
- text = soup.get_text(separator=' ')
62
- print("[LOG] Text extraction from URL (fallback) successful.")
63
- return text
64
- except Exception as e:
65
- print(f"[ERROR] Exception during text extraction from URL (fallback): {e}")
66
- return ""
67
 
68
  def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
69
  print(f"[LOG] Shifting pitch by {semitones} semitones.")
@@ -71,33 +46,8 @@ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
71
  shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
72
  return shifted_audio.set_frame_rate(audio.frame_rate)
73
 
74
- def is_sufficient(text: str, min_word_count: int = 500) -> bool:
75
- word_count = len(text.split())
76
- print(f"[DEBUG] Aggregated word count: {word_count}")
77
- return word_count >= min_word_count
78
-
79
- def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
80
- # No longer needed
81
- pass
82
- def research_topic(topic: str) -> str:
83
- # No longer needed
84
- pass
85
-
86
- def fetch_wikipedia_summary(topic: str) -> str:
87
- # No longer needed
88
- pass
89
-
90
- def fetch_rss_feed(feed_url: str) -> list:
91
- # No longer needed
92
- pass
93
-
94
- def find_relevant_article(items, topic: str, min_match=2) -> tuple:
95
- # No longer needed
96
- pass
97
-
98
- def fetch_article_text(link: str) -> str:
99
- # No longer needed
100
- pass
101
  def generate_script(
102
  system_prompt: str,
103
  input_text: str,
@@ -223,7 +173,7 @@ def generate_script(
223
  d["display_speaker"] = d["speaker"]
224
  new_dialogue_items.append(DialogueItem(**d))
225
 
226
- return Dialogue(dialogue=new_dialogue_items)
227
  except json.JSONDecodeError as e:
228
  print("[ERROR] JSON decoding (format) failed:", e)
229
  raise ValueError(f"Failed to parse dialogue: {str(e)}")
@@ -461,69 +411,75 @@ def generate_script(
461
  print("[ERROR] Groq API error:", e)
462
  fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
463
  return json.dumps(fallback)
464
-
465
- # --- Agent and Tavily Integration ---
466
  def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
467
- """
468
- Runs the new research agent to generate a research report. This version uses
469
- Tavily for search and Firecrawl for content extraction.
470
- """
471
- print(f"[LOG] Starting research agent for topic: {topic}")
472
- try:
473
- tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
474
- search_results = tavily_client.search(query=topic, max_results=max_results).results
475
- if not search_results:
476
- return "No relevant search results found."
477
- print(f"[DEBUG] Tavily results: {search_results}")
478
- combined_content = ""
479
- for result in search_results:
480
- url = result.url # Directly access 'url' attribute
481
- print(f"[LOG] Scraping URL with Firecrawl: {url}")
482
- headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
483
- payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
484
- try:
485
- response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
486
- response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
487
- data = response.json()
488
- print(f"[DEBUG] Firecrawl response: {data}")
489
- if data.get('success') and 'markdown' in data.get('data', {}):
490
- combined_content += data['data']['markdown'] + "\n\n"
491
- else:
492
- print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
493
- except requests.RequestException as e:
494
- print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
495
- continue # Continue to the next URL
496
- if not combined_content:
497
- return "Could not retrieve content from any of the search results."
498
 
499
- # Use Groq LLM to generate the report
 
500
 
501
- prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:
502
 
503
- {topic}
 
 
 
 
 
 
504
 
505
- Use the following pieces of information, gathered from various web sources, to construct your report:
 
 
 
 
506
 
507
- {combined_content}
 
 
 
508
 
509
- Compile and synthesize the information to create a well-structured and informative research report.
510
- Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately.
511
- """
512
- groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
513
-
514
- response = groq_client.chat.completions.create(
515
- messages=[
516
- {"role": "user", "content": prompt}
517
- ],
518
- model="deepseek-r1-distill-llama-70b",
519
- temperature = 0.2
520
- )
521
- report_text = response.choices[0].message.content
522
- print(f"[DEBUG] Raw report from LLM:\n{report_text}")
523
-
524
- structured_report = generate_report(report_text) # Use your report structuring function
525
- return structured_report
526
-
527
- except Exception as e:
528
- print(f"[ERROR] Error in research agent: {e}")
529
- return f"Sorry, I encountered an error during research: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import requests
5
  import tempfile
6
  from bs4 import BeautifulSoup
7
+ from typing import List, Literal
8
  from pydantic import BaseModel
9
  from pydub import AudioSegment, effects
10
  from transformers import pipeline
 
11
  import tiktoken
12
  from groq import Groq
13
+ import time
14
+ from report_structure import generate_report # Import report structure
 
15
  from tavily import TavilyClient
 
16
 
17
 
18
 
 
39
  return tokenizer.decode(tokens[:max_tokens])
40
  return text
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
44
  print(f"[LOG] Shifting pitch by {semitones} semitones.")
 
46
  shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
47
  return shifted_audio.set_frame_rate(audio.frame_rate)
48
 
49
+
50
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def generate_script(
52
  system_prompt: str,
53
  input_text: str,
 
173
  d["display_speaker"] = d["speaker"]
174
  new_dialogue_items.append(DialogueItem(**d))
175
 
176
+ return Dialogue(dialogue=new_dialogue_items)
177
  except json.JSONDecodeError as e:
178
  print("[ERROR] JSON decoding (format) failed:", e)
179
  raise ValueError(f"Failed to parse dialogue: {str(e)}")
 
411
  print("[ERROR] Groq API error:", e)
412
  fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
413
  return json.dumps(fallback)
414
+
 
415
  def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
416
+ """
417
+ Runs the new research agent to generate a research report. This version uses
418
+ Tavily for search and Firecrawl for content extraction.
419
+ """
420
+ print(f"[LOG] Starting research agent for topic: {topic}")
421
+ try:
422
+ tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
423
+ search_results = tavily_client.search(query=topic, max_results=max_results).results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
 
425
+ if not search_results:
426
+ return "No relevant search results found."
427
 
428
+ print(f"[DEBUG] Tavily results: {search_results}")
429
 
430
+ # Use Firecrawl to scrape the content of each URL
431
+ combined_content = ""
432
+ for result in search_results:
433
+ url = result.url # Use dot notation to access attributes
434
+ print(f"[LOG] Scraping URL with Firecrawl: {url}")
435
+ headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
436
+ payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
437
 
438
+ try:
439
+ response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
440
+ response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
441
+ data = response.json()
442
+ #print(f"[DEBUG] Firecrawl response: {data}") #Uncomment for debugging
443
 
444
+ if data.get('success') and 'markdown' in data.get('data', {}):
445
+ combined_content += data['data']['markdown'] + "\n\n"
446
+ else:
447
+ print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
448
 
449
+ except requests.RequestException as e:
450
+ print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
451
+ continue # Continue to the next URL
452
+
453
+ if not combined_content:
454
+ return "Could not retrieve content from any of the search results."
455
+
456
+ # Use Groq LLM to generate the report
457
+ prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:
458
+
459
+ {topic}
460
+
461
+ Use the following pieces of information, gathered from various web sources, to construct your report:
462
+
463
+ {combined_content}
464
+
465
+ Compile and synthesize the information to create a well-structured and informative research report. Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context.
466
+ """
467
+
468
+ groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
469
+ response = groq_client.chat.completions.create(
470
+ messages=[
471
+ {"role": "user", "content": prompt}
472
+ ],
473
+ model="deepseek-r1-distill-llama-70b",
474
+ temperature = 0.2
475
+ )
476
+ report_text = response.choices[0].message.content
477
+ #print(f"[DEBUG] Raw report from LLM:\n{report_text}") #Keep this commented out unless you have a very specific reason
478
+
479
+ structured_report = generate_report(report_text) # Use your report structuring function
480
+ return structured_report
481
+
482
+
483
+ except Exception as e:
484
+ print(f"[ERROR] Error in research agent: {e}")
485
+ return f"Sorry, I encountered an error during research: {e}"