SearchPod1.0

Running

App Files Files Community

siddhartharyaai committed on Feb 9

Commit

de89d5c

verified ·

1 Parent(s): 7e05530

Update utils.py

Browse files

Files changed (1) hide show

utils.py +146 -76

utils.py CHANGED Viewed

@@ -4,16 +4,37 @@ import json
 import requests
 import tempfile
 from bs4 import BeautifulSoup
-from typing import List, Literal
 from pydantic import BaseModel
 from pydub import AudioSegment, effects
-from transformers import pipeline
 import tiktoken
 from groq import Groq
-import time
 from report_structure import generate_report  # Import report structure
-from tavily import TavilyClient
 class DialogueItem(BaseModel):
@@ -39,6 +60,28 @@ def truncate_text(text, max_tokens=2048):
         return tokenizer.decode(tokens[:max_tokens])
     return text
 def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     print(f"[LOG] Shifting pitch by {semitones} semitones.")
@@ -46,7 +89,34 @@ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
     return shifted_audio.set_frame_rate(audio.frame_rate)
 def generate_script(
     system_prompt: str,
@@ -173,7 +243,7 @@ def generate_script(
                     d["display_speaker"] = d["speaker"]
                 new_dialogue_items.append(DialogueItem(**d))
-        return Dialogue(dialogue=new_dialogue_items)
     except json.JSONDecodeError as e:
         print("[ERROR] JSON decoding (format) failed:", e)
         raise ValueError(f"Failed to parse dialogue: {str(e)}")
@@ -411,75 +481,75 @@ def generate_script(
             print("[ERROR] Groq API error:", e)
             fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
             return json.dumps(fallback)
-    def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
-        """
-        Runs the new research agent to generate a research report. This version uses
-        Tavily for search and Firecrawl for content extraction.
-        """
-        print(f"[LOG] Starting research agent for topic: {topic}")
-        try:
-            tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
-            search_results = tavily_client.search(query=topic, max_results=max_results).results
-            if not search_results:
-                return "No relevant search results found."
-            print(f"[DEBUG] Tavily results: {search_results}")
-            # Use Firecrawl to scrape the content of each URL
-            combined_content = ""
-            for result in search_results:
-                url = result.url  # Use dot notation to access attributes
-                print(f"[LOG] Scraping URL with Firecrawl: {url}")
-                headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
-                payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
-                try:
-                    response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
-                    response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
-                    data = response.json()
-                    #print(f"[DEBUG] Firecrawl response: {data}") #Uncomment for debugging
-                    if data.get('success') and 'markdown' in data.get('data', {}):
-                        combined_content += data['data']['markdown'] + "\n\n"
-                    else:
-                        print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
-                except requests.RequestException as e:
-                    print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
-                    continue  # Continue to the next URL
-            if not combined_content:
-                return "Could not retrieve content from any of the search results."
-            # Use Groq LLM to generate the report
-            prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:
-            {topic}
-            Use the following pieces of information, gathered from various web sources, to construct your report:
-            {combined_content}
-            Compile and synthesize the information to create a well-structured and informative research report.  Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context.
-            """
-            groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-            response = groq_client.chat.completions.create(
-                messages=[
-                    {"role": "user", "content": prompt}
-                ],
-                model="deepseek-r1-distill-llama-70b",
-                temperature = 0.2
-            )
-            report_text = response.choices[0].message.content
-            #print(f"[DEBUG] Raw report from LLM:\n{report_text}") #Keep this commented out unless you have a very specific reason
-            structured_report = generate_report(report_text)  # Use your report structuring function
-            return structured_report
-        except Exception as e:
-            print(f"[ERROR] Error in research agent: {e}")
-            return f"Sorry, I encountered an error during research: {e}"

 import requests
 import tempfile
 from bs4 import BeautifulSoup
+from typing import List, Literal, Optional
 from pydantic import BaseModel
 from pydub import AudioSegment, effects
+import yt_dlp
 import tiktoken
 from groq import Groq
+import numpy as np
+import torch # Moved to the top
+from transformers import pipeline # Moved to the top, since it's used before other things
+import random
+from tavily import TavilyClient #Moved
 from report_structure import generate_report  # Import report structure
+# --- Add the cloned repository to the Python path ---
+# Import-time side effect: everything in this section runs when the module is
+# imported, including the DEBUG prints below.
+# NOTE(review): relies on `os` and `sys` already being imported; those imports
+# are not visible in this hunk - confirm.
+# Hard-coded absolute location where the open_deep_research checkout is expected.
+repo_path = os.path.join('/home', 'user', 'open_deep_research')
+print(f"DEBUG: repo_path = {repo_path}")  # Debug print - keep this for now
+if repo_path not in sys.path:
+    print("DEBUG: Adding repo_path to sys.path")  # Debug print - keep this
+    # Inserted at index 0 so this directory is searched before the other entries.
+    sys.path.insert(0, repo_path)
+else:
+    print("DEBUG: repo_path already in sys.path") # Debug print - keep this for now
+print(f"DEBUG: sys.path = {sys.path}")  # Debug print - keep this for now
+# --- CORRECT IMPORT (for local cloned repo) ---
+# The ImportError is logged and then re-raised, so a missing or broken checkout
+# makes the import of this module fail as well.
+# NOTE(review): OpenDeepResearchAgent is not referenced in the parts of the file
+# visible in this diff - confirm it is still needed.
+try:
+    from open_deep_research.agent import OpenDeepResearchAgent
+    print("DEBUG: Import successful!")
+except ImportError as e:
+    print(f"DEBUG: Import failed: {e}")
+    raise
+from report_structure import generate_report
 class DialogueItem(BaseModel):
         return tokenizer.decode(tokens[:max_tokens])
     return text
+def extract_text_from_url(url, timeout=15):
+    """
+    Fetch ``url`` and return the text of the page (fallback extraction path).
+
+    The page is requested with a desktop-browser User-Agent, ``<script>`` and
+    ``<style>`` elements are removed, and the remaining text nodes are joined
+    with single spaces.
+
+    Args:
+        url: Address of the page to download.
+        timeout: Seconds to wait for the server before giving up. Without a
+            timeout ``requests`` waits indefinitely on a stalled connection.
+
+    Returns:
+        The extracted text, or ``""`` when the response status is not 200 or
+        any exception (including a timeout) occurs. Failures are logged with
+        ``print`` and never raised to the caller.
+    """
+    # This function is retained for potential edge cases.
+    print("[LOG] Extracting text from URL (fallback method):", url)
+    try:
+        headers = {
+            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                           "AppleWebKit/537.36 (KHTML, like Gecko) "
+                           "Chrome/115.0.0.0 Safari/537.36")
+        }
+        # Bounded wait: a timeout surfaces as an exception handled below.
+        response = requests.get(url, headers=headers, timeout=timeout)
+        if response.status_code != 200:
+            print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
+            return ""
+        soup = BeautifulSoup(response.text, 'html.parser')
+        # Drop non-visible content before collecting text.
+        for tag in soup(["script", "style"]):
+            tag.decompose()
+        text = soup.get_text(separator=' ')
+        print("[LOG] Text extraction from URL (fallback) successful.")
+        return text
+    except Exception as e:
+        # Deliberate best-effort: this fallback must not break the caller.
+        print(f"[ERROR] Exception during text extraction from URL (fallback): {e}")
+        return ""
 def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     print(f"[LOG] Shifting pitch by {semitones} semitones.")
     shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
     return shifted_audio.set_frame_rate(audio.frame_rate)
+def is_sufficient(text: str, min_word_count: int = 500) -> bool:
+    """
+    Report whether ``text`` holds at least ``min_word_count`` words.
+
+    Words are the whitespace-separated tokens produced by ``str.split()``.
+    The computed count is printed as a debug line on every call.
+    """
+    # This function's role is reduced; the agent decides.
+    tokens = text.split()
+    word_count = len(tokens)
+    print(f"[DEBUG] Aggregated word count: {word_count}")
+    return not word_count < min_word_count
+def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
+    """
+    Retired placeholder marked "no longer needed" in this revision.
+
+    Performs no work; both arguments are ignored and the call yields ``None``
+    despite the ``str`` return annotation.
+    """
+    return None
+def research_topic(topic: str) -> str:
+    """
+    Retired placeholder marked "no longer needed" in this revision.
+
+    Performs no work; ``topic`` is ignored and the call yields ``None``
+    despite the ``str`` return annotation.
+    """
+    return None
+def fetch_wikipedia_summary(topic: str) -> str:
+    """
+    Retired placeholder marked "no longer needed" in this revision.
+
+    Performs no work; ``topic`` is ignored and the call yields ``None``
+    despite the ``str`` return annotation.
+    """
+    return None
+def fetch_rss_feed(feed_url: str) -> list:
+    """
+    Retired placeholder marked "no longer needed" in this revision.
+
+    Performs no work; ``feed_url`` is ignored and the call yields ``None``
+    despite the ``list`` return annotation.
+    """
+    return None
+def find_relevant_article(items, topic: str, min_match=2) -> tuple:
+    """
+    Retired placeholder marked "no longer needed" in this revision.
+
+    Performs no work; all arguments are ignored and the call yields ``None``
+    despite the ``tuple`` return annotation.
+    """
+    return None
+def fetch_article_text(link: str) -> str:
+    """
+    Retired placeholder marked "no longer needed" in this revision.
+
+    Performs no work; ``link`` is ignored and the call yields ``None``
+    despite the ``str`` return annotation.
+    """
+    return None
 def generate_script(
     system_prompt: str,
                     d["display_speaker"] = d["speaker"]
                 new_dialogue_items.append(DialogueItem(**d))
+            return Dialogue(dialogue=new_dialogue_items)
     except json.JSONDecodeError as e:
         print("[ERROR] JSON decoding (format) failed:", e)
         raise ValueError(f"Failed to parse dialogue: {str(e)}")
             print("[ERROR] Groq API error:", e)
             fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
             return json.dumps(fallback)
+    # --- Agent and Tavily Integration ---
+    def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 20) -> str:
+      """
+      Generate a research report on ``topic`` from live web sources.
+
+      Steps: Tavily search, Firecrawl scrape of every hit (markdown, main
+      content only), one Groq chat completion over the combined text, then
+      ``generate_report`` on the model output.
+
+      Args:
+          topic: Search query and subject of the report.
+          report_type: Not used by this implementation.
+          max_results: Maximum number of results requested from Tavily.
+
+      Returns:
+          Whatever ``generate_report`` returns for the model output, or a plain
+          message string when the search is empty, nothing could be scraped, or
+          an exception was caught. Exceptions are logged, not raised.
+      """
+      print(f"[LOG] Starting research agent for topic: {topic}")
+      try:
+          tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
+          search_response = tavily_client.search(query=topic, max_results=max_results)
+          # tavily-python returns a plain dict ({"results": [...]}), on which the
+          # previous `.results` attribute access raised AttributeError. Attribute
+          # access is kept as a fallback for object-style responses.
+          if isinstance(search_response, dict):
+              search_results = search_response.get("results")
+          else:
+              search_results = getattr(search_response, "results", None)
+          if not search_results:
+              return "No relevant search results found."
+          print(f"[DEBUG] Tavily results: {search_results}")
+          # Use Firecrawl to scrape the content of each URL
+          # The auth header is the same for every request, so build it once.
+          headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
+          scraped_pages = []
+          for result in search_results:
+              # Each hit is a dict in tavily-python; tolerate attribute-style hits too.
+              url = result.get("url") if isinstance(result, dict) else getattr(result, "url", None)
+              if not url:
+                  continue  # Nothing to scrape for this hit
+              print(f"[LOG] Scraping URL with Firecrawl: {url}")
+              payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
+              try:
+                  # Bounded wait so one stalled scrape cannot hang the whole run.
+                  response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload, timeout=60)
+                  response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+                  data = response.json()
+              except (requests.RequestException, ValueError) as e:
+                  # ValueError covers a non-JSON body on older requests versions.
+                  print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
+                  continue # Continue to the next URL
+              # `data` may be present but null on failure; treat that as "no content".
+              page_data = data.get('data') or {}
+              markdown = page_data.get('markdown')
+              if data.get('success') and markdown:
+                  scraped_pages.append(markdown)
+              else:
+                  print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
+          if not scraped_pages:
+              return "Could not retrieve content from any of the search results."
+          # Same layout as before: every page is followed by a blank line.
+          combined_content = "".join(page + "\n\n" for page in scraped_pages)
+          # Use Groq LLM to generate the report
+          prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:
+          {topic}
+          Use the following pieces of information, gathered from various web sources, to construct your report:
+          {combined_content}
+          Compile and synthesize the information to create a well-structured and informative research report.  Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context. Do not hallucinate. Do not make anything up.
+          """
+          groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+          response = groq_client.chat.completions.create(
+              messages=[
+                  {"role": "user", "content": prompt}
+              ],
+              model="deepseek-r1-distill-llama-70b",
+              temperature = 0.2
+          )
+          report_text = response.choices[0].message.content
+          structured_report = generate_report(report_text)  # Use your report structuring function
+          return structured_report
+      except Exception as e:
+          # Top-level boundary: report the failure as text instead of raising.
+          print(f"[ERROR] Error in research agent: {e}")
+          return f"Sorry, I encountered an error during research: {e}"