siddhartharyaai committed
Commit cb1debc · verified · Parent(s): de89d5c

Update utils.py

Files changed (1):
  1. utils.py +86 -145
utils.py CHANGED
@@ -7,35 +7,17 @@ from bs4 import BeautifulSoup
 from typing import List, Literal, Optional
 from pydantic import BaseModel
 from pydub import AudioSegment, effects
-import yt_dlp
+from transformers import pipeline
 import tiktoken
-from groq import Groq
+from groq import Groq  # Retained for LLM interaction
 import numpy as np
-import torch  # Moved to the top
-from transformers import pipeline  # Moved to the top, since it's used before other things
+import torch
 import random
-from tavily import TavilyClient  # Moved
-from report_structure import generate_report  # Import report structure
-
-# --- Add the cloned repository to the Python path ---
-repo_path = os.path.join('/home', 'user', 'open_deep_research')
-print(f"DEBUG: repo_path = {repo_path}")  # Debug print - keep this for now
-if repo_path not in sys.path:
-    print("DEBUG: Adding repo_path to sys.path")  # Debug print - keep this
-    sys.path.insert(0, repo_path)
-else:
-    print("DEBUG: repo_path already in sys.path")  # Debug print - keep this for now
-print(f"DEBUG: sys.path = {sys.path}")  # Debug print - keep this for now
-
-# --- CORRECT IMPORT (for local cloned repo) ---
-try:
-    from open_deep_research.agent import OpenDeepResearchAgent
-    print("DEBUG: Import successful!")
-except ImportError as e:
-    print(f"DEBUG: Import failed: {e}")
-    raise
-from report_structure import generate_report
+
+# --- CORRECT IMPORTS ---
+# No more sys.path modification!
+from report_structure import generate_report  # For report structuring
+from tavily import TavilyClient
 
 class DialogueItem(BaseModel):
     speaker: Literal["Jane", "John"]
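
Note: the deleted bootstrap existed only to make a locally cloned open_deep_research checkout importable by mutating sys.path at import time. Since the agent import is gone entirely, nothing needs to replace it; if the cloned package were ever needed again, an editable install (pip install -e /home/user/open_deep_research) would make it importable without any runtime path manipulation.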
@@ -60,28 +42,6 @@ def truncate_text(text, max_tokens=2048):
         return tokenizer.decode(tokens[:max_tokens])
     return text
 
-def extract_text_from_url(url):
-    # This function is retained for potential edge cases.
-    print("[LOG] Extracting text from URL (fallback method):", url)
-    try:
-        headers = {
-            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                           "AppleWebKit/537.36 (KHTML, like Gecko) "
-                           "Chrome/115.0.0.0 Safari/537.36")
-        }
-        response = requests.get(url, headers=headers)
-        if response.status_code != 200:
-            print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
-            return ""
-        soup = BeautifulSoup(response.text, 'html.parser')
-        for script in soup(["script", "style"]):
-            script.decompose()
-        text = soup.get_text(separator=' ')
-        print("[LOG] Text extraction from URL (fallback) successful.")
-        return text
-    except Exception as e:
-        print(f"[ERROR] Exception during text extraction from URL (fallback): {e}")
-        return ""
 
 def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     print(f"[LOG] Shifting pitch by {semitones} semitones.")
@@ -89,34 +49,15 @@ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
     return shifted_audio.set_frame_rate(audio.frame_rate)
 
-def is_sufficient(text: str, min_word_count: int = 500) -> bool:
-    # This function's role is reduced; the agent decides.
-    word_count = len(text.split())
-    print(f"[DEBUG] Aggregated word count: {word_count}")
-    return word_count >= min_word_count
-
-def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
-    # No longer needed
-    pass
-def research_topic(topic: str) -> str:
-    # No longer needed
-    pass
-
-def fetch_wikipedia_summary(topic: str) -> str:
-    # No longer needed
-    pass
-
-def fetch_rss_feed(feed_url: str) -> list:
-    # No longer needed
-    pass
-
-def find_relevant_article(items, topic: str, min_match=2) -> tuple:
-    # No longer needed
-    pass
-
-def fetch_article_text(link: str) -> str:
-    # No longer needed
-    pass
+# --- Functions no longer needed ---
+# def is_sufficient(...)
+# def query_llm_for_additional_info(...)
+# def research_topic(...)
+# def fetch_wikipedia_summary(...)
+# def fetch_rss_feed(...)
+# def find_relevant_article(...)
+# def fetch_article_text(...)
 
 
 def generate_script(
     system_prompt: str,
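
The pitch_shift context above relies on pydub's resample trick: reinterpret the same raw samples at a scaled frame rate, then restore the original rate. A minimal sketch, with the new_sample_rate computation (elided from the hunk) reconstructed under the usual equal-tempered assumption:

    from pydub import AudioSegment

    def pitch_shift_sketch(audio: AudioSegment, semitones: int) -> AudioSegment:
        # Assumed formula: one semitone is a frequency ratio of 2 ** (1 / 12).
        new_sample_rate = int(audio.frame_rate * (2 ** (semitones / 12.0)))
        # Same raw bytes at a new nominal rate: shifts pitch (and speed).
        shifted = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
        # Restore the canonical frame rate so downstream code sees a normal segment.
        return shifted.set_frame_rate(audio.frame_rate)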
@@ -129,7 +70,7 @@ def generate_script(
     sponsor_provided=None
 ):
     print("[LOG] Generating script with tone:", tone, "and length:", target_length)
-    import streamlit as st
+    import streamlit as st  # Import streamlit here, where it's used
     if (host_name == "Jane" or not host_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
         host_name = "Isha"
     if (guest_name == "John" or not guest_name) and st.session_state.get("language_selection") in ["English (Indian)", "Hinglish", "Hindi"]:
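
Moving the streamlit import inside generate_script keeps utils.py importable in contexts where Streamlit is not installed. An illustrative guard (not part of this commit) that makes the deferred-import pattern explicit:

    from typing import Optional

    def selected_language() -> Optional[str]:
        # Defer the import so non-Streamlit callers (tests, CLIs) still work.
        try:
            import streamlit as st
        except ImportError:
            return None
        return st.session_state.get("language_selection")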
@@ -483,73 +424,73 @@ def generate_script(
     return json.dumps(fallback)
 
 # --- Agent and Tavily Integration ---
-def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 20) -> str:
+def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
     """
     Runs the new research agent to generate a research report. This version uses
     Tavily for search and Firecrawl for content extraction.
     """
     print(f"[LOG] Starting research agent for topic: {topic}")
     try:
         tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
         search_results = tavily_client.search(query=topic, max_results=max_results).results
 
         if not search_results:
             return "No relevant search results found."
 
         print(f"[DEBUG] Tavily results: {search_results}")
 
         # Use Firecrawl to scrape the content of each URL
         combined_content = ""
         for result in search_results:
             url = result.url  # Use dot notation to access attributes
             print(f"[LOG] Scraping URL with Firecrawl: {url}")
             headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
             payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
             try:
                 response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
                 response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
                 data = response.json()
-                # print(f"[DEBUG] Firecrawl response: {data}")  # keep commented out
+                # print(f"[DEBUG] Firecrawl response: {data}")  # keep commented
 
                 if data.get('success') and 'markdown' in data.get('data', {}):
                     combined_content += data['data']['markdown'] + "\n\n"
                 else:
                     print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
 
             except requests.RequestException as e:
                 print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
                 continue  # Continue to the next URL
 
         if not combined_content:
             return "Could not retrieve content from any of the search results."
 
         # Use Groq LLM to generate the report
         prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:
 
         {topic}
 
         Use the following pieces of information, gathered from various web sources, to construct your report:
 
         {combined_content}
 
-        Compile and synthesize the information to create a well-structured and informative research report. Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context. Do not hallucinate. Do not make anything up.
+        Compile and synthesize the information to create a well-structured and informative research report. Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context. Do not hallucinate or make anything up.
         """
 
         groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
         response = groq_client.chat.completions.create(
             messages=[
                 {"role": "user", "content": prompt}
             ],
             model="deepseek-r1-distill-llama-70b",
             temperature=0.2
         )
         report_text = response.choices[0].message.content
-        # print(f"[DEBUG] Raw report from LLM:\n{report_text}")  # keep this commented out.
+        # print(f"[DEBUG] Raw report from LLM:\n{report_text}")  # Keep commented out unless you have a very specific reason
 
         structured_report = generate_report(report_text)  # Use your report structuring function
         return structured_report
 
     except Exception as e:
         print(f"[ERROR] Error in research agent: {e}")
         return f"Sorry, I encountered an error during research: {e}"
 