SearchPod1.0

Running

App Files Community

siddhartharyaai committed on Feb 9

Commit

2ae64da

verified ·

1 Parent(s): 22de709

Update utils.py

Browse files

Files changed (1) hide show

utils.py +71 -115

utils.py CHANGED Viewed

@@ -4,18 +4,15 @@ import json
 import requests
 import tempfile
 from bs4 import BeautifulSoup
-from typing import List, Literal, Optional
 from pydantic import BaseModel
 from pydub import AudioSegment, effects
 from transformers import pipeline
-import yt_dlp
 import tiktoken
 from groq import Groq
-import numpy as np
-import torch
-import random
 from tavily import TavilyClient
-from report_structure import generate_report
@@ -42,28 +39,6 @@ def truncate_text(text, max_tokens=2048):
         return tokenizer.decode(tokens[:max_tokens])
     return text
-def extract_text_from_url(url):
-    # Kept for potential user-provided URLs, but not primary.
-    print("[LOG] Extracting text from URL (fallback method):", url)
-    try:
-        headers = {
-            "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                           "AppleWebKit/537.36 (KHTML, like Gecko) "
-                           "Chrome/115.0.0.0 Safari/537.36")
-        }
-        response = requests.get(url, headers=headers)
-        if response.status_code != 200:
-            print(f"[ERROR] Failed to fetch URL: {url} with status code {response.status_code}")
-            return ""
-        soup = BeautifulSoup(response.text, 'html.parser')
-        for script in soup(["script", "style"]):
-            script.decompose()
-        text = soup.get_text(separator=' ')
-        print("[LOG] Text extraction from URL (fallback) successful.")
-        return text
-    except Exception as e:
-        print(f"[ERROR] Exception during text extraction from URL (fallback): {e}")
-        return ""
 def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     print(f"[LOG] Shifting pitch by {semitones} semitones.")
@@ -71,33 +46,8 @@ def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
     return shifted_audio.set_frame_rate(audio.frame_rate)
-def is_sufficient(text: str, min_word_count: int = 500) -> bool:
-    word_count = len(text.split())
-    print(f"[DEBUG] Aggregated word count: {word_count}")
-    return word_count >= min_word_count
-def query_llm_for_additional_info(topic: str, existing_text: str) -> str:
-    # No longer needed
-    pass
-def research_topic(topic: str) -> str:
-     # No longer needed
-     pass
-def fetch_wikipedia_summary(topic: str) -> str:
-    # No longer needed
-    pass
-def fetch_rss_feed(feed_url: str) -> list:
-    # No longer needed
-    pass
-def find_relevant_article(items, topic: str, min_match=2) -> tuple:
-    # No longer needed
-    pass
-def fetch_article_text(link: str) -> str:
-    # No longer needed
-    pass
 def generate_script(
     system_prompt: str,
     input_text: str,
@@ -223,7 +173,7 @@ def generate_script(
                     d["display_speaker"] = d["speaker"]
                 new_dialogue_items.append(DialogueItem(**d))
-            return Dialogue(dialogue=new_dialogue_items)
     except json.JSONDecodeError as e:
         print("[ERROR] JSON decoding (format) failed:", e)
         raise ValueError(f"Failed to parse dialogue: {str(e)}")
@@ -461,69 +411,75 @@ def generate_script(
             print("[ERROR] Groq API error:", e)
             fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
             return json.dumps(fallback)
-    # --- Agent and Tavily Integration ---
     def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
-      """
-      Runs the new research agent to generate a research report. This version uses
-      Tavily for search and Firecrawl for content extraction.
-      """
-      print(f"[LOG] Starting research agent for topic: {topic}")
-      try:
-        tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
-        search_results = tavily_client.search(query=topic, max_results=max_results).results
-        if not search_results:
-            return "No relevant search results found."
-        print(f"[DEBUG] Tavily results: {search_results}")
-        combined_content = ""
-        for result in search_results:
-            url = result.url  # Directly access 'url' attribute
-            print(f"[LOG] Scraping URL with Firecrawl: {url}")
-            headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
-            payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
-            try:
-                response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
-                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
-                data = response.json()
-                print(f"[DEBUG] Firecrawl response: {data}")
-                if data.get('success') and 'markdown' in data.get('data', {}):
-                    combined_content += data['data']['markdown'] + "\n\n"
-                else:
-                     print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
-            except requests.RequestException as e:
-                print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
-                continue # Continue to the next URL
-        if not combined_content:
-            return "Could not retrieve content from any of the search results."
-        # Use Groq LLM to generate the report
-        prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:
-        {topic}
-        Use the following pieces of information, gathered from various web sources, to construct your report:
-        {combined_content}
-        Compile and synthesize the information to create a well-structured and informative research report.
-        Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately.
-        """
-        groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-        response = groq_client.chat.completions.create(
-            messages=[
-                {"role": "user", "content": prompt}
-            ],
-             model="deepseek-r1-distill-llama-70b",
-            temperature = 0.2
-        )
-        report_text = response.choices[0].message.content
-        print(f"[DEBUG] Raw report from LLM:\n{report_text}")
-        structured_report = generate_report(report_text)  # Use your report structuring function
-        return structured_report
-      except Exception as e:
-        print(f"[ERROR] Error in research agent: {e}")
-        return f"Sorry, I encountered an error during research: {e}"

 import requests
 import tempfile
 from bs4 import BeautifulSoup
+from typing import List, Literal
 from pydantic import BaseModel
 from pydub import AudioSegment, effects
 from transformers import pipeline
 import tiktoken
 from groq import Groq
+import time
+from report_structure import generate_report  # Import report structure
 from tavily import TavilyClient
         return tokenizer.decode(tokens[:max_tokens])
     return text
 def pitch_shift(audio: AudioSegment, semitones: int) -> AudioSegment:
     print(f"[LOG] Shifting pitch by {semitones} semitones.")
     shifted_audio = audio._spawn(audio.raw_data, overrides={'frame_rate': new_sample_rate})
     return shifted_audio.set_frame_rate(audio.frame_rate)
 def generate_script(
     system_prompt: str,
     input_text: str,
                     d["display_speaker"] = d["speaker"]
                 new_dialogue_items.append(DialogueItem(**d))
+        return Dialogue(dialogue=new_dialogue_items)
     except json.JSONDecodeError as e:
         print("[ERROR] JSON decoding (format) failed:", e)
         raise ValueError(f"Failed to parse dialogue: {str(e)}")
             print("[ERROR] Groq API error:", e)
             fallback = {"speaker": "John", "text": "I'm sorry, I'm having trouble answering right now."}
             return json.dumps(fallback)
     def run_research_agent(topic: str, report_type: str = "research_report", max_results: int = 10) -> str:
+        """
+        Runs the new research agent to generate a research report. This version uses
+        Tavily for search and Firecrawl for content extraction.
+        """
+        print(f"[LOG] Starting research agent for topic: {topic}")
+        try:
+            tavily_client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))
+            search_results = tavily_client.search(query=topic, max_results=max_results).results
+            if not search_results:
+                return "No relevant search results found."
+            print(f"[DEBUG] Tavily results: {search_results}")
+            # Use Firecrawl to scrape the content of each URL
+            combined_content = ""
+            for result in search_results:
+                url = result.url  # Use dot notation to access attributes
+                print(f"[LOG] Scraping URL with Firecrawl: {url}")
+                headers = {'Authorization': f'Bearer {os.environ.get("FIRECRAWL_API_KEY")}'}
+                payload = {"url": url, "formats": ["markdown"], "onlyMainContent": True}
+                try:
+                    response = requests.post("https://api.firecrawl.dev/v1/scrape", headers=headers, json=payload)
+                    response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+                    data = response.json()
+                    #print(f"[DEBUG] Firecrawl response: {data}") #Uncomment for debugging
+                    if data.get('success') and 'markdown' in data.get('data', {}):
+                        combined_content += data['data']['markdown'] + "\n\n"
+                    else:
+                        print(f"[WARNING] Firecrawl scrape failed or no markdown content for {url}: {data.get('error')}")
+                except requests.RequestException as e:
+                    print(f"[ERROR] Error during Firecrawl request for {url}: {e}")
+                    continue  # Continue to the next URL
+            if not combined_content:
+                return "Could not retrieve content from any of the search results."
+            # Use Groq LLM to generate the report
+            prompt = f"""You are a world-class researcher, and you are tasked to write a comprehensive research report on the following topic:
+            {topic}
+            Use the following pieces of information, gathered from various web sources, to construct your report:
+            {combined_content}
+            Compile and synthesize the information to create a well-structured and informative research report.  Include a title, introduction, main body with clearly defined sections, and a conclusion. Cite sources appropriately in the context.
+            """
+            groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+            response = groq_client.chat.completions.create(
+                messages=[
+                    {"role": "user", "content": prompt}
+                ],
+                model="deepseek-r1-distill-llama-70b",
+                temperature = 0.2
+            )
+            report_text = response.choices[0].message.content
+            #print(f"[DEBUG] Raw report from LLM:\n{report_text}") #Keep this commented out unless you have a very specific reason
+            structured_report = generate_report(report_text)  # Use your report structuring function
+            return structured_report
+        except Exception as e:
+            print(f"[ERROR] Error in research agent: {e}")
+            return f"Sorry, I encountered an error during research: {e}"