Update utils.py
utils.py CHANGED
@@ -4,6 +4,7 @@ import json
 import requests
 import tempfile
 import random
+import math
 import numpy as np
 import torch
 import time
@@ -42,9 +43,10 @@ class Dialogue(BaseModel):
 def call_llm_with_retry(groq_client, **payload):
     """
     Wraps groq_client.chat.completions.create(**payload) in a retry loop
-    to catch
-
-    after each successful call
+    to catch rate-limit errors or service unavailable (503) errors.
+    If we see "try again in XXs," or detect a 503 error, we parse the wait time,
+    sleep, then retry. We also do a short sleep (0.3s) after each successful call
+    to spread usage.
     """
     max_retries = 3
     for attempt in range(max_retries):
@@ -58,13 +60,17 @@ def call_llm_with_retry(groq_client, **payload):
         except Exception as e:
             err_str = str(e).lower()
             print(f"[WARN] call_llm_with_retry attempt {attempt+1} failed: {e}")
-
-
+            # Check for rate limit or service unavailable errors.
+            if ("rate_limit_exceeded" in err_str or "try again in" in err_str or "503" in err_str):
                 wait_time = 60.0
                 match = re.search(r'try again in (\d+(?:\.\d+)?)s', str(e), re.IGNORECASE)
                 if match:
                     wait_time = float(match.group(1)) + 1.0
-
+                else:
+                    # If a 503 error, use default wait time.
+                    if "503" in err_str:
+                        wait_time = 60.0
+                print(f"[WARN] Detected error (rate limit or 503). Sleeping for {wait_time:.1f}s, then retrying.")
                 time.sleep(wait_time)
             else:
                 raise
@@ -497,9 +503,9 @@ def run_research_agent(
     Low-Call approach:
      1) Tavily search (up to 20 URLs).
      2) Firecrawl scrape => combined text
-     3)
-     4) Split into chunks (each 4500 tokens) => Summarize each chunk individually => summaries
-     5) Single final merge => final PDF
+     3) Use the full combined text without truncation.
+     4) Split into chunks (each 4500 tokens) => Summarize each chunk individually => summaries.
+     5) Single final merge => final PDF.
      => 2 or more total LLM calls (but no more than 10) to reduce the chance of rate limit errors.
     """
     print(f"[LOG] Starting LOW-CALL research agent for topic: {topic}")
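
To make the call budget in this docstring concrete, a quick illustrative calculation (the 30,000-token input size is made up for the example; only chunk_size = 4500 and the single final merge come from the code):

```python
import math

# Illustrative numbers only: suppose scraping yields ~30,000 tokens of combined text.
total_tokens = 30_000
chunk_size = 4500
chunks = math.ceil(total_tokens / chunk_size)  # -> 7 chunks
llm_calls = chunks + 1                         # 7 chunk summaries + 1 final merge = 8 calls
print(chunks, llm_calls)
```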
@@ -544,15 +550,16 @@
     # Step 2.5: Input Sanitization - Remove any chain-of-thought markers from the scraped content.
     combined_content = re.sub(r"<think>.*?</think>", "", combined_content, flags=re.DOTALL)

-    # Step 3:
-
-
+    # Step 3: Use the full combined text without truncation.
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    total_tokens = len(tokenizer.encode(combined_content))
+    print(f"[LOG] Step 3: Using the full combined text without truncation. Total tokens: {total_tokens}")

     # Step 4: Splitting text into chunks (4500 tokens each) and summarizing each chunk.
-    print("[LOG] Step 4: Splitting text into chunks (4500 tokens each). Summarizing each chunk.")
-    tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(combined_content)
-    chunk_size = 4500 #
+    chunk_size = 4500 # Each chunk is 4500 tokens or less.
+    total_chunks = math.ceil(len(tokens) / chunk_size)
+    print(f"[LOG] Step 4: Splitting text into chunks of up to 4500 tokens. Total chunks: {total_chunks}")
     max_chunks = 10 # Allow up to 10 chunks (and thus 10 LLM calls).
     summaries = []
     start = 0
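
The hunk stops at start = 0, so the loop that consumes these variables is not shown. A minimal sketch of how the per-chunk summarization plausibly proceeds, written as a standalone function; the function name, prompt wording, and model_name parameter are assumptions, while chunk_size, max_chunks, the cl100k_base tokenizer, and call_llm_with_retry come from the commit:

```python
import tiktoken

def summarize_in_chunks(groq_client, combined_content, model_name, chunk_size=4500, max_chunks=10):
    """Sketch of the per-chunk summarization implied by the diff (names are placeholders)."""
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(combined_content)
    summaries = []
    start = 0
    while start < len(tokens) and len(summaries) < max_chunks:
        chunk_text = tokenizer.decode(tokens[start:start + chunk_size])
        payload = {
            "model": model_name,
            "messages": [
                {"role": "system", "content": "Summarize the following research material."},
                {"role": "user", "content": chunk_text},
            ],
        }
        response = call_llm_with_retry(groq_client, **payload)
        summaries.append(response.choices[0].message.content)
        start += chunk_size
    return summaries
```

A single final merge call over the joined summaries would then produce the report, which is what keeps the total number of LLM calls small.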