siddhartharyaai committed 3290a02 (verified) · 1 parent: f09db25

Update utils.py

Files changed (1): utils.py +22 -15
utils.py CHANGED
@@ -4,6 +4,7 @@ import json
 import requests
 import tempfile
 import random
+import math
 import numpy as np
 import torch
 import time
@@ -42,9 +43,10 @@ class Dialogue(BaseModel):
 def call_llm_with_retry(groq_client, **payload):
     """
     Wraps groq_client.chat.completions.create(**payload) in a retry loop
-    to catch 429 rate-limit errors. If we see “try again in XXs,” we parse
-    that wait time, sleep, then retry. We also do a short sleep (0.3s)
-    after each successful call to spread usage.
+    to catch rate-limit errors or service unavailable (503) errors.
+    If we see “try again in XXs,” or detect a 503 error, we parse the wait time,
+    sleep, then retry. We also do a short sleep (0.3s) after each successful call
+    to spread usage.
     """
     max_retries = 3
     for attempt in range(max_retries):
@@ -58,13 +60,17 @@ def call_llm_with_retry(groq_client, **payload):
         except Exception as e:
             err_str = str(e).lower()
             print(f"[WARN] call_llm_with_retry attempt {attempt+1} failed: {e}")
-            if "rate_limit_exceeded" in err_str or "try again in" in err_str:
-                # parse recommended wait time
+            # Check for rate limit or service unavailable errors.
+            if ("rate_limit_exceeded" in err_str or "try again in" in err_str or "503" in err_str):
                 wait_time = 60.0
                 match = re.search(r'try again in (\d+(?:\.\d+)?)s', str(e), re.IGNORECASE)
                 if match:
                     wait_time = float(match.group(1)) + 1.0
-                print(f"[WARN] Rate limited. Sleeping for {wait_time:.1f}s, then retrying.")
+                else:
+                    # If a 503 error, use default wait time.
+                    if "503" in err_str:
+                        wait_time = 60.0
+                print(f"[WARN] Detected error (rate limit or 503). Sleeping for {wait_time:.1f}s, then retrying.")
                 time.sleep(wait_time)
             else:
                 raise
@@ -497,9 +503,9 @@ def run_research_agent(
     Low-Call approach:
       1) Tavily search (up to 20 URLs).
       2) Firecrawl scrape => combined text
-      3) Truncate to 12k tokens total
-      4) Split into chunks (each 4500 tokens) => Summarize each chunk individually => summaries
-      5) Single final merge => final PDF
+      3) Use the full combined text without truncation.
+      4) Split into chunks (each 4500 tokens) => Summarize each chunk individually => summaries.
+      5) Single final merge => final PDF.
     => 2 or more total LLM calls (but no more than 10) to reduce the chance of rate limit errors.
     """
     print(f"[LOG] Starting LOW-CALL research agent for topic: {topic}")
@@ -544,15 +550,16 @@
     # Step 2.5: Input Sanitization - Remove any chain-of-thought markers from the scraped content.
     combined_content = re.sub(r"<think>.*?</think>", "", combined_content, flags=re.DOTALL)
 
-    # Step 3: Truncate to 12k tokens total
-    print("[LOG] Step 3: Truncating combined text to 12,000 tokens if needed.")
-    combined_content = truncate_text_tokens(combined_content, max_tokens=12000)
+    # Step 3: Use the full combined text without truncation.
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    total_tokens = len(tokenizer.encode(combined_content))
+    print(f"[LOG] Step 3: Using the full combined text without truncation. Total tokens: {total_tokens}")
 
     # Step 4: Splitting text into chunks (4500 tokens each) and summarizing each chunk.
-    print("[LOG] Step 4: Splitting text into chunks (4500 tokens each). Summarizing each chunk.")
-    tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(combined_content)
-    chunk_size = 4500  # Reduced chunk size to avoid exceeding the LLM's TPM limit.
+    chunk_size = 4500  # Each chunk is 4500 tokens or less.
+    total_chunks = math.ceil(len(tokens) / chunk_size)
+    print(f"[LOG] Step 4: Splitting text into chunks of up to 4500 tokens. Total chunks: {total_chunks}")
     max_chunks = 10  # Allow up to 10 chunks (and thus 10 LLM calls).
     summaries = []
     start = 0
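For readers who want the end state rather than the delta, the second and third hunks leave call_llm_with_retry looking roughly like the sketch below. It reassembles the diff's own lines; the try/success path (the 0.3s post-success sleep appears only in the docstring) and the final raise after the loop are assumptions, since the diff does not show them.

import re
import time

def call_llm_with_retry(groq_client, **payload):
    """
    Wraps groq_client.chat.completions.create(**payload) in a retry loop
    to catch rate-limit errors or service unavailable (503) errors.
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = groq_client.chat.completions.create(**payload)
            time.sleep(0.3)  # short sleep after success to spread usage (per the docstring)
            return response
        except Exception as e:
            err_str = str(e).lower()
            print(f"[WARN] call_llm_with_retry attempt {attempt+1} failed: {e}")
            # Retry on rate-limit or 503 errors; re-raise anything else.
            if "rate_limit_exceeded" in err_str or "try again in" in err_str or "503" in err_str:
                wait_time = 60.0  # default, also used for bare 503 errors
                match = re.search(r'try again in (\d+(?:\.\d+)?)s', str(e), re.IGNORECASE)
                if match:
                    wait_time = float(match.group(1)) + 1.0
                print(f"[WARN] Detected error (rate limit or 503). Sleeping for {wait_time:.1f}s, then retrying.")
                time.sleep(wait_time)
            else:
                raise
    # Assumed behavior: surface a clear error once retries are exhausted.
    raise RuntimeError("call_llm_with_retry: all retries failed")

One caveat when reusing the pattern: matching the substring "503" in the stringified exception is coarse, since it also fires on any error message that merely contains those digits.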
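Step 2.5 in run_research_agent sanitizes the scraped text with a single regex. A quick illustration of what it strips (the sample string is invented):

import re

raw = "Intro text. <think>model's private chain-of-thought,\nspanning lines</think> Public conclusion."
clean = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)
print(clean)  # -> "Intro text.  Public conclusion."

The non-greedy .*? combined with re.DOTALL removes each <think>…</think> block even when it spans multiple lines, without swallowing legitimate text between two separate blocks.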
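The new Step 3/4 code counts tokens and computes the chunk count; the actual slicing presumably happens in the loop that begins after start = 0, which the diff cuts off. A self-contained sketch of that token-based split, assuming the same cl100k_base encoding, 4500-token chunk size, and 10-chunk cap as the diff:

import math
import tiktoken

def split_into_token_chunks(text: str, chunk_size: int = 4500, max_chunks: int = 10):
    """Split text into consecutive chunks of at most chunk_size tokens."""
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    total_chunks = math.ceil(len(tokens) / chunk_size)
    print(f"[LOG] {len(tokens)} tokens -> {total_chunks} chunk(s)")
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        if len(chunks) >= max_chunks:
            break  # cap total LLM calls, mirroring max_chunks = 10 in the diff
        chunks.append(tokenizer.decode(tokens[start:start + chunk_size]))
    return chunks

Note the trade-off this commit makes: with the old 12k-token truncation removed but the 10-chunk cap kept, input beyond roughly 45,000 tokens would go unsummarized if the loop simply stops at the cap.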
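The diff ends at start = 0, so the per-chunk summarization loop itself is not visible. A hypothetical continuation that wires the pieces together through call_llm_with_retry and the split helper above; the model name, prompts, and response shape (assumed OpenAI-compatible) are placeholders, not taken from the commit:

# Hypothetical continuation (combined_content and groq_client come from the diff's context).
summaries = []
for i, chunk_text in enumerate(split_into_token_chunks(combined_content), start=1):
    response = call_llm_with_retry(
        groq_client,
        model="llama3-70b-8192",  # placeholder model name, not from the diff
        messages=[
            {"role": "system", "content": "Summarize the following text concisely."},
            {"role": "user", "content": chunk_text},
        ],
    )
    summaries.append(response.choices[0].message.content)
    print(f"[LOG] Summarized chunk {i}")

Per step 5 of the docstring, a single final merge call would then combine summaries into the report that becomes the PDF.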