Update utils.py
utils.py CHANGED
@@ -4,6 +4,7 @@ import json
 import requests
 import tempfile
 import random
+import math
 import numpy as np
 import torch
 import time
@@ -42,9 +43,10 @@ class Dialogue(BaseModel):
 def call_llm_with_retry(groq_client, **payload):
     """
     Wraps groq_client.chat.completions.create(**payload) in a retry loop
-    to catch
-
-    after each successful call
+    to catch rate-limit errors or service unavailable (503) errors.
+    If we see "try again in XXs," or detect a 503 error, we parse the wait time,
+    sleep, then retry. We also do a short sleep (0.3s) after each successful call
+    to spread usage.
     """
     max_retries = 3
     for attempt in range(max_retries):
@@ -58,13 +60,17 @@ def call_llm_with_retry(groq_client, **payload):
         except Exception as e:
             err_str = str(e).lower()
             print(f"[WARN] call_llm_with_retry attempt {attempt+1} failed: {e}")
-
-
+            # Check for rate limit or service unavailable errors.
+            if ("rate_limit_exceeded" in err_str or "try again in" in err_str or "503" in err_str):
                 wait_time = 60.0
                 match = re.search(r'try again in (\d+(?:\.\d+)?)s', str(e), re.IGNORECASE)
                 if match:
                     wait_time = float(match.group(1)) + 1.0
-
+                else:
+                    # If a 503 error, use default wait time.
+                    if "503" in err_str:
+                        wait_time = 60.0
+                print(f"[WARN] Detected error (rate limit or 503). Sleeping for {wait_time:.1f}s, then retrying.")
                 time.sleep(wait_time)
             else:
                 raise
@@ -497,9 +503,9 @@ def run_research_agent(
     Low-Call approach:
      1) Tavily search (up to 20 URLs).
      2) Firecrawl scrape => combined text
-     3)
-     4) Split into chunks (each 4500 tokens) => Summarize each chunk individually => summaries
-     5) Single final merge => final PDF
+     3) Use the full combined text without truncation.
+     4) Split into chunks (each 4500 tokens) => Summarize each chunk individually => summaries.
+     5) Single final merge => final PDF.
      => 2 or more total LLM calls (but no more than 10) to reduce the chance of rate limit errors.
     """
     print(f"[LOG] Starting LOW-CALL research agent for topic: {topic}")
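
To make the call budget in this docstring concrete, a quick illustrative calculation (the 30,000-token input size is made up for the example; only chunk_size = 4500 and the single final merge come from the code):

```python
import math

# Illustrative numbers only: suppose scraping yields ~30,000 tokens of combined text.
total_tokens = 30_000
chunk_size = 4500
chunks = math.ceil(total_tokens / chunk_size)  # -> 7 chunks
llm_calls = chunks + 1                         # 7 chunk summaries + 1 final merge = 8 calls
print(chunks, llm_calls)
```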
@@ -544,15 +550,16 @@
     # Step 2.5: Input Sanitization - Remove any chain-of-thought markers from the scraped content.
     combined_content = re.sub(r"<think>.*?</think>", "", combined_content, flags=re.DOTALL)

-    # Step 3:
-
-
+    # Step 3: Use the full combined text without truncation.
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    total_tokens = len(tokenizer.encode(combined_content))
+    print(f"[LOG] Step 3: Using the full combined text without truncation. Total tokens: {total_tokens}")

     # Step 4: Splitting text into chunks (4500 tokens each) and summarizing each chunk.
-    print("[LOG] Step 4: Splitting text into chunks (4500 tokens each). Summarizing each chunk.")
-    tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(combined_content)
-    chunk_size = 4500 #
+    chunk_size = 4500 # Each chunk is 4500 tokens or less.
+    total_chunks = math.ceil(len(tokens) / chunk_size)
+    print(f"[LOG] Step 4: Splitting text into chunks of up to 4500 tokens. Total chunks: {total_chunks}")
     max_chunks = 10 # Allow up to 10 chunks (and thus 10 LLM calls).
     summaries = []
     start = 0
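
The hunk stops at start = 0, so the loop that consumes these variables is not shown. A minimal sketch of how the per-chunk summarization plausibly proceeds, written as a standalone function; the function name, prompt wording, and model_name parameter are assumptions, while chunk_size, max_chunks, the cl100k_base tokenizer, and call_llm_with_retry come from the commit:

```python
import tiktoken

def summarize_in_chunks(groq_client, combined_content, model_name, chunk_size=4500, max_chunks=10):
    """Sketch of the per-chunk summarization implied by the diff (names are placeholders)."""
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(combined_content)
    summaries = []
    start = 0
    while start < len(tokens) and len(summaries) < max_chunks:
        chunk_text = tokenizer.decode(tokens[start:start + chunk_size])
        payload = {
            "model": model_name,
            "messages": [
                {"role": "system", "content": "Summarize the following research material."},
                {"role": "user", "content": chunk_text},
            ],
        }
        response = call_llm_with_retry(groq_client, **payload)
        summaries.append(response.choices[0].message.content)
        start += chunk_size
    return summaries
```

A single final merge call over the joined summaries would then produce the report, which is what keeps the total number of LLM calls small.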