Update utils.py
utils.py CHANGED

@@ -497,7 +497,7 @@ def run_research_agent(
     Low-Call approach:
     1) Tavily search (up to 20 URLs).
     2) Firecrawl scrape => combined text
-    3)
+    3) Truncate to 12k tokens total
     4) Split into chunks (each 4500 tokens) => Summarize each chunk individually => summaries
     5) Single final merge => final PDF
     => 2 or more total LLM calls (but no more than 10) to reduce the chance of rate limit errors.

@@ -544,8 +544,9 @@ def run_research_agent(
     # Step 2.5: Input Sanitization - Remove any chain-of-thought markers from the scraped content.
     combined_content = re.sub(r"<think>.*?</think>", "", combined_content, flags=re.DOTALL)

-    # Step 3:
-    print("[LOG] Step 3:
+    # Step 3: Truncate to 12k tokens total
+    print("[LOG] Step 3: Truncating combined text to 12,000 tokens if needed.")
+    combined_content = truncate_text_tokens(combined_content, max_tokens=12000)

     # Step 4: Splitting text into chunks (4500 tokens each) and summarizing each chunk.
     print("[LOG] Step 4: Splitting text into chunks (4500 tokens each). Summarizing each chunk.")
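
The new Step 3 calls truncate_text_tokens, whose definition sits outside this hunk. A minimal sketch of what such a helper could look like, assuming a tiktoken-based token count (the cl100k_base encoding and the function body are assumptions, not the file's actual implementation):

# Hypothetical sketch of the truncate_text_tokens helper used in Step 3.
# Assumes tiktoken is available; "cl100k_base" is an assumed encoding choice.
import tiktoken

def truncate_text_tokens(text: str, max_tokens: int = 12000) -> str:
    """Return text truncated to at most max_tokens tokens."""
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    if len(tokens) <= max_tokens:
        return text
    # Decode only the first max_tokens tokens back into a string.
    return enc.decode(tokens[:max_tokens])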
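
Step 4 then splits the capped text into 4500-token chunks for per-chunk summarization. A hypothetical sketch of that split under the same tiktoken assumption (chunk_text_tokens is an invented name; the real helper in utils.py is not shown in this diff):

# Hypothetical sketch of the Step 4 chunking; name and encoding are assumptions.
import tiktoken

def chunk_text_tokens(text: str, chunk_tokens: int = 4500) -> list[str]:
    """Split text into consecutive chunks of at most chunk_tokens tokens."""
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    return [
        enc.decode(tokens[i:i + chunk_tokens])
        for i in range(0, len(tokens), chunk_tokens)
    ]

# With the 12,000-token cap from Step 3, this yields at most 3 chunks, so one
# summary call per chunk plus one final merge stays well under the docstring's
# "no more than 10" LLM-call budget.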