fikird committed · Commit 6c83b94 · 1 parent: edb4444

Add rate limiting and retry logic for DuckDuckGo search

Files changed (1): search_engine.py +51 -7
search_engine.py CHANGED
@@ -8,6 +8,10 @@ import time
 import json
 import os
 from urllib.parse import urlparse
+import logging
+import random
+
+logger = logging.getLogger(__name__)
 
 class ModelManager:
     """Manages different AI models for specific tasks"""
@@ -63,9 +67,22 @@ class WebSearchEngine:
     def __init__(self):
         self.processor = ContentProcessor()
         self.session = requests.Session()
-        self.request_delay = 1.0
+        self.request_delay = 2.0  # Increased delay between requests
         self.last_request_time = 0
-        self.ddgs = DDGS()
+        self.max_retries = 3
+        self.ddgs = None
+        self.initialize_search()
+
+    def initialize_search(self):
+        """Initialize DuckDuckGo search with retries"""
+        for _ in range(self.max_retries):
+            try:
+                self.ddgs = DDGS()
+                return
+            except Exception as e:
+                logger.error(f"Error initializing DDGS: {str(e)}")
+                time.sleep(random.uniform(1, 3))
+        raise Exception("Failed to initialize DuckDuckGo search after multiple attempts")
 
     def is_valid_url(self, url: str) -> bool:
         """Check if URL is valid for crawling"""
@@ -92,11 +109,12 @@ class WebSearchEngine:
             return {'error': f"Invalid URL: {url}"}
 
         try:
-            # Rate limiting
+            # Rate limiting with random delay
             current_time = time.time()
             time_since_last = current_time - self.last_request_time
             if time_since_last < self.request_delay:
-                time.sleep(self.request_delay - time_since_last)
+                delay = self.request_delay - time_since_last + random.uniform(0.5, 1.5)
+                time.sleep(delay)
 
             response = self.session.get(url, timeout=10)
             self.last_request_time = time.time()
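The arithmetic here is worth spelling out: whenever a request arrives sooner than request_delay after the previous one, the sleep tops up the elapsed time, so the gap is pushed out to exactly request_delay plus 0.5-1.5 s of jitter. Isolated for illustration (the class name and interface are hypothetical):

```python
import random
import time

class JitteredRateLimiter:
    """Stand-alone version of the inline rate limiting in process_url().

    Hypothetical helper; the arithmetic matches the diff. One difference:
    the commit stamps last_request_time after the HTTP response returns,
    so slow responses widen the effective gap.
    """

    def __init__(self, delay: float = 2.0):
        self.delay = delay
        self.last_request_time = 0.0

    def wait(self) -> None:
        elapsed = time.time() - self.last_request_time
        if elapsed < self.delay:
            time.sleep(self.delay - elapsed + random.uniform(0.5, 1.5))
        self.last_request_time = time.time()
```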
@@ -134,10 +152,34 @@ class WebSearchEngine:
     def search(self, query: str, max_results: int = 5) -> Dict:
         """Perform search and process results"""
         try:
-            # Search using DuckDuckGo
+            # Initialize search if needed
+            if self.ddgs is None:
+                self.initialize_search()
+
+            # Add delay before search
+            time.sleep(random.uniform(1, 2))
+
+            # Search using DuckDuckGo with retries
             search_results = []
-            for result in self.ddgs.text(query, max_results=max_results):
-                search_results.append(result)
+            retry_count = 0
+
+            while retry_count < self.max_retries:
+                try:
+                    for result in self.ddgs.text(query, max_results=max_results):
+                        search_results.append(result)
+                        # Add small delay between results
+                        time.sleep(random.uniform(0.2, 0.5))
+                    break
+                except Exception as e:
+                    retry_count += 1
+                    if retry_count >= self.max_retries:
+                        return {'error': f"Search failed after {self.max_retries} attempts: {str(e)}"}
+                    logger.warning(f"Search attempt {retry_count} failed: {str(e)}")
+                    time.sleep(random.uniform(2, 5))
+                    self.initialize_search()
+
+            if not search_results:
+                return {'error': 'No results found'}
 
             results = []
             for result in search_results:
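The retry loop waits a flat random 2-5 s between attempts and re-creates the DDGS client each time. If rate-limit errors persist, exponential backoff with full jitter is the usual refinement; a sketch of what that substitution could look like (backoff_delay is hypothetical, not what the commit ships):

```python
import random

def backoff_delay(attempt: int, base: float = 2.0, cap: float = 30.0) -> float:
    """Full-jitter exponential backoff: a wait drawn from
    [0, min(cap, base * 2**attempt)], attempt zero-based.
    Hypothetical alternative to the commit's random.uniform(2, 5)."""
    return random.uniform(0, min(cap, base * 2 ** attempt))

# Possible swap inside the except branch:
#     time.sleep(backoff_delay(retry_count - 1))
```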
@@ -145,6 +187,8 @@ class WebSearchEngine:
             processed = self.process_url(result['link'])
             if 'error' not in processed:
                 results.append(processed)
+                # Add delay between processing URLs
+                time.sleep(random.uniform(0.5, 1.0))
 
             # Generate insights from results
             all_content = " ".join([r['summary'] for r in results if 'summary' in r])
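End to end, callers interact only with search() and can branch on the 'error' key that every failure path in this commit returns. A usage sketch, assuming the module is importable as search_engine (the query string and print handling are illustrative):

```python
# Hypothetical driver; only the 'error' key is guaranteed by the diff,
# the rest of the result shape is not shown in this commit.
from search_engine import WebSearchEngine

engine = WebSearchEngine()  # DDGS() setup is retried in __init__
result = engine.search("python rate limiting", max_results=3)

if 'error' in result:
    print(f"Search failed: {result['error']}")
else:
    print(result)
```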
 