Commit 6c83b94 · fikird committed
Parent: edb4444

Add rate limiting and retry logic for DuckDuckGo search

Files changed (1): search_engine.py (+51 -7)
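In outline: __init__ gains a 2.0 s request_delay, a max_retries of 3, and an initialize_search() helper that retries DDGS() construction with a randomized 1-3 s pause between attempts; process_url() adds 0.5-1.5 s of random jitter on top of the base delay between HTTP requests; and search() wraps the ddgs.text() call in a retry loop that logs each failure, sleeps 2-5 s, re-initializes the client before trying again, and also paces itself between individual results and between processed URLs.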
search_engine.py CHANGED
@@ -8,6 +8,10 @@ import time
 import json
 import os
 from urllib.parse import urlparse
+import logging
+import random
+
+logger = logging.getLogger(__name__)
 
 class ModelManager:
     """Manages different AI models for specific tasks"""
@@ -63,9 +67,22 @@ class WebSearchEngine:
     def __init__(self):
         self.processor = ContentProcessor()
         self.session = requests.Session()
-        self.request_delay =
+        self.request_delay = 2.0  # Increased delay between requests
         self.last_request_time = 0
-        self.
+        self.max_retries = 3
+        self.ddgs = None
+        self.initialize_search()
+
+    def initialize_search(self):
+        """Initialize DuckDuckGo search with retries"""
+        for _ in range(self.max_retries):
+            try:
+                self.ddgs = DDGS()
+                return
+            except Exception as e:
+                logger.error(f"Error initializing DDGS: {str(e)}")
+                time.sleep(random.uniform(1, 3))
+        raise Exception("Failed to initialize DuckDuckGo search after multiple attempts")
 
     def is_valid_url(self, url: str) -> bool:
         """Check if URL is valid for crawling"""
@@ -92,11 +109,12 @@
             return {'error': f"Invalid URL: {url}"}
 
         try:
-            # Rate limiting
+            # Rate limiting with random delay
            current_time = time.time()
             time_since_last = current_time - self.last_request_time
             if time_since_last < self.request_delay:
-
+                delay = self.request_delay - time_since_last + random.uniform(0.5, 1.5)
+                time.sleep(delay)
 
             response = self.session.get(url, timeout=10)
             self.last_request_time = time.time()
@@ -134,10 +152,34 @@
     def search(self, query: str, max_results: int = 5) -> Dict:
         """Perform search and process results"""
         try:
-            #
+            # Initialize search if needed
+            if self.ddgs is None:
+                self.initialize_search()
+
+            # Add delay before search
+            time.sleep(random.uniform(1, 2))
+
+            # Search using DuckDuckGo with retries
             search_results = []
-
-
+            retry_count = 0
+
+            while retry_count < self.max_retries:
+                try:
+                    for result in self.ddgs.text(query, max_results=max_results):
+                        search_results.append(result)
+                        # Add small delay between results
+                        time.sleep(random.uniform(0.2, 0.5))
+                    break
+                except Exception as e:
+                    retry_count += 1
+                    if retry_count >= self.max_retries:
+                        return {'error': f"Search failed after {self.max_retries} attempts: {str(e)}"}
+                    logger.warning(f"Search attempt {retry_count} failed: {str(e)}")
+                    time.sleep(random.uniform(2, 5))
+                    self.initialize_search()
+
+            if not search_results:
+                return {'error': 'No results found'}
 
             results = []
             for result in search_results:
@@ -145,6 +187,8 @@
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
+                    # Add delay between processing URLs
+                    time.sleep(random.uniform(0.5, 1.0))
 
             # Generate insights from results
             all_content = " ".join([r['summary'] for r in results if 'summary' in r])
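The retry-and-reinitialize pattern in search() is easy to exercise in isolation. Below is a minimal, self-contained sketch of that pattern, not the Space's actual code: search_with_retries, do_search, reinitialize, and flaky_search are hypothetical names standing in for the commit's DDGS().text(...) call and initialize_search(), and the stub merely simulates a backend that rate-limits twice before succeeding.

import logging
import random
import time
from typing import Callable, List

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def search_with_retries(
    do_search: Callable[[], List[dict]],
    reinitialize: Callable[[], None],
    max_retries: int = 3,
) -> List[dict]:
    """Retry a flaky search call, rebuilding the client between attempts.

    Mirrors the commit's loop: on failure, log a warning, sleep with
    jitter, re-initialize the client, and try again; after max_retries
    failed attempts, re-raise the last error.
    """
    for attempt in range(1, max_retries + 1):
        try:
            return do_search()
        except Exception as exc:
            if attempt == max_retries:
                raise
            logger.warning("Search attempt %d failed: %s", attempt, exc)
            time.sleep(random.uniform(2, 5))  # jittered backoff, as in the commit
            reinitialize()
    return []  # unreachable; keeps type checkers happy

# Purely illustrative stub: fails twice with a simulated rate limit, then succeeds.
calls = {"n": 0}

def flaky_search() -> List[dict]:
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("simulated rate limit")
    return [{"title": "ok", "link": "https://example.com"}]

print(search_with_retries(flaky_search, reinitialize=lambda: None))

Recreating the client on failure is the load-bearing part of the commit: a fresh DDGS() brings a fresh underlying HTTP session, and the randomized sleeps keep retries from arriving at the endpoint in lockstep.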