Implement retry mechanism for URL requests in crawl function to enhance reliability
source/stats.py CHANGED: +15 -9
@@ -33,15 +33,21 @@ def crawl(delta):
         else:
             category_url = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
         i = i + 1
-
-
-
-
-
-
-
-
-
+        retries = 3
+        while retries > 0:
+            try:
+                req = urllib.request.urlopen(category_url)
+                text = req.read()
+                html_text = text.decode("utf-8")
+                page = etree.HTML(html_text)
+                articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+            except (urllib.error.URLError, http.client.IncompleteRead) as error:
+                logger.info(error)
+                retries -= 1
+                if retries > 0:
+                    time.sleep(5)  # Wait for 5 seconds before retrying
+                else:
+                    continue  # Skip to the next URL after retries are exhausted
         for article in articlelist:
             if isinstance(article, etree._Element):
                 subelement = etree.tostring(article).decode()
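
Review note on the added retry loop: on a successful fetch nothing breaks out of the `while` and `retries` is never decremented, so the same page keeps being re-requested; and the `continue` in the exhausted branch re-tests the `while` condition rather than skipping to the next URL, so control still falls through to `for article in articlelist` with `articlelist` possibly unset. Below is a minimal sketch of one way to restructure this, assuming the module-level imports and logger that stats.py presumably already has; `fetch_articles` is a hypothetical helper, not a function from this commit:

import http.client
import logging
import time
import urllib.error
import urllib.request

from lxml import etree

logger = logging.getLogger(__name__)

def fetch_articles(category_url, retries=3, delay=5):
    """Fetch one category page with retries; return the article <li> nodes,
    or None once every attempt has failed. (Hypothetical helper, a sketch only.)"""
    while retries > 0:
        try:
            req = urllib.request.urlopen(category_url)
            html_text = req.read().decode("utf-8")
            page = etree.HTML(html_text)
            return page.xpath("//div[contains(@class, 'list-content')]/ul/li")
        except (urllib.error.URLError, http.client.IncompleteRead) as error:
            logger.info(error)
            retries -= 1
            if retries > 0:
                time.sleep(delay)  # Back off briefly before the next attempt
    return None

The call site inside crawl would then read `articlelist = fetch_articles(category_url)` followed by `if articlelist is None: continue`, so that the `continue` genuinely advances the outer loop over category URLs instead of the retry loop.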
|