gavinzli committed
Commit 9939b16 · 1 Parent(s): 94ba329

Implement retry mechanism for URL requests in crawl function to enhance reliability

Files changed (1)
  1. source/stats.py +15 -9
source/stats.py CHANGED
@@ -33,15 +33,21 @@ def crawl(delta):
         else:
             category_url = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
         i = i + 1
-        try:
-            req = urllib.request.urlopen(category_url)
-            text = req.read()
-            html_text = text.decode("utf-8")
-            page = etree.HTML(html_text)
-            articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
-        except (urllib.error.URLError, http.client.IncompleteRead) as error:
-            logger.info(error)
-            continue
+        retries = 3
+        while retries > 0:
+            try:
+                req = urllib.request.urlopen(category_url)
+                text = req.read()
+                html_text = text.decode("utf-8")
+                page = etree.HTML(html_text)
+                articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+            except (urllib.error.URLError, http.client.IncompleteRead) as error:
+                logger.info(error)
+                retries -= 1
+                if retries > 0:
+                    time.sleep(5)  # Wait for 5 seconds before retrying
+                else:
+                    continue  # Skip to the next URL after retries are exhausted
         for article in articlelist:
             if isinstance(article, etree._Element):
                 subelement = etree.tostring(article).decode()
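Note that the committed loop has no exit on success: when urlopen succeeds, nothing breaks out of the while, and the trailing continue only restarts the inner while rather than skipping to the next URL. Below is a minimal sketch of an exit-on-success variant of the same retry steps; fetch_article_list is a hypothetical helper name and is not part of the commit, and the imports and logger setup are assumed to match stats.py.

import http.client
import logging
import time
import urllib.error
import urllib.request

from lxml import etree

logger = logging.getLogger(__name__)

# Hypothetical helper (not in the commit): same request/parse/retry steps,
# but returning on the first successful fetch so the loop cannot spin forever.
def fetch_article_list(category_url, retries=3, delay=5):
    while retries > 0:
        try:
            req = urllib.request.urlopen(category_url)
            html_text = req.read().decode("utf-8")
            page = etree.HTML(html_text)
            # Success: return the parsed <li> nodes and stop retrying.
            return page.xpath("//div[contains(@class, 'list-content')]/ul/li")
        except (urllib.error.URLError, http.client.IncompleteRead) as error:
            logger.info(error)
            retries -= 1
            if retries > 0:
                time.sleep(delay)  # wait before the next attempt
    return None  # retries exhausted; caller decides to skip this URL

At the call site in crawl(), articlelist = fetch_article_list(category_url) followed by a check such as "if articlelist is None: continue" would skip the URL once retries are exhausted, the way the pre-commit except branch did.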