Muhammad Abdur Rahman Saad committed on
Commit 7db09ee · 1 Parent(s): 497072d

add exponential backoff logic for persistent network issues

Files changed (2)
  1. source/safe.py +21 -9
  2. source/stats.py +11 -5
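
Both files replace a single try/except fetch with the same retry pattern: exponential backoff with jitter, i.e. wait backoff_factor ** attempt seconds plus a random fraction of a second between failed attempts, and give up after max_retries. A minimal standalone sketch of that loop, assuming the constants from the diff (the fetch_page name and return convention are illustrative, not part of this commit):

    import random
    import time
    import urllib.error
    import urllib.request

    def fetch_page(url, max_retries=5, backoff_factor=2, timeout=60):
        """Fetch url, retrying with exponential backoff plus jitter.

        Returns the decoded body, or None once all retries are exhausted.
        """
        retries = max_retries
        while retries > 0:
            try:
                return urllib.request.urlopen(url, timeout=timeout).read().decode("utf-8")
            except (urllib.error.URLError, TimeoutError):
                retries -= 1
                if retries > 0:
                    # Waits grow as 2, 4, 8, 16 seconds; the jitter term keeps
                    # concurrent crawlers from retrying in lockstep.
                    time.sleep(backoff_factor ** (max_retries - retries) + random.uniform(0, 1))
        return None
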
source/safe.py CHANGED
@@ -2,6 +2,7 @@
 import time
 import urllib.request
 from datetime import datetime, timedelta
+import random
 
 from lxml import etree
 from prefect import task, get_run_logger
@@ -28,15 +29,26 @@ def crawl(delta):
         else:
             category_url = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
         i = i + 1
-        try:
-            req = urllib.request.urlopen(category_url, timeout=60)
-            text = req.read()
-            html_text = text.decode("utf-8")
-            page = etree.HTML(html_text)
-            articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
-        except (urllib.error.URLError, TimeoutError) as error:
-            logger.error(error)
-            continue
+        max_retries = 5
+        backoff_factor = 2
+        retries = max_retries
+        while retries > 0:
+            try:
+                req = urllib.request.urlopen(category_url, timeout=120)  # Increased timeout
+                text = req.read()
+                html_text = text.decode("utf-8")
+                page = etree.HTML(html_text)
+                articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+                break  # Success, exit retry loop
+            except (urllib.error.URLError, TimeoutError) as error:
+                logger.error(f"Network error: {error}. Retries left: {retries-1}")
+                retries -= 1
+                if retries > 0:
+                    sleep_time = backoff_factor ** (max_retries - retries) + random.uniform(0, 1)
+                    time.sleep(sleep_time)
+                else:
+                    logger.error(f"Failed to fetch {category_url} after {max_retries} attempts.")
+                    articlelist = []
         for article in articlelist:
             if isinstance(article, etree._Element):
                 subelement = etree.tostring(article).decode()
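
With max_retries = 5 and backoff_factor = 2, a persistently failing URL is attempted five times, with waits of roughly 2, 4, 8 and 16 seconds (plus up to one second of jitter) between attempts. Two other changes ride along in safe.py: the per-request timeout is doubled from 60 to 120 seconds, and articlelist is set to [] once retries run out, so the for loop below falls through to the next page instead of raising UnboundLocalError on an undefined name.
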
source/stats.py CHANGED
@@ -3,6 +3,7 @@ import time
 import urllib.request
 import http.client
 from datetime import datetime, timedelta
+import random
 
 from lxml import etree
 from prefect import task, get_run_logger
@@ -33,21 +34,26 @@ def crawl(delta):
         else:
             category_url = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
         i = i + 1
-        retries = 3
+        max_retries = 5
+        backoff_factor = 2
+        retries = max_retries
         while retries > 0:
             try:
                 req = urllib.request.urlopen(category_url, timeout=60)
-                retries -= 1
                 text = req.read()
                 html_text = text.decode("utf-8")
                 page = etree.HTML(html_text)
                 articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+                break  # Success, exit retry loop
             except (urllib.error.URLError, http.client.IncompleteRead, TimeoutError) as error:
-                logger.info(error)
+                logger.info(f"Network error: {error}. Retries left: {retries-1}")
+                retries -= 1
                 if retries > 0:
-                    time.sleep(5)  # Wait for 5 seconds before retrying
+                    sleep_time = backoff_factor ** (max_retries - retries) + random.uniform(0, 1)
+                    time.sleep(sleep_time)
                 else:
-                    continue  # Skip to the next URL after retries are exhausted
+                    logger.error(f"Failed to fetch {category_url} after {max_retries} attempts.")
+                    articlelist = []  # Prevents UnboundLocalError
         for article in articlelist:
            if isinstance(article, etree._Element):
                subelement = etree.tostring(article).decode()
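
The stats.py hunk also fixes two pre-existing bugs in its retry loop: retries was decremented inside the try block only after a successful urlopen, so every page was re-fetched until the counter hit zero even on success, while a persistently failing URL never decremented it at all and spun forever on the flat 5-second sleep. Moving the decrement into the except block and breaking out on success leaves both crawlers with an identical loop; a natural follow-up (not part of this commit) would be to hoist it into a shared helper along the lines of the sketch above, parameterized by the exception tuple, XPath, and timeout.
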