Muhammad Abdur Rahman Saad committed
Commit · 7db09ee
Parent(s): 497072d

add exponential backoff logic for persistent network issues

- source/safe.py +21 -9
- source/stats.py +11 -5
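For reference, the retry loops added below sleep backoff_factor ** n seconds after the n-th consecutive failure, plus up to one second of random jitter so repeated fetches don't retry at the exact same instant. A minimal standalone sketch of that schedule, using the same constants as the diff (the print loop is illustrative, not part of the commit):

import random

max_retries = 5
backoff_factor = 2

# After the n-th consecutive failure (n = 1..4), the crawler sleeps
# roughly 2, 4, 8, then 16 seconds before the next attempt; the fifth
# failure gives up.
for attempt in range(1, max_retries):
    sleep_time = backoff_factor ** attempt + random.uniform(0, 1)
    print(f"after failure {attempt}: sleep ~{sleep_time:.1f}s")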
source/safe.py CHANGED

@@ -2,6 +2,7 @@
 import time
 import urllib.request
 from datetime import datetime, timedelta
+import random
 
 from lxml import etree
 from prefect import task, get_run_logger

@@ -28,15 +29,26 @@ def crawl(delta):
     else:
         category_url = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
     i = i + 1
-
-
-
-
-
-
-
-
-
+    max_retries = 5
+    backoff_factor = 2
+    retries = max_retries
+    while retries > 0:
+        try:
+            req = urllib.request.urlopen(category_url, timeout=120)  # Increased timeout
+            text = req.read()
+            html_text = text.decode("utf-8")
+            page = etree.HTML(html_text)
+            articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+            break  # Success, exit retry loop
+        except (urllib.error.URLError, TimeoutError) as error:
+            logger.error(f"Network error: {error}. Retries left: {retries-1}")
+            retries -= 1
+            if retries > 0:
+                sleep_time = backoff_factor ** (max_retries - retries) + random.uniform(0, 1)
+                time.sleep(sleep_time)
+            else:
+                logger.error(f"Failed to fetch {category_url} after {max_retries} attempts.")
+                articlelist = []
     for article in articlelist:
         if isinstance(article, etree._Element):
             subelement = etree.tostring(article).decode()
source/stats.py CHANGED

@@ -3,6 +3,7 @@ import time
 import urllib.request
 import http.client
 from datetime import datetime, timedelta
+import random
 
 from lxml import etree
 from prefect import task, get_run_logger

@@ -33,21 +34,26 @@ def crawl(delta):
     else:
         category_url = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
     i = i + 1
-
+    max_retries = 5
+    backoff_factor = 2
+    retries = max_retries
     while retries > 0:
         try:
             req = urllib.request.urlopen(category_url, timeout=60)
-            retries -= 1
             text = req.read()
             html_text = text.decode("utf-8")
             page = etree.HTML(html_text)
             articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+            break  # Success, exit retry loop
         except (urllib.error.URLError, http.client.IncompleteRead, TimeoutError) as error:
-            logger.info(error)
+            logger.info(f"Network error: {error}. Retries left: {retries-1}")
+            retries -= 1
             if retries > 0:
-
+                sleep_time = backoff_factor ** (max_retries - retries) + random.uniform(0, 1)
+                time.sleep(sleep_time)
             else:
-
+                logger.error(f"Failed to fetch {category_url} after {max_retries} attempts.")
+                articlelist = []  # Prevents UnboundLocalError
     for article in articlelist:
         if isinstance(article, etree._Element):
             subelement = etree.tostring(article).decode()
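The retry block is now duplicated almost verbatim between safe.py and stats.py (only the timeout and the exception tuple differ). A possible follow-up would be to hoist it into a shared helper; the sketch below assumes the same imports as the diff, and fetch_with_backoff plus its signature are illustrative names, not part of this commit:

import random
import time
import urllib.request
import urllib.error
import http.client

from lxml import etree


def fetch_with_backoff(url, logger, timeout=60, max_retries=5, backoff_factor=2):
    """Fetch url and parse it with lxml, retrying with exponential backoff.

    Returns the parsed tree, or None once max_retries attempts have failed.
    """
    retries = max_retries
    while retries > 0:
        try:
            req = urllib.request.urlopen(url, timeout=timeout)
            html_text = req.read().decode("utf-8")
            return etree.HTML(html_text)  # Success: hand the tree back
        except (urllib.error.URLError, http.client.IncompleteRead, TimeoutError) as error:
            retries -= 1
            logger.error(f"Network error: {error}. Retries left: {retries}")
            if retries > 0:
                # Same schedule as the diff: 2, 4, 8, 16 seconds plus jitter
                time.sleep(backoff_factor ** (max_retries - retries) + random.uniform(0, 1))
    logger.error(f"Failed to fetch {url} after {max_retries} attempts.")
    return None

Each crawler would then call page = fetch_with_backoff(category_url, logger, timeout=120), keep its own xpath() query ('list_conr' vs. 'list-content'), and fall back to an empty articlelist when the helper returns None.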