Increase timeout for URL requests in crawl functions to enhance reliability
Files changed:
- controllers/utils.py +1 -1
- source/csrc.py +3 -3
- source/eastmoney.py +2 -2
- source/gov.py +2 -2
- source/mof.py +2 -2
- source/mofcom.py +1 -1
- source/ndrc.py +1 -1
- source/safe.py +2 -2
- source/stats.py +1 -1
controllers/utils.py
CHANGED
@@ -654,7 +654,7 @@ def crawl_by_url(url, article):
 
     """
     domain = '.'.join(urlparse(url).netloc.split('.')[1:])
-    req = urllib.request.urlopen(url, timeout=
+    req = urllib.request.urlopen(url, timeout=60)
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
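Every hunk in this commit applies the same one-line pattern: pass an explicit timeout to urllib.request.urlopen so a stalled host fails fast instead of hanging the crawl indefinitely. A minimal sketch of the pattern (the URL is a placeholder, not taken from the repo):

import urllib.request

# Without a timeout, urlopen can block indefinitely on an unresponsive
# server; with one, a stalled request raises instead and can be retried.
# The URL below is a placeholder for illustration only.
with urllib.request.urlopen("https://example.com", timeout=60) as resp:
    html_text = resp.read().decode("utf-8")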
source/csrc.py
CHANGED
@@ -47,7 +47,7 @@ def crawl(delta):
         category_url,
         headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
     )
-    response = urllib.request.urlopen(req, timeout=
+    response = urllib.request.urlopen(req, timeout=60)
     text = response.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
@@ -70,11 +70,11 @@ def crawl(delta):
                 article = {}
                 url = "http://www.csrc.gov.cn" + url
                 article['category'] = "Policy Interpretation"
-                logger.info(
+                logger.info("Processing article URL: %s", url)
                 crawl_by_url(url, article)
             except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
-        except (urllib.error.URLError, etree.XMLSyntaxError, ValueError) as error:
+        except (urllib.error.URLError, etree.XMLSyntaxError, ValueError, TimeoutError) as error:
             i = -1
             logger.error(error)
 
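The broadened outer except matters because of how urlopen surfaces timeouts. A sketch of the two failure paths, assuming Python 3.10+ (where socket.timeout is an alias of TimeoutError):

import urllib.error
import urllib.request

# A timeout while reading the body raises TimeoutError directly; a timeout
# during connection setup is wrapped in urllib.error.URLError, which the
# existing clause already caught. (On Python < 3.10, catch socket.timeout.)
try:
    with urllib.request.urlopen("http://www.csrc.gov.cn", timeout=60) as resp:
        body = resp.read()
except TimeoutError as err:
    print("read timeout:", err)
except urllib.error.URLError as err:
    print("connect error:", err.reason)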
source/eastmoney.py
CHANGED
@@ -47,10 +47,10 @@ def _crawl(url, article, retries=3):
     domain = urlparse(url).netloc
     for attempt in range(retries):
         try:
-            req = urllib.request.urlopen(url, timeout=
+            req = urllib.request.urlopen(url, timeout=60)
             text = req.read()
             break
-        except IncompleteRead as e:
+        except (IncompleteRead, TimeoutError) as e:
             if attempt == retries - 1:
                 time.sleep(1)  # Wait before retrying
                 continue
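source/eastmoney.py already wraps its fetch in a bounded retry loop, so the diff only adds TimeoutError to the retryable set. The same idea as a self-contained helper (the name fetch_with_retry and the back-off are illustrative, not part of the repo):

import time
import urllib.request
from http.client import IncompleteRead

def fetch_with_retry(url, timeout=60, retries=3):
    """Hypothetical helper: retry transient failures (truncated reads,
    timeouts) a fixed number of times, then re-raise."""
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as resp:
                return resp.read()
        except (IncompleteRead, TimeoutError):
            if attempt == retries - 1:
                raise
            time.sleep(1)  # brief pause before the next attempt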
source/gov.py
CHANGED
@@ -28,7 +28,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
@@ -61,7 +61,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
source/mof.py
CHANGED
@@ -28,7 +28,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
@@ -65,7 +65,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
source/mofcom.py
CHANGED
@@ -31,7 +31,7 @@ def crawl(delta):
         url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
         i = i + 1
         try:
-            req = urllib.request.urlopen(url, timeout=
+            req = urllib.request.urlopen(url, timeout=60)
             text = req.read()
             html_text = text.decode("utf-8")
             page = etree.HTML(html_text)
source/ndrc.py
CHANGED
@@ -31,7 +31,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
source/safe.py
CHANGED
@@ -29,7 +29,7 @@ def crawl(delta):
         category_url = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
         i = i + 1
         try:
-            req = urllib.request.urlopen(category_url, timeout=
+            req = urllib.request.urlopen(category_url, timeout=60)
             text = req.read()
             html_text = text.decode("utf-8")
             page = etree.HTML(html_text)
@@ -65,7 +65,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
source/stats.py
CHANGED
@@ -36,7 +36,7 @@ def crawl(delta):
         retries = 3
         while retries > 0:
             try:
-                req = urllib.request.urlopen(category_url, timeout=
+                req = urllib.request.urlopen(category_url, timeout=60)
                 text = req.read()
                 html_text = text.decode("utf-8")
                 page = etree.HTML(html_text)
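source/stats.py drives its retries with a countdown (while retries > 0) rather than eastmoney.py's for loop; the decrement and the exception arm sit outside this hunk, so the sketch below fills them in with assumed details:

import urllib.request

category_url = "https://example.com"  # placeholder, not from the repo
retries = 3
while retries > 0:
    try:
        req = urllib.request.urlopen(category_url, timeout=60)
        text = req.read()
        break
    except TimeoutError:
        # Assumed handler: the real decrement/except logic is outside the hunk.
        retries -= 1
        if retries == 0:
            raise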