gavinzli committed on
Commit
cc76656
·
1 Parent(s): fed78ac

Increase timeout for URL requests in crawl functions to enhance reliability

Browse files
controllers/utils.py CHANGED
@@ -654,7 +654,7 @@ def crawl_by_url(url, article):
654
 
655
  """
656
  domain = '.'.join(urlparse(url).netloc.split('.')[1:])
657
- req = urllib.request.urlopen(url, timeout=10)
658
  text = req.read()
659
  html_text = text.decode("utf-8")
660
  page = etree.HTML(html_text)
 
654
 
655
  """
656
  domain = '.'.join(urlparse(url).netloc.split('.')[1:])
657
+ req = urllib.request.urlopen(url, timeout=60)
658
  text = req.read()
659
  html_text = text.decode("utf-8")
660
  page = etree.HTML(html_text)
source/csrc.py CHANGED
@@ -47,7 +47,7 @@ def crawl(delta):
47
  category_url,
48
  headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
49
  )
50
- response = urllib.request.urlopen(req, timeout=10)
51
  text = response.read()
52
  html_text = text.decode("utf-8")
53
  page = etree.HTML(html_text)
@@ -70,11 +70,11 @@ def crawl(delta):
70
  article = {}
71
  url = "http://www.csrc.gov.cn" + url
72
  article['category'] = "Policy Interpretation"
73
- logger.info(f"Processing article URL: {url}")
74
  crawl_by_url(url, article)
75
  except (urllib.error.URLError, etree.XMLSyntaxError) as error:
76
  logger.error(error)
77
- except (urllib.error.URLError, etree.XMLSyntaxError, ValueError) as error:
78
  i = -1
79
  logger.error(error)
80
 
 
47
  category_url,
48
  headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
49
  )
50
+ response = urllib.request.urlopen(req, timeout=60)
51
  text = response.read()
52
  html_text = text.decode("utf-8")
53
  page = etree.HTML(html_text)
 
70
  article = {}
71
  url = "http://www.csrc.gov.cn" + url
72
  article['category'] = "Policy Interpretation"
73
+ logger.info("Processing article URL: %s", url)
74
  crawl_by_url(url, article)
75
  except (urllib.error.URLError, etree.XMLSyntaxError) as error:
76
  logger.error(error)
77
+ except (urllib.error.URLError, etree.XMLSyntaxError, ValueError, TimeoutError) as error:
78
  i = -1
79
  logger.error(error)
80
 
source/eastmoney.py CHANGED
@@ -47,10 +47,10 @@ def _crawl(url, article, retries=3):
47
  domain = urlparse(url).netloc
48
  for attempt in range(retries):
49
  try:
50
- req = urllib.request.urlopen(url, timeout=10)
51
  text = req.read()
52
  break
53
- except IncompleteRead as e:
54
  if attempt == retries - 1:
55
  time.sleep(1) # Wait before retrying
56
  continue
 
47
  domain = urlparse(url).netloc
48
  for attempt in range(retries):
49
  try:
50
+ req = urllib.request.urlopen(url, timeout=60)
51
  text = req.read()
52
  break
53
+ except (IncompleteRead, TimeoutError) as e:
54
  if attempt == retries - 1:
55
  time.sleep(1) # Wait before retrying
56
  continue
source/gov.py CHANGED
@@ -28,7 +28,7 @@ def crawl(delta):
28
  else:
29
  category_url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
30
  i = i + 1
31
- req = urllib.request.urlopen(category_url, timeout=10)
32
  text = req.read()
33
  html_text = text.decode("utf-8")
34
  page = etree.HTML(html_text)
@@ -61,7 +61,7 @@ def crawl(delta):
61
  else:
62
  category_url = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
63
  i = i + 1
64
- req = urllib.request.urlopen(category_url, timeout=10)
65
  text = req.read()
66
  html_text = text.decode("utf-8")
67
  page = etree.HTML(html_text)
 
28
  else:
29
  category_url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
30
  i = i + 1
31
+ req = urllib.request.urlopen(category_url, timeout=60)
32
  text = req.read()
33
  html_text = text.decode("utf-8")
34
  page = etree.HTML(html_text)
 
61
  else:
62
  category_url = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
63
  i = i + 1
64
+ req = urllib.request.urlopen(category_url, timeout=60)
65
  text = req.read()
66
  html_text = text.decode("utf-8")
67
  page = etree.HTML(html_text)
source/mof.py CHANGED
@@ -28,7 +28,7 @@ def crawl(delta):
28
  else:
29
  category_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
30
  i = i + 1
31
- req = urllib.request.urlopen(category_url, timeout=10)
32
  text = req.read()
33
  html_text = text.decode("utf-8")
34
  page = etree.HTML(html_text)
@@ -65,7 +65,7 @@ def crawl(delta):
65
  else:
66
  category_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
67
  i = i + 1
68
- req = urllib.request.urlopen(category_url, timeout=10)
69
  text = req.read()
70
  html_text = text.decode("utf-8")
71
  page = etree.HTML(html_text)
 
28
  else:
29
  category_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
30
  i = i + 1
31
+ req = urllib.request.urlopen(category_url, timeout=60)
32
  text = req.read()
33
  html_text = text.decode("utf-8")
34
  page = etree.HTML(html_text)
 
65
  else:
66
  category_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
67
  i = i + 1
68
+ req = urllib.request.urlopen(category_url, timeout=60)
69
  text = req.read()
70
  html_text = text.decode("utf-8")
71
  page = etree.HTML(html_text)
source/mofcom.py CHANGED
@@ -31,7 +31,7 @@ def crawl(delta):
31
  url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
32
  i = i + 1
33
  try:
34
- req = urllib.request.urlopen(url, timeout=10)
35
  text = req.read()
36
  html_text = text.decode("utf-8")
37
  page = etree.HTML(html_text)
 
31
  url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
32
  i = i + 1
33
  try:
34
+ req = urllib.request.urlopen(url, timeout=60)
35
  text = req.read()
36
  html_text = text.decode("utf-8")
37
  page = etree.HTML(html_text)
source/ndrc.py CHANGED
@@ -31,7 +31,7 @@ def crawl(delta):
31
  else:
32
  category_url = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
33
  i = i + 1
34
- req = urllib.request.urlopen(category_url, timeout=10)
35
  text = req.read()
36
  html_text = text.decode("utf-8")
37
  page = etree.HTML(html_text)
 
31
  else:
32
  category_url = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
33
  i = i + 1
34
+ req = urllib.request.urlopen(category_url, timeout=60)
35
  text = req.read()
36
  html_text = text.decode("utf-8")
37
  page = etree.HTML(html_text)
source/safe.py CHANGED
@@ -29,7 +29,7 @@ def crawl(delta):
29
  category_url = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
30
  i = i + 1
31
  try:
32
- req = urllib.request.urlopen(category_url, timeout=10)
33
  text = req.read()
34
  html_text = text.decode("utf-8")
35
  page = etree.HTML(html_text)
@@ -65,7 +65,7 @@ def crawl(delta):
65
  else:
66
  category_url = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
67
  i = i + 1
68
- req = urllib.request.urlopen(category_url, timeout=10)
69
  text = req.read()
70
  html_text = text.decode("utf-8")
71
  page = etree.HTML(html_text)
 
29
  category_url = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
30
  i = i + 1
31
  try:
32
+ req = urllib.request.urlopen(category_url, timeout=60)
33
  text = req.read()
34
  html_text = text.decode("utf-8")
35
  page = etree.HTML(html_text)
 
65
  else:
66
  category_url = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
67
  i = i + 1
68
+ req = urllib.request.urlopen(category_url, timeout=60)
69
  text = req.read()
70
  html_text = text.decode("utf-8")
71
  page = etree.HTML(html_text)
source/stats.py CHANGED
@@ -36,7 +36,7 @@ def crawl(delta):
36
  retries = 3
37
  while retries > 0:
38
  try:
39
- req = urllib.request.urlopen(category_url, timeout=10)
40
  text = req.read()
41
  html_text = text.decode("utf-8")
42
  page = etree.HTML(html_text)
 
36
  retries = 3
37
  while retries > 0:
38
  try:
39
+ req = urllib.request.urlopen(category_url, timeout=60)
40
  text = req.read()
41
  html_text = text.decode("utf-8")
42
  page = etree.HTML(html_text)