OxbridgeEconomics committed on
Commit
efcd6b8
·
1 Parent(s): 214d268
Files changed (1) hide show
  1. daily.py +32 -28
daily.py CHANGED
@@ -62,34 +62,38 @@ while i > -1:
62
  print("csrc.gov.cn")
63
  i = 1
64
  while i > -1:
65
- if i == 1:
66
- CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
67
- else:
68
- CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
69
- i = i + 1
70
- req = urllib.request.urlopen(CATEGORY_URL)
71
- text = req.read()
72
- html_text = text.decode("utf-8")
73
- page = etree.HTML(html_text)
74
- articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
75
- for article in articlelist:
76
- if isinstance(article, etree._Element):
77
- subelement = etree.tostring(article).decode()
78
- subpage = etree.HTML(subelement)
79
- date = encode(subpage.xpath("//span[@class='date']"))
80
- parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
81
- if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
82
- i = -1
83
- else:
84
- urls = subpage.xpath("//a/@href")
85
- for url in urls:
86
- try:
87
- article = {}
88
- url = "http://www.csrc.gov.cn" + url
89
- article['category']= "Policy Interpretation"
90
- crawl(url, article)
91
- except Exception as error:
92
- print(error)
 
 
 
 
93
 
94
  i = 1
95
  while i > -1:
 
62
  print("csrc.gov.cn")
63
  i = 1
64
  while i > -1:
65
+ try:
66
+ if i == 1:
67
+ CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
68
+ else:
69
+ CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
70
+ i = i + 1
71
+ req = urllib.request.urlopen(CATEGORY_URL)
72
+ text = req.read()
73
+ html_text = text.decode("utf-8")
74
+ page = etree.HTML(html_text)
75
+ articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
76
+ for article in articlelist:
77
+ if isinstance(article, etree._Element):
78
+ subelement = etree.tostring(article).decode()
79
+ subpage = etree.HTML(subelement)
80
+ date = encode(subpage.xpath("//span[@class='date']"))
81
+ parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
82
+ if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
83
+ i = -1
84
+ else:
85
+ urls = subpage.xpath("//a/@href")
86
+ for url in urls:
87
+ try:
88
+ article = {}
89
+ url = "http://www.csrc.gov.cn" + url
90
+ article['category']= "Policy Interpretation"
91
+ crawl(url, article)
92
+ except Exception as error:
93
+ print(error)
94
+ except Exception as error:
95
+ i = -1
96
+ print(error)
97
 
98
  i = 1
99
  while i > -1: