OxbridgeEconomics committed on
Commit
e1d71ff
·
1 Parent(s): 9f429a0
Files changed (1) hide show
  1. daily.py +31 -26
daily.py CHANGED
@@ -354,32 +354,37 @@ for category in categories:
354
  else:
355
  URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
356
  i = i + 1
357
- req = urllib.request.urlopen(URL)
358
- text = req.read()
359
- html_text = text.decode("utf-8")
360
- page = etree.HTML(html_text)
361
- articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
362
- for article in articlelist:
363
- if isinstance(article, etree._Element):
364
- subelement = etree.tostring(article).decode()
365
- subpage = etree.HTML(subelement)
366
- date = subpage.xpath("//span/text()")[0]
367
- parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
368
- if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
369
- i = -1
370
- else:
371
- urls = subpage.xpath("//a/@href")
372
- for url in urls:
373
- try:
374
- article = {}
375
- if '/article/zcjd' in url:
376
- url = "http://www.mofcom.gov.cn" + url
377
- article['category']= "Policy Interpretation"
378
- else:
379
- article['category']= "Policy Release"
380
- crawl(url, article)
381
- except Exception as error:
382
- print(error)
 
 
 
 
 
383
 
384
  print("ndrc.gov.cn")
385
  i = 0
 
354
  else:
355
  URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
356
  i = i + 1
357
+ try:
358
+ req = urllib.request.urlopen(URL)
359
+ text = req.read()
360
+ html_text = text.decode("utf-8")
361
+ page = etree.HTML(html_text)
362
+ articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
363
+ for article in articlelist:
364
+ if isinstance(article, etree._Element):
365
+ subelement = etree.tostring(article).decode()
366
+ subpage = etree.HTML(subelement)
367
+ date = subpage.xpath("//span/text()")[0]
368
+ parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
369
+ if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
370
+ i = -1
371
+ else:
372
+ urls = subpage.xpath("//a/@href")
373
+ for url in urls:
374
+ try:
375
+ article = {}
376
+ if '/article/zcjd' in url:
377
+ url = "http://www.mofcom.gov.cn" + url
378
+ article['category']= "Policy Interpretation"
379
+ else:
380
+ article['category']= "Policy Release"
381
+ crawl(url, article)
382
+ except Exception as error:
383
+ print(error)
384
+ except Exception as error:
385
+ i = -1
386
+ print(error)
387
+
388
 
389
  print("ndrc.gov.cn")
390
  i = 0