Spaces:

Oxbridge-Economics
/

Data-Collection-China

Build error

App Files Files Community

Data-Collection-China / mofcom.py

gavinzli

chore: Add script descriptions and improve code readability

39fe3d1 11 months ago

raw

history blame

2.12 kB

	"""
	This script crawls the policy-interpretation listings on the Ministry of Commerce of the People's Republic of China (MOFCOM) website.
	For each listing category it walks the paginated article list, reads every entry's publication date and links, and stops once it reaches entries older than roughly six months (183 days).
	Each collected link is passed, together with its category label, to the 'crawl' function for further processing.
	"""

	import time
	import urllib.request
	from datetime import datetime, timedelta
	from lxml import etree
	from utils import crawl

	# Listing sections under /article/zcjd/ that are crawled, in this order.
	categories = ['jdzhsw', 'jdgnmy', 'jddwmy', 'jdtzhz']


	def _listing_url(category, page_number):
	    """Return the listing URL of *category* for the 1-based *page_number*."""
	    base = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
	    return base if page_number == 1 else f"{base}?{page_number}"


	def _parse_listing_date(item):
	    """Return the publication day of a listing entry as a midnight datetime.

	    The date is read from the first ``<span>`` text inside *item* and must
	    look like ``YYYY-MM-DD HH:MM:SS``. Returns ``None`` when the entry has no
	    span text or the text does not match that format, so a single malformed
	    entry no longer aborts the whole run.
	    """
	    texts = item.xpath(".//span/text()")
	    if not texts:
	        return None
	    try:
	        stamp = datetime.strptime(texts[0].strip(), "%Y-%m-%d %H:%M:%S")
	    except ValueError as error:
	        print(error)
	        return None
	    # Only the calendar day takes part in the age comparison.
	    return stamp.replace(hour=0, minute=0, second=0)


	def _crawl_links(item):
	    """Pass every link found in one listing entry to ``crawl``."""
	    for url in item.xpath(".//a/@href"):
	        try:
	            article = {}
	            if '/article/zcjd' in url:
	                # Site-relative link: prefix the host to make it absolute.
	                url = "http://www.mofcom.gov.cn" + url
	                article['category'] = "Policy Interpretation"
	            else:
	                article['category'] = "Policy Release"
	            crawl(url, article)
	        except Exception as error:
	            # Best effort: one failing article must not stop the crawl.
	            print(error)


	def _crawl_category(category, cutoff):
	    """Crawl the listing pages of *category* until entries predate *cutoff*.

	    Pages are fetched in order starting at page 1. Every entry dated on or
	    after *cutoff* has its links handed to ``crawl``. Paging stops after the
	    first page that contains an entry older than *cutoff*; the remaining
	    entries of that page are still examined. Paging also stops when a page
	    yields no entry with a readable date, which would otherwise loop forever.
	    """
	    page_number = 1
	    while True:
	        # The context manager closes the HTTP response once it has been read.
	        with urllib.request.urlopen(_listing_url(category, page_number)) as response:
	            html_text = response.read().decode("utf-8")
	        page = etree.HTML(html_text)
	        if page is None:
	            items = []
	        else:
	            items = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")

	        dated_items = 0
	        reached_cutoff = False
	        for item in items:
	            published = _parse_listing_date(item)
	            if published is None:
	                continue
	            dated_items += 1
	            if published < cutoff:
	                reached_cutoff = True
	            else:
	                _crawl_links(item)

	        if reached_cutoff or dated_items == 0:
	            return
	        page_number += 1


	# Runs on import/execution, exactly like the original flat script.
	cutoff = datetime.today() - timedelta(days=183)
	for category in categories:
	    _crawl_category(category, cutoff)