# Provenance: commit 39fe3d1 by gavinzli — "chore: Add script descriptions
# and improve code readability" (3.81 kB). Web-viewer header converted to a
# comment so the file is valid Python.
"""
This script is used to crawl and collect financial news and policy interpretation articles from the website of the Ministry of Finance of China (https://www.mof.gov.cn/).
The script iterates through the pages of the "Financial News" and "Policy Interpretation" categories on the website and extracts the articles' URLs. It then calls the `crawl` function from the `utils` module to crawl and collect the article data.
The script uses the `lxml` library to parse the HTML content of the website and extract the necessary information.
Note: The script assumes the existence of a `crawl` function in the `utils` module.
"""
import time
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import crawl
# Crawl Financial News articles.
#
# Listing pages are ordered newest-first. Page 0 lives at the category
# root; page N at index_{N}.htm. As soon as an article older than 183
# days (~6 months) is seen, the sentinel `i = -1` ends the pagination
# loop once the current page has been processed.
i = 0
while i > -1:
    if i == 0:
        category_url = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
    else:
        category_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
    i = i + 1
    print(category_url)
    # Fetch the listing page; the context manager closes the connection.
    with urllib.request.urlopen(category_url) as response:
        html_text = response.read().decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            # Parse the YYYY-MM-DD listing date directly; the original
            # round-trip through time.strptime/strftime was redundant.
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # too old: stop paginating after this page
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        # Resolve the site's relative hrefs against the
                        # appropriate base. '../' must be handled before
                        # './' so the longer prefix wins.
                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                        print(url)
                        # Fresh dict per article; don't shadow the loop
                        # element named `article`.
                        article_data = {}
                        article_data['category'] = "Financial News"
                        crawl(url, article_data)
                    except Exception as error:
                        # Best-effort: a failed article must not abort the run.
                        print(error)
# Crawl Policy Interpretation articles.
#
# Same pagination scheme as the Financial News section: page 0 at the
# category root, page N at index_{N}.htm; `i = -1` stops the loop once
# an article older than 183 days is encountered.
i = 0
while i > -1:
    if i == 0:
        category_url = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
    else:
        category_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
    i = i + 1
    print(category_url)
    # Fetch the listing page; the context manager closes the connection.
    with urllib.request.urlopen(category_url) as response:
        html_text = response.read().decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            # Parse the YYYY-MM-DD listing date directly; the original
            # round-trip through time.strptime/strftime was redundant.
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # too old: stop paginating after this page
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        # BUG FIX: the original replaced './' with the full
                        # page URL (e.g. .../index_1.htm), producing broken
                        # links on every page after the first. Resolve
                        # against the category base directory instead,
                        # handling '../' first as in the Financial News
                        # section.
                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/")
                        print(url)
                        # Fresh dict per article; don't shadow the loop
                        # element named `article`.
                        article_data = {}
                        article_data['category'] = "Policy Interpretation"
                        crawl(url, article_data)
                    except Exception as error:
                        # Best-effort: a failed article must not abort the run.
                        print(error)