"""
This script is used to crawl and collect policy articles from the official website of the State Council of China (https://www.gov.cn).
The script contains two main functions:
1. crawl(url, article): This function is responsible for crawling a specific policy article given its URL and extracting relevant information such as title, author, content, publish date, etc.
2. main(): This function is the entry point of the script. It iterates over different pages of policy articles and calls the crawl function to collect the information.
Note: The script imports the following modules: datetime, timedelta, time, urllib.request, lxml.etree, and utils (custom module).
"""
from datetime import datetime, timedelta
import urllib.request
from lxml import etree
from utils import crawl
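
# The commented-out block below is an earlier, in-file version of crawl(); the active
# script calls the crawl() imported from utils instead. It is retained for reference:
# it shows the fields the crawler populates (originTitle, title, author, content,
# subtitle, publishDate, link, attachment, sentiment scores, id) before the record
# is upserted.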
# with open('xpath.json', 'r', encoding='UTF-8') as f:
# xpath_dict = json.load(f)
# def crawl(url, article):
# domain = urlparse(url).netloc
# req = urllib.request.urlopen(url)
# text = req.read()
# html_text = text.decode("utf-8")
# page = etree.HTML(html_text)
# originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
# article['originTitle'] = encode(page.xpath(xpath_dict[domain]['title']))
# article['title'] = translate(article['originTitle'])
# article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
# article['originContent'] = repr(originContent)
# if len(article['originContent']) < 10:
# return None
# CONTENT_ENG = ''
# for element in originContent.split("\n"):
# CONTENT_ENG += translate(element) + '\n'
# article['content'] = repr(CONTENT_ENG)
# article['subtitle'] = translate(summary)
# article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime']['format_string'])
# article['link'] = url
# article['attachment'] = ""
# article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n",""))
# article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
# upsert_content(article)
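
# Page through the "Policy Interpretation" listing (home.htm, home_1.htm, home_2.htm, ...)
# and crawl every linked article, stopping once the listed publish dates are more than
# 183 days old.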
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
else:
CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            # Listing dates are formatted YYYY-MM-DD; once an article older than
            # 183 days (about six months) appears, set i to -1 so the outer while
            # loop stops after this page.
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
                    try:
                        article = {}
                        # Links on the listing page are relative ("../..."); rebuild
                        # the absolute URL before crawling.
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            article['category'] = "Policy Interpretation"
                            crawl(url, article)
except Exception as error:
print(error)
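
# Page through the "Policy Release" listing in the same way, with the same 183-day
# cutoff, additionally tagging each article with the originating site before crawling.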
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
else:
CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=183)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
url = url.replace('../', 'https://www.gov.cn/zhengce/')
if "https://www.gov.cn" in url:
                            article['category'] = "Policy Release"
article['originSite'] = "国务院"
article['site'] = "State Council of China"
crawl(url, article)
except Exception as error:
print(error)