"""
This script is used to crawl and collect data from the website of the China Securities Regulatory Commission (CSRC).
It retrieves policy interpretation articles and financial news articles from the CSRC website.
The collected data is then processed and stored in a database.
The script consists of two main parts:
1. Crawl and process policy interpretation articles from the CSRC website.
2. Crawl and process financial news articles from the CSRC website.
The script uses various libraries and functions to handle web scraping, data processing, and database operations.
Note: This script assumes the presence of the following dependencies:
- urllib
- lxml
- json
- datetime
- time
- utils (custom module)
Please make sure to install these dependencies before running the script.
"""
import uuid
import json
import time
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, sentiment_computation, upsert_content, fetch_url, crawl
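
# The helpers imported from utils are not defined in this file; the interfaces
# below are inferred from their call sites in this script (see utils for the
# actual definitions):
# - encode(elements)            -> text content of a list of lxml elements
# - translate(text)             -> English translation of Chinese text
# - sentiment_computation(text) -> (sentiment_score, sentiment_label) tuple
# - upsert_content(article)     -> insert or update an article record in the database
# - fetch_url(url)              -> response body of url as a string
# - crawl(url, article)         -> fetch url, populate article, and store it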
# Part 1: Crawl and process policy interpretation articles.
# Paginate through the category listing; i == -1 is the sentinel that stops
# the loop once an article older than ~6 months (183 days) is encountered.
i = 1
while i > -1:
    if i == 1:
        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
    else:
        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
    for item in articlelist:
        if isinstance(item, etree._Element):
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            date = encode(subpage.xpath("//span[@class='date']"))
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # article is outside the crawl window; stop paginating
            else:
                urls = subpage.xpath("//a/@href")
                for url in urls:
                    try:
                        article = {}
                        url = "http://www.csrc.gov.cn" + url
                        article['category'] = "Policy Interpretation"
                        crawl(url, article)
                    except Exception as error:
                        print(error)
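
# Part 2: Crawl and process financial news articles via the CSRC search API,
# which returns JSON (_isJson=true) in pages of 18 results (_pageSize=18).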
i = 1
while i > -1:
    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
    i = i + 1
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
    for article in reportinfo['data']['results']:
        try:
            parsed_datetime = datetime.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # article is outside the crawl window; stop paginating
            else:
                article['category'] = "Financial News"
                article['site'] = "Securities Regulatory Commission of China"
                article['originSite'] = "证监会"
                article['titleCN'] = article['title']
                article['title'] = translate(article['titleCN'])
                article['author'] = ''
                # repr()[1:-1] keeps escape sequences literal in the stored text.
                article['contentCN'] = repr(article['content'])[1:-1].strip()
                if len(article['contentCN']) < 10:
                    continue  # skip near-empty articles
                # Translate sentence by sentence, splitting on the Chinese full stop.
                CONTENT_ENG = ''
                for element in article['contentCN'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['subtitle'] = article['memo']
                article['publishDate'] = parsed_datetime.strftime("%Y-%m-%d")
                article['link'] = article['url']
                article['attachment'] = ""
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                upsert_content(article)
        except Exception as error:
            print(error)
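
# Note: uuid.uuid5 is deterministic, so re-crawling the same article produces
# the same id, letting upsert_content update the existing record instead of
# inserting a duplicate (assuming the database keys records on 'id').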