"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles."""
import json
import logging
import time
import urllib.request
import uuid
from datetime import datetime, timedelta
from prefect import flow
from lxml import etree
from controllers.summarizer import summarize
from controllers.utils import (
crawl_by_url,
encode,
fetch_url,
sentiment_computation,
translate,
update_content,
)


@flow(name="Data Collection - csrc")
def crawl(delta):
"""
Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta.
Args:
delta (int): The number of days in the past from today to retrieve articles.
Returns:
None
Raises:
None
"""
logging.info("csrc.gov.cn")
    # "i" is the 1-based listing page number; setting it to -1 ends the loop.
    i = 1
    while i > -1:
        try:
            # The first listing page has no numeric suffix; later pages do.
            if i == 1:
                category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
            else:
                category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
            i += 1
logging.info(category_url)
            with urllib.request.urlopen(category_url) as req:
                html_text = req.read().decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath(
"//div[contains(@class, 'main-right fr common-list')]/ul/li")
            for article in articlelist:
                if isinstance(article, etree._Element):
                    subelement = etree.tostring(article).decode()
                    subpage = etree.HTML(subelement)
                    date = encode(subpage.xpath("//span[@class='date']"))
                    parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
                    # Listings appear newest first, so the first article older
                    # than the cutoff ends pagination.
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        urls = subpage.xpath("//a/@href")
                        for url in urls:
                            try:
                                # Use a separate dict so the loop variable
                                # "article" (an lxml element) is not clobbered.
                                article_info = {}
                                url = "http://www.csrc.gov.cn" + url
                                article_info['category'] = "Policy Interpretation"
                                logging.info(url)
                                crawl_by_url(url, article_info)
                            except Exception as error:
                                logging.error(error)
except Exception as error:
i = -1
logging.error(error)
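    # Phase 2: page through the CSRC JSON search API and enrich each result
    # with a translation, summary, and sentiment score.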
    i = 1
    while i > -1:
        category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
        i += 1
try:
logging.info(category_url)
content = fetch_url(category_url)
if content is None:
i = -1
else:
reportinfo = json.loads(content)
if len(reportinfo['data']['results']) == 0:
i = -1
logging.info(len(reportinfo['data']['results']))
                for article in reportinfo['data']['results']:
                    # Compare on the date part only; the time of day is
                    # dropped, matching the "%Y-%m-%d" granularity used below.
                    parsed_datetime = datetime.strptime(
                        article['publishedTimeStr'].split(" ")[0], "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
else:
article['category'] = "Financial News"
article['site'] = "Securities Regulatory Commission of China"
article['originSite'] = "证监会"
article['titleCN'] = article['title']
article['title'] = translate(article['titleCN'])
article['author'] = ''
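                        # repr()[1:-1] escapes embedded newlines and quotes;
                        # items shorter than 10 characters are treated as empty.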
article['contentCN'] = repr(article['content'])[1:-1].strip()
if len(article['contentCN']) < 10:
continue
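                        # Translate sentence by sentence, splitting on the
                        # Chinese full stop ("。").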
contenteng = ''
for element in article['contentCN'].split("。"):
contenteng += translate(element) + ' '
article['content'] = repr(contenteng)[1:-1].strip()
article['subtitle'] = summarize(article['content'])
                        article['publishDate'] = parsed_datetime.strftime(
                            "%Y-%m-%d")
article['link'] = article['url']
article['attachment'] = ""
article['sentimentScore'], article[
'sentimentLabel'] = sentiment_computation(article['content'])
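                        # Deterministic ID: the same title and publish date
                        # always map to the same UUID, so a re-crawled article
                        # keeps its identity.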
article['id'] = uuid.uuid5(
uuid.NAMESPACE_OID,
article['titleCN'] + article['publishDate'])
logging.info(article)
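                        # Persisting via update_content is currently disabled.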
# update_content(article)
except Exception as error:
i = -1
logging.error(error)
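

# A minimal local-run sketch (an assumption, not part of the deployed flow):
# it presumes the controllers.* helpers are importable and Prefect is
# configured; the 7-day window is an arbitrary example value.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    crawl(delta=7)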