""" | |
This script is responsible for collecting data from various websites related to financial and policy information in China. | |
It fetches data from different sources, extracts relevant information, translates it, and updates the content accordingly. | |
The collected data includes policy interpretations, financial news, macroeconomic research, and more. | |
""" | |
import json
import os
import time
import urllib.request
import uuid
from datetime import datetime, timedelta
from urllib.parse import urlparse

from lxml import etree

from glue import glue_job_run
from utils import (crawl, datemodifier, encode, encode_content,
                   extract_from_pdf, extract_reference, fetch_url,
                   sentiment_computation, translate, update_content)
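
# crawl(), translate(), sentiment_computation(), update_content() and the other
# names imported above are project-local helpers from utils.py; their behaviour
# is assumed from how they are used below.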


def crawl_eastmoney(url, article):
    """
    Crawl an EastMoney research report page and fill in the article fields.

    Args:
        url (str): The URL of the report page to crawl.
        article (dict): The report metadata dict, enriched in place.

    Returns:
        None. Returns early if the extracted content is shorter than 10 characters.
    """
    domain = urlparse(url).netloc
    req = urllib.request.urlopen(url)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url
    # Prefer the organisation's short name; fall back to the full name when the
    # short name comes back as the literal string "''".
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['contentCN'] = repr(contentCN)[1:-1].strip()
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"
    if len(article['contentCN']) < 10:
        return None
    CONTENT_ENG = ''
    for element in contentCN.split("\n"):
        CONTENT_ENG += translate(element) + '\n'
    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n", ""))
    extract_reference(article)
    update_content(article)


with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
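# Expected shape of xpath.json, inferred from how xpath_dict is used in this
# script (the keys below are assumptions, one entry per crawled domain):
# {
#     "data.eastmoney.com": {
#         "content": "<xpath of the article body>",
#         "attachment": "<xpath of attachment links>",
#         "datetime_format": "<strptime format of the publish date>"
#     },
#     ...
# }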

DELTA = int(os.environ.get('DELTA') or '1')
print(f"DELTA = {DELTA}")
print("cbirc.gov.cn") | |
i = 1
while i > -1:
    CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
    i = i + 1
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
    for article in reportinfo['data']['rows']:
        try:
            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"))
            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
                article['contentCN'] = repr(contentCN)[1:-1].strip()
                if len(contentCN) < 10:
                    continue
                CONTENT_ENG = ''
                for element in article['contentCN'].split("\n"):
                    CONTENT_ENG += translate(element) + '\n'
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['site'] = "National Financial Regulatory Administration of China"
                article['originSite'] = "国家金融监督管理总局"
                article['titleCN'] = article['docSubtitle']
                article['title'] = translate(article['docSubtitle'])
                article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
                article['category'] = "Policy Interpretation"
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                article['attachment'] = ''
                article['author'] = ''
                article['subtitle'] = translate(summary)
                update_content(article)
        except Exception as error:
            print(error)
print("csrc.gov.cn") | |
i = 1
while i > -1:
    try:
        if i == 1:
            CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
        else:
            CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
        i = i + 1
        req = urllib.request.urlopen(CATEGORY_URL)
        text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
        for article in articlelist:
            if isinstance(article, etree._Element):
                subelement = etree.tostring(article).decode()
                subpage = etree.HTML(subelement)
                date = encode(subpage.xpath("//span[@class='date']"))
                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                    i = -1
                else:
                    urls = subpage.xpath("//a/@href")
                    for url in urls:
                        try:
                            article = {}
                            url = "http://www.csrc.gov.cn" + url
                            article['category'] = "Policy Interpretation"
                            crawl(url, article)
                        except Exception as error:
                            print(error)
    except Exception as error:
        i = -1
        print(error)
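
# Second CSRC pass: the JSON search endpoint already returns the article body,
# so it is translated sentence by sentence (split on the Chinese full stop)
# instead of being fetched with crawl().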
i = 1
while i > -1:
    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
    i = i + 1
    try:
        content = fetch_url(CATEGORY_URL)
        reportinfo = json.loads(content)
        for article in reportinfo['data']['results']:
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                article['category'] = "Financial News"
                article['site'] = "Securities Regulatory Commission of China"
                article['originSite'] = "证监会"
                article['titleCN'] = article['title']
                article['title'] = translate(article['titleCN'])
                article['author'] = ''
                article['contentCN'] = repr(article['content'])[1:-1].strip()
                if len(article['contentCN']) < 10:
                    continue
                CONTENT_ENG = ''
                for element in article['contentCN'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['subtitle'] = article['memo']
                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
                article['link'] = article['url']
                article['attachment'] = ""
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                update_content(article)
    except Exception as error:
        print(error)
print("data.eastmoney.com") | |
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
i = 0
while i > -1:
    URL = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    content = fetch_url(URL)
    if content:
        # The endpoint returns JSONP; strip the callback wrapper before parsing.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: -1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
            for article in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                    crawl_eastmoney(url, article)
                except Exception as error:
                    print(error)
        else:
            i = -1
    else:
        print("Failed to fetch URL:", URL)
print("gov.cn") | |
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            article['category'] = "Policy Interpretation"
                            crawl(url, article)
                    except Exception as error:
                        print(error)
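
# Latest policy releases (zhengce/zuixin); the listing layout matches the
# jiedu pages above, so the same parsing applies.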
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            article['site'] = "State Council of China"
                            crawl(url, article)
                    except Exception as error:
                        print(error)
print("mof.gov.cn") | |
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
    else:
        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                        article['category'] = "Financial News"
                        crawl(url, article)
                    except Exception as error:
                        print(error)
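
# MOF policy interpretations (zhengcejiedu); same listing layout as the
# financial news pages above.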
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
    else:
        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace("./", CATEGORY_URL)
                        article['category'] = "Policy Interpretation"
                        print(url)
                        crawl(url, article)
                    except Exception as error:
                        print(error)
print("mofcom.gov.cn") | |
categories = ['jdzhsw', 'jdgnmy', 'jddwmy', 'jdtzhz']
for category in categories:
    i = 1
    while i > -1:
        if i == 1:
            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
        else:
            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
        i = i + 1
        try:
            req = urllib.request.urlopen(URL)
            text = req.read()
            html_text = text.decode("utf-8")
            page = etree.HTML(html_text)
            articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
            for article in articlelist:
                if isinstance(article, etree._Element):
                    subelement = etree.tostring(article).decode()
                    subpage = etree.HTML(subelement)
                    date = subpage.xpath("//span/text()")[0]
                    parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                        i = -1
                    else:
                        urls = subpage.xpath("//a/@href")
                        for url in urls:
                            try:
                                article = {}
                                if '/article/zcjd' in url:
                                    url = "http://www.mofcom.gov.cn" + url
                                    article['category'] = "Policy Interpretation"
                                else:
                                    article['category'] = "Policy Release"
                                crawl(url, article)
                            except Exception as error:
                                print(error)
        except Exception as error:
            i = -1
            print(error)
print("ndrc.gov.cn") | |
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
    else:
        CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y/%m/%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        if "www.gov.cn" in url:
                            article['category'] = "Policy Release"
                        elif "../../zcfb/" in url:
                            url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
                            article['category'] = "Policy Release"
                        else:
                            url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                            url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                            article['category'] = "Policy Interpretation"
                        crawl(url, article)
                    except Exception as error:
                        print(error)
print("safe.gov.cn") | |
i = 1
while i > -1:
    if i == 1:
        CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
    else:
        CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//dd/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a/@href")
                for url in urls:
                    try:
                        article = {}
                        url = "https://www.safe.gov.cn" + url
                        article['category'] = "Policy Interpretation"
                        crawl(url, article)
                    except Exception as error:
                        print(error)
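
# SAFE data interpretations (sjjd); same listing layout as the policy
# interpretation pages above.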
i = 1
while i > -1:
    if i == 1:
        CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
    else:
        CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//dd/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a/@href")
                for url in urls:
                    try:
                        article = {}
                        url = "https://www.safe.gov.cn" + url
                        article['category'] = "Data Interpretation"
                        crawl(url, article)
                    except Exception as error:
                        print(error)
print("stats.gov.hk") | |
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
    else:
        CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = encode(subpage.xpath("//span"))
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
                        article['category'] = "Data Interpretation"
                        crawl(url, article)
                    except Exception as error:
                        print(error)
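
# Hand off to the downstream job (presumably an AWS Glue job, given the glue
# module) once every source has been crawled.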
glue_job_run()