import os
import json
import uuid
import time
import urllib.request

from lxml import etree
from datetime import datetime, timedelta
from urllib.parse import urlparse

from utils import (encode,
                   translate,
                   sentiment_computation,
                   fetch_url,
                   extract_from_pdf,
                   crawl,
                   datemodifier,
                   encode_content,
                   update_content,
                   extract_reference)

with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)

# Only articles published within the last DELTA days are collected.
DELTA = int(os.environ.get('DELTA') or '1')
print(f"DELTA = {DELTA}")

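# cbirc.gov.cn: page through the policy-interpretation JSON feed (pageIndex
# starts at 1), extract each article's text from the linked PDF, translate it,
# and stop once an article is older than DELTA days.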
print("cbirc.gov.cn") |
|
i = 1 |
|
while i > -1: |
|
CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json" |
|
i = i + 1 |
|
content = fetch_url(CATEGORY_URL) |
|
reportinfo = json.loads(content) |
|
for article in reportinfo['data']['rows']: |
|
try: |
|
article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S")) |
|
parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d") |
|
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)): |
|
i = -1 |
|
else: |
|
contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl']) |
|
article['contentCN'] = repr(contentCN)[1:-1].strip() |
|
if len(contentCN) < 10: |
|
continue |
|
CONTENT_ENG = '' |
|
for element in article['contentCN'].split("\n"): |
|
CONTENT_ENG += translate(element) + '\n' |
|
article['content'] = repr(CONTENT_ENG)[1:-1].strip() |
|
article['site'] = "National Financial Regulatory Administration of China" |
|
article['originSite'] = "国家金融监督管理总局" |
|
article['titleCN'] = article['docSubtitle'] |
|
article['title'] = translate(article['docSubtitle']) |
|
article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl']) |
|
article['category']= "Policy Interpretation" |
|
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate']) |
|
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content']) |
|
article['attachment'] = '' |
|
article['author'] = '' |
|
article['subtitle'] = translate(summary) |
|
update_content(article) |
|
except Exception as error: |
|
print(error) |
|
|
|
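# csrc.gov.cn: walk the paginated policy-interpretation list pages, parse each
# list entry with lxml, and crawl every linked article until one is older than
# DELTA days.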
print("csrc.gov.cn") |
|
i = 1 |
|
while i > -1: |
|
try: |
|
if i == 1: |
|
CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml" |
|
else: |
|
CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml" |
|
i = i + 1 |
|
req = urllib.request.urlopen(CATEGORY_URL) |
|
text = req.read() |
|
html_text = text.decode("utf-8") |
|
page = etree.HTML(html_text) |
|
articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li") |
|
for article in articlelist: |
|
if isinstance(article, etree._Element): |
|
subelement = etree.tostring(article).decode() |
|
subpage = etree.HTML(subelement) |
|
date = encode(subpage.xpath("//span[@class='date']")) |
|
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d") |
|
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)): |
|
i = -1 |
|
else: |
|
urls = subpage.xpath("//a/@href") |
|
for url in urls: |
|
try: |
|
article = {} |
|
url = "http://www.csrc.gov.cn" + url |
|
article['category']= "Policy Interpretation" |
|
crawl(url, article) |
|
except Exception as error: |
|
print(error) |
|
except Exception as error: |
|
i = -1 |
|
print(error) |
|
|
|
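# csrc.gov.cn search API: page through the JSON results, translating the title
# and the content sentence by sentence, until an article is older than DELTA days.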
i = 1
while i > -1:
    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
    i = i + 1
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
    for article in reportinfo['data']['results']:
        try:
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                article['category'] = "Financial News"
                article['site'] = "Securities Regulatory Commission of China"
                article['originSite'] = "证监会"
                article['titleCN'] = article['title']
                article['title'] = translate(article['titleCN'])
                article['author'] = ''
                article['contentCN'] = repr(article['content'])[1:-1].strip()
                if len(article['contentCN']) < 10:
                    continue
                CONTENT_ENG = ''
                for element in article['contentCN'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['subtitle'] = article['memo']
                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
                article['link'] = article['url']
                article['attachment'] = ""
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                update_content(article)
        except Exception as error:
            print(error)

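# data.eastmoney.com: crawl a single research-report page, using the XPath
# expressions configured for this domain in xpath.json.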
print("data.eastmoney.com") |
|
def crawl_eastmoney(url, article): |
|
domain = urlparse(url).netloc |
|
req = urllib.request.urlopen(url) |
|
text = req.read() |
|
html_text = text.decode("utf-8") |
|
page = etree.HTML(html_text) |
|
contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content'])) |
|
article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment'])) |
|
article['link'] = url |
|
if article['orgSName'] == "''": |
|
article['site'] = translate(article['orgSName']) |
|
else: |
|
article['site'] = translate(article['orgName']) |
|
article['titleCN'] = article['title'] |
|
article['title'] = translate(article['title']) |
|
article['author'] = translate(article['researcher']) |
|
article['originAuthor'] = article['researcher'] |
|
article['contentCN'] = repr(contentCN)[1:-1].strip() |
|
article['subtitle'] = translate(summary) |
|
article['category'] = "Macroeconomic Research" |
|
if len(article['contentCN']) < 10: |
|
return None |
|
CONTENT_ENG = '' |
|
for element in contentCN.split("\n"): |
|
CONTENT_ENG += translate(element) + '\n' |
|
article['content'] = repr(CONTENT_ENG)[1:-1].strip() |
|
article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author']) |
|
article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format']) |
|
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate']) |
|
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n","")) |
|
extract_reference(article) |
|
update_content(article) |
|
|
|
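# Query the Eastmoney report API page by page for macroeconomic research
# reports published within the last DELTA days; the response is wrapped in a
# JSONP callback that has to be stripped before JSON parsing.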
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
i = 0
while i > -1:
    URL = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    content = fetch_url(URL)
    if content:
        # Strip the JSONP callback wrapper before parsing the JSON payload.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: -1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
            for article in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                    crawl_eastmoney(url, article)
                except Exception as error:
                    print(error)
        else:
            i = -1
    else:
        print("Failed to fetch URL:", URL)
        i = -1  # stop paging instead of retrying the same request forever

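# gov.cn: policy-interpretation listing pages; crawl every in-site link newer
# than DELTA days.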
print("gov.cn") |
|
i = 0 |
|
while i > -1: |
|
if i == 0: |
|
CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm" |
|
else: |
|
CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm" |
|
i = i + 1 |
|
req = urllib.request.urlopen(CATEGORY_URL) |
|
text = req.read() |
|
html_text = text.decode("utf-8") |
|
page = etree.HTML(html_text) |
|
articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4") |
|
for article in articlelist: |
|
if isinstance(article, etree._Element): |
|
subelement = etree.tostring(article).decode() |
|
subpage = etree.HTML(subelement) |
|
date = subpage.xpath("//span/text()")[0] |
|
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d") |
|
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)): |
|
i = -1 |
|
else: |
|
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href") |
|
for url in urls: |
|
try: |
|
article = {} |
|
url = url.replace('../', 'https://www.gov.cn/zhengce/') |
|
if "https://www.gov.cn" in url: |
|
article['category']= "Policy Interpretation" |
|
crawl(url, article) |
|
except Exception as error: |
|
print(error) |
|
|
|
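# gov.cn: latest policy releases from the State Council (zhengce/zuixin listing).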
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            article['site'] = "State Council of China"
                            crawl(url, article)
                    except Exception as error:
                        print(error)

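# mof.gov.cn: financial news listing pages of the Ministry of Finance.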
print("mof.gov.cn") |
|
i = 0 |
|
while i > -1: |
|
if i == 0: |
|
CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/" |
|
else: |
|
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm" |
|
i = i + 1 |
|
req = urllib.request.urlopen(CATEGORY_URL) |
|
text = req.read() |
|
html_text = text.decode("utf-8") |
|
page = etree.HTML(html_text) |
|
articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]") |
|
for article in articlelist: |
|
if isinstance(article, etree._Element): |
|
subelement = etree.tostring(article).decode() |
|
subpage = etree.HTML(subelement) |
|
date = subpage.xpath("//span/text()")[0] |
|
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d") |
|
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)): |
|
i = -1 |
|
else: |
|
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href") |
|
for url in urls: |
|
try: |
|
article = {} |
|
url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/") |
|
url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/") |
|
article['category']= "Financial News" |
|
crawl(url, article) |
|
except Exception as error: |
|
print(error) |
|
|
|
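# mof.gov.cn: policy-interpretation listing pages.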
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
    else:
        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace("./", CATEGORY_URL)
                        article['category'] = "Policy Interpretation"
                        print(url)
                        crawl(url, article)
                    except Exception as error:
                        print(error)

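# mofcom.gov.cn: loop over four sub-sections of the Ministry of Commerce
# policy-interpretation channel; links outside /article/zcjd are categorised
# as policy releases.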
print("mofcom.gov.cn") |
|
categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz'] |
|
for category in categories: |
|
i = 1 |
|
while i > -1: |
|
if i == 1: |
|
URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/" |
|
else: |
|
URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}" |
|
i = i + 1 |
|
req = urllib.request.urlopen(URL) |
|
text = req.read() |
|
html_text = text.decode("utf-8") |
|
page = etree.HTML(html_text) |
|
articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li") |
|
for article in articlelist: |
|
if isinstance(article, etree._Element): |
|
subelement = etree.tostring(article).decode() |
|
subpage = etree.HTML(subelement) |
|
date = subpage.xpath("//span/text()")[0] |
|
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d") |
|
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)): |
|
i = -1 |
|
else: |
|
urls = subpage.xpath("//a/@href") |
|
for url in urls: |
|
try: |
|
article = {} |
|
if '/article/zcjd' in url: |
|
url = "http://www.mofcom.gov.cn" + url |
|
article['category']= "Policy Interpretation" |
|
else: |
|
article['category']= "Policy Release" |
|
crawl(url, article) |
|
except Exception as error: |
|
print(error) |
|
|
|
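# ndrc.gov.cn: policy-interpretation listing; links pointing to www.gov.cn or
# into the zcfb section are categorised as policy releases instead.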
print("ndrc.gov.cn") |
|
i = 0 |
|
while i > -1: |
|
if i == 0: |
|
CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html" |
|
else: |
|
CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html" |
|
i = i + 1 |
|
req = urllib.request.urlopen(CATEGORY_URL) |
|
text = req.read() |
|
html_text = text.decode("utf-8") |
|
page = etree.HTML(html_text) |
|
articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]") |
|
for article in articlelist: |
|
if isinstance(article, etree._Element): |
|
subelement = etree.tostring(article).decode() |
|
subpage = etree.HTML(subelement) |
|
date = subpage.xpath("//span/text()")[0] |
|
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d") |
|
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)): |
|
i = -1 |
|
else: |
|
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href") |
|
for url in urls: |
|
try: |
|
article = {} |
|
if "www.gov.cn" in url: |
|
article['category']= "Policy Release" |
|
elif "../../zcfb/" in url: |
|
url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/") |
|
article['category']= "Policy Release" |
|
else: |
|
url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/") |
|
url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/") |
|
article['category']= "Policy Interpretation" |
|
crawl(url, article) |
|
except Exception as error: |
|
print(error) |
|
|
|
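# safe.gov.cn: policy and regulation interpretation listing pages.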
print("safe.gov.cn") |
|
i = 1 |
|
while i > -1: |
|
if i == 1: |
|
CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html" |
|
else: |
|
CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html" |
|
i = i + 1 |
|
req = urllib.request.urlopen(CATEGORY_URL) |
|
text = req.read() |
|
html_text = text.decode("utf-8") |
|
page = etree.HTML(html_text) |
|
articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li") |
|
for article in articlelist: |
|
if isinstance(article, etree._Element): |
|
subelement = etree.tostring(article).decode() |
|
subpage = etree.HTML(subelement) |
|
date = subpage.xpath("//dd/text()")[0] |
|
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d") |
|
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)): |
|
i = -1 |
|
else: |
|
urls = subpage.xpath("//a/@href") |
|
for url in urls: |
|
try: |
|
article = {} |
|
url = "https://www.safe.gov.cn" + url |
|
article['category']= "Policy Interpretation" |
|
crawl(url, article) |
|
except Exception as error: |
|
print(error) |
|
|
|
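# safe.gov.cn: data-interpretation listing pages.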
i = 1
while i > -1:
    if i == 1:
        CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
    else:
        CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//dd/text()")[0]
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                i = -1
            else:
                urls = subpage.xpath("//a/@href")
                for url in urls:
                    try:
                        article = {}
                        url = "https://www.safe.gov.cn" + url
                        article['category'] = "Data Interpretation"
                        crawl(url, article)
                    except Exception as error:
                        print(error)

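# stats.gov.cn: data-interpretation listing pages of the National Bureau of
# Statistics.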
print("stats.gov.hk") |
|
i = 0 |
|
while i > -1: |
|
if i == 0: |
|
CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/" |
|
else: |
|
CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html" |
|
i = i + 1 |
|
req = urllib.request.urlopen(CATEGORY_URL) |
|
text = req.read() |
|
html_text = text.decode("utf-8") |
|
page = etree.HTML(html_text) |
|
articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li") |
|
for article in articlelist: |
|
if isinstance(article, etree._Element): |
|
subelement = etree.tostring(article).decode() |
|
subpage = etree.HTML(subelement) |
|
date = encode(subpage.xpath("//span")) |
|
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d") |
|
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)): |
|
i = -1 |
|
else: |
|
urls = subpage.xpath("//a[@class='fl pc_1600']/@href") |
|
for url in urls: |
|
try: |
|
article = {} |
|
url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/") |
|
article['category']= "Data Interpretation" |
|
crawl(url, article) |
|
except Exception as error: |
|
print(error) |