Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

OxbridgeEconomics

commit

043eca4 about 1 year ago

5.06 kB

	import json
	import ssl
	import time
	import urllib.parse
	import urllib.request
	import uuid
	from datetime import datetime, timedelta

	import urllib3
	from lxml import etree

	from utils import encode, translate, sentiment_computation, upsert_content, encode_content

	# Make urllib build its HTTPS contexts with ssl._create_stdlib_context.
	# SECURITY NOTE(review): in CPython this private name is an alias for an
	# unverified context, so TLS certificates are NOT validated for any urllib
	# request made below. Kept as-is; confirm the target site still needs it.
	ssl._create_default_https_context = ssl._create_stdlib_context

	# ---------------------------------------------------------------------------
	# Crawl 1: paginated search API, stored under category "Policy Interpretation".
	#
	# Pages are requested in increasing order until an article older than the
	# 183-day window is seen (signalled by setting ``i = -1``) or until several
	# consecutive pages come back empty. Each recent article is translated,
	# scored for sentiment and passed to ``upsert_content``.
	# ---------------------------------------------------------------------------

	# Silencing urllib3 warnings only needs to happen once, not once per page.
	urllib3.disable_warnings()

	# Articles whose publication day starts before this instant are out of window.
	cutoff = datetime.today() - timedelta(days=183)

	# Number of consecutive pages that returned no articles. Without this guard
	# the loop would request pages forever once the listing is exhausted and no
	# sufficiently old article had been encountered.
	empty_pages = 0

	i = 0
	while i > -1:
	    CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
	    i = i + 1
	    # Close the HTTP response as soon as the body has been read.
	    with urllib.request.urlopen(CATEGORY_URL) as req:
	        content = req.read().decode("utf-8")
	    reportinfo = json.loads(content)
	    articles = reportinfo['searchResultAll']['searchTotal']
	    if not articles:
	        # Tolerate a couple of empty pages (e.g. an unused page index)
	        # before concluding that the listing is exhausted.
	        empty_pages += 1
	        if empty_pages >= 3:
	            break
	        continue
	    empty_pages = 0
	    for article in articles:
	        # Best-effort per article: a failure is printed and the crawl goes on.
	        try:
	            published = datetime.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S")
	            # Compare on the start of the publication day.
	            parsed_datetime = published.replace(hour=0, minute=0, second=0)
	            if parsed_datetime < cutoff:
	                print(parsed_datetime)
	                # Stop paging after the current page has been processed.
	                i = -1
	                continue
	            article['category'] = "Policy Interpretation"
	            contentCN = article['content'].replace('\\u', '')
	            # repr(...)[1:-1] stores the text with escape sequences spelled out.
	            article['contentCN'] = repr(contentCN)[1:-1].strip()
	            if len(article['contentCN']) < 10:
	                # Too short to be a real article body.
	                continue
	            # Translate line by line, keeping one output line per input line.
	            CONTENT_ENG = ''.join(
	                translate(element) + '\n' for element in contentCN.split("\n")
	            )
	            article['content'] = repr(CONTENT_ENG)[1:-1].strip()
	            article['site'] = "State Taxation Administration of China"
	            article['originalSite'] = "国家税务总局"
	            article['titleCN'] = article['title']
	            article['title'] = translate(article['originalTitle'])
	            article['url'] = article['snapshotUrl']
	            article['author'] = ""
	            article['attachment'] = ""
	            article['publishDate'] = published.strftime("%Y-%m-%d")
	            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n", ""))
	            # Deterministic id derived from the Chinese title and publish date.
	            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
	            upsert_content(article)
	        except Exception as error:
	            print(error)


	# ---------------------------------------------------------------------------
	# Crawl 2: channel listing endpoint (POST), category "Policy Interpretation".
	#
	# Each listing page is requested with a form-encoded payload. For every
	# article inside the 183-day window the article page is downloaded, its
	# paragraphs are extracted with ``encode_content``, translated, scored for
	# sentiment and passed to ``upsert_content``. Paging stops once an older
	# article is seen (``i = -1``) or several consecutive pages are empty.
	# ---------------------------------------------------------------------------
	CATEGORY_URL = 'https://www.chinatax.gov.cn/getFileListByCodeId'

	# Articles whose publication day starts before this instant are out of window.
	cutoff = datetime.today() - timedelta(days=183)

	# Number of consecutive pages that returned no articles; guards against an
	# endless loop when the listing runs out before an old article is found.
	empty_pages = 0

	i = 0
	while i > -1:
	    # Define the payload data
	    payload = {
	        'channelId': '29a88b67e4b149cfa9fac7919dfb08a5',
	        'page': i,
	        'size': '10'
	    }
	    i = i + 1
	    # Encode the payload data
	    payload = urllib.parse.urlencode(payload).encode('utf-8')
	    # Close the HTTP response as soon as the body has been read.
	    with urllib.request.urlopen(CATEGORY_URL, data=payload) as req:
	        content = req.read().decode("utf-8")
	    reportinfo = json.loads(content)
	    articles = reportinfo['results']['data']['results']
	    if not articles:
	        # Tolerate a couple of empty pages before giving up.
	        empty_pages += 1
	        if empty_pages >= 3:
	            break
	        continue
	    empty_pages = 0
	    for article in articles:
	        # Best-effort per article: a failure (including a malformed date) is
	        # printed and the crawl goes on instead of aborting the whole run.
	        try:
	            published = datetime.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")
	            # Compare on the start of the publication day.
	            parsed_datetime = published.replace(hour=0, minute=0, second=0)
	            if parsed_datetime < cutoff:
	                print(parsed_datetime)
	                # Stop paging after the current page has been processed.
	                i = -1
	                continue
	            # The page is fetched from the fgk host; article['url'] itself is
	            # stored exactly as the listing returned it.
	            url = article['url'].replace("http://www.chinatax.gov.cn/zcfgk", "https://fgk.chinatax.gov.cn/zcfgk")
	            with urllib.request.urlopen(url) as response:
	                html_text = response.read().decode("utf-8")
	            page = etree.HTML(html_text)
	            contentCN, summary = encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
	            # repr(...)[1:-1] stores the text with escape sequences spelled out.
	            article['contentCN'] = repr(contentCN)[1:-1].strip()
	            if len(article['contentCN']) < 10:
	                # Too short to be a real article body.
	                continue
	            # Translate line by line, keeping one output line per input line.
	            CONTENT_ENG = ''.join(
	                translate(element) + '\n' for element in contentCN.split("\n")
	            )
	            article['content'] = repr(CONTENT_ENG)[1:-1].strip()
	            article['site'] = "State Taxation Administration of China"
	            article['originalSite'] = "国家税务总局"
	            article['titleCN'] = article['title']
	            article['title'] = translate(article['originalTitle'])
	            article['subtitle'] = translate(summary)
	            article['attachment'] = ""
	            article['author'] = ""
	            article['category'] = "Policy Interpretation"
	            article['publishDate'] = published.strftime("%Y-%m-%d")
	            # Deterministic id derived from the Chinese title and publish date.
	            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
	            # Score the actual translated text, not the repr-escaped copy
	            # (which contains literal backslash-n sequences).
	            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n", ""))
	            upsert_content(article)
	        except Exception as error:
	            print(error)