Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

App Files Files Community

Data-Collection-China / chinatax.py

gavinzli

chore: Add script descriptions and improve code readability

39fe3d1 11 months ago

raw

history blame

5.7 kB

	"""
	This script is used for data collection from the China Taxation website. It retrieves policy interpretation articles and processes them for further analysis.

	The script performs the following steps:
	1. Imports necessary modules and libraries.
	2. Defines the base URL for retrieving policy interpretation articles.
	3. Iterates through the pages of the search results.
	4. Retrieves the content of each article.
	5. Processes the content by translating it to English and performing sentiment analysis.
	6. Stores the processed data in a database.

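	Note: After the search-API pass, the script also retrieves articles from a second endpoint (`getFileListByCodeId`), downloads each article page to extract its text, and then applies the same translate / sentiment / upsert steps.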
	"""
	import json
	import ssl
	import uuid
	from datetime import datetime, timedelta
	import time
	import urllib.request
	import urllib3
	from lxml import etree
	from utils import translate, sentiment_computation, upsert_content, encode_content

	# Replace the module-level factory that urllib uses to build its default
	# HTTPS context.
	# NOTE(review): in CPython `ssl._create_stdlib_context` is a private alias of
	# `ssl._create_unverified_context`, so this assignment turns off certificate
	# and hostname verification for every urllib HTTPS request made below.
	# Presumably a workaround for the target site's certificate chain - confirm
	# it is still required.
	ssl._create_default_https_context = ssl._create_stdlib_context

	# Source 1: the site-search JSON API, filtered to the "政策解读" (policy
	# interpretation) column, ten results per page starting at pageNum=0.
	# ``i`` doubles as the page counter and the stop flag: it is set to -1 once an
	# article older than the 183-day window is seen, which ends the outer loop
	# after the rest of the current page has been processed.
	urllib3.disable_warnings()  # no need to repeat this on every page
	cutoff = datetime.today() - timedelta(days=183)
	i = 0
	while i > -1:
	    CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
	    i = i + 1
	    # Use the response as a context manager so the connection is released
	    # after every page instead of being left for the garbage collector.
	    with urllib.request.urlopen(CATEGORY_URL) as req:
	        content = req.read().decode("utf-8")
	    reportinfo = json.loads(content)
	    articles = reportinfo['searchResultAll']['searchTotal']
	    if not articles:
	        # The result set ended before any article crossed the cut-off;
	        # without this guard the loop would request empty pages forever.
	        break
	    for article in articles:
	        # Best-effort per article: any failure is printed and the crawl
	        # moves on to the next record.
	        try:
	            publish_date = time.strftime(
	                "%Y-%m-%d",
	                time.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S"))
	            # Compare at day granularity (midnight of the publication day).
	            parsed_datetime = datetime.strptime(publish_date, "%Y-%m-%d")
	            if parsed_datetime < cutoff:
	                print(parsed_datetime)
	                i = -1
	            else:
	                article['category'] = "Policy Interpretation"
	                # Drop literal backslash-u sequences from the raw text.
	                contentCN = article['content'].replace('\\u', '')
	                # repr() escapes newlines and other control characters;
	                # [1:-1] removes the quote characters repr() wraps around.
	                article['contentCN'] = repr(contentCN)[1:-1].strip()
	                if len(article['contentCN']) < 10:
	                    # Fewer than 10 characters: skip this record.
	                    continue
	                # Translate line by line, one output line per input line.
	                CONTENT_ENG = ''.join(
	                    translate(element) + '\n'
	                    for element in contentCN.split("\n"))
	                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
	                article['site'] = "State Taxation Administration of China"
	                article['originalSite'] = "国家税务总局"
	                article['titleCN'] = article['title']
	                article['title'] = translate(article['originalTitle'])
	                article['url'] = article['snapshotUrl']
	                article['author'] = ""
	                article['attachment'] = ""
	                article['publishDate'] = publish_date
	                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n",""))
	                # Deterministic id derived from the Chinese title and the
	                # publish date, so the same article maps to the same id.
	                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
	                upsert_content(article)
	        except Exception as error:
	            print(error)


	# Source 2: the channel listing endpoint. Each page of ten entries is
	# requested by POSTing a form-encoded payload; every entry's own page is then
	# downloaded and its paragraphs extracted. ``i`` is both the page counter and
	# the stop flag (-1 once an entry older than the 183-day window is seen).
	CATEGORY_URL = 'https://www.chinatax.gov.cn/getFileListByCodeId'
	cutoff = datetime.today() - timedelta(days=183)
	i = 0
	while i > -1:
	    # Define the payload data
	    payload = {
	        'channelId': '29a88b67e4b149cfa9fac7919dfb08a5',
	        'page': i,
	        'size': '10'
	    }
	    i = i + 1
	    # Encode the payload data; passing ``data`` makes urlopen issue a POST.
	    payload = urllib.parse.urlencode(payload).encode('utf-8')
	    # Context manager so the listing response is closed after reading.
	    with urllib.request.urlopen(CATEGORY_URL, data=payload) as req:
	        content = req.read().decode("utf-8")
	    reportinfo = json.loads(content)
	    articles = reportinfo['results']['data']['results']
	    if not articles:
	        # The listing ended before any entry crossed the cut-off; without
	        # this guard the loop would request empty pages forever.
	        break
	    for article in articles:
	        # Best-effort per article. Date parsing is inside the try as well,
	        # so one malformed record is printed and skipped instead of
	        # aborting the whole run.
	        try:
	            publish_date = time.strftime(
	                "%Y-%m-%d",
	                time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
	            # Compare at day granularity (midnight of the publication day).
	            parsed_datetime = datetime.strptime(publish_date, "%Y-%m-%d")
	            if parsed_datetime < cutoff:
	                print(parsed_datetime)
	                i = -1
	                continue
	            # Fetch from the https fgk host; article['url'] itself is stored
	            # exactly as the listing returned it.
	            url = article['url'].replace("http://www.chinatax.gov.cn/zcfgk","https://fgk.chinatax.gov.cn/zcfgk")
	            with urllib.request.urlopen(url) as resp:
	                html_text = resp.read().decode("utf-8")
	            page = etree.HTML(html_text)
	            contentCN, summary = encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
	            # repr() escapes newlines and other control characters;
	            # [1:-1] removes the quote characters repr() wraps around.
	            article['contentCN'] = repr(contentCN)[1:-1].strip()
	            if len(article['contentCN']) < 10:
	                # Fewer than 10 characters: skip this record.
	                continue
	            # Translate line by line, one output line per input line.
	            CONTENT_ENG = ''.join(
	                translate(element) + '\n'
	                for element in contentCN.split("\n"))
	            article['content'] = repr(CONTENT_ENG)[1:-1].strip()
	            article['site'] = "State Taxation Administration of China"
	            article['originalSite'] = "国家税务总局"
	            article['titleCN'] = article['title']
	            article['title'] = translate(article['originalTitle'])
	            article['subtitle'] = translate(summary)
	            article['attachment'] = ""
	            article['author'] = ""
	            article['category'] = "Policy Interpretation"
	            article['publishDate'] = publish_date
	            # Deterministic id derived from the Chinese title and the
	            # publish date, so the same article maps to the same id.
	            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
	            # NOTE(review): this scores the repr-escaped text, whereas the
	            # search-API loop scores CONTENT_ENG with newlines removed -
	            # confirm which input sentiment_computation expects.
	            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
	            upsert_content(article)
	        except Exception as error:
	            print(error)