Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

App Files Files Community

Data-Collection-China / cbirc.py

gavinzli

chore: Add script descriptions and improve code readability

39fe3d1 11 months ago

raw

history blame

5.11 kB

	"""
	This script fetches data from the China Banking and Insurance Regulatory Commission (CBIRC) website and extracts relevant information from the fetched data.
	The extracted information is then processed and stored in a database.

	The script performs the following steps:
	1. Pages through the CBIRC JSON document listing (item 917, 18 entries per page) by making HTTP requests.
	2. For each entry published within the last 183 days, extracts the text and a summary from the linked PDF.
	3. Translates the extracted content, title and summary to English.
	4. Computes a sentiment score and label for the translated content.
	5. Stores the processed record in the database via `upsert_content`.

	Note: The script also includes commented code for fetching data from the State Taxation Administration of China website, but it is currently disabled.
	"""
	import json
	import uuid
	import time
	from datetime import datetime, timedelta
	from utils import translate, sentiment_computation, upsert_content, fetch_url, extract_from_pdf

	_BASE_URL = "https://www.cbirc.gov.cn"
	_LISTING_URL = (
	    _BASE_URL
	    + "/cn/static/data/DocInfo/SelectDocByItemIdAndChild/"
	    + "data_itemId=917,pageIndex={page},pageSize=18.json"
	)
	# Articles published more than this many days ago end the crawl.
	_LOOKBACK_DAYS = 183


	def _process_article(article, cutoff):
	    """Enrich one listing row and upsert it.

	    The row's 'publishDate' is normalised to YYYY-MM-DD in place. If the
	    article is older than *cutoff* nothing else happens. Otherwise the linked
	    PDF is extracted, translated, scored and passed to ``upsert_content``.
	    Articles whose extracted text is shorter than 10 characters are skipped.

	    Args:
	        article: dict for one row of the listing; mutated in place.
	        cutoff: datetime; articles published before it are considered stale.

	    Returns:
	        False if the article is older than *cutoff*, True otherwise
	        (whether it was stored or skipped for being too short).

	    Raises:
	        Whatever the parsing, extraction, translation or upsert helpers
	        raise; the caller decides how to handle per-article failures.
	    """
	    article['publishDate'] = time.strftime(
	        "%Y-%m-%d",
	        time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"),
	    )
	    if datetime.strptime(article['publishDate'], "%Y-%m-%d") < cutoff:
	        return False

	    # Build the PDF link once so the fetched URL and the stored link agree.
	    pdf_url = _BASE_URL + article['pdfFileUrl']
	    content_cn, summary = extract_from_pdf(pdf_url)
	    if len(content_cn) < 10:
	        # Too little text to be a real document; nothing is stored.
	        return True

	    # repr()[1:-1] stores the text with control characters escaped.
	    article['contentCN'] = repr(content_cn)[1:-1].strip()
	    # NOTE(review): after the repr() escaping above, real line breaks have
	    # become the two characters backslash + 'n', so this split normally
	    # yields a single element and the text is translated in one call.
	    # Kept as-is to preserve the stored output; confirm whether per-line
	    # translation of the raw text was intended.
	    translated = ''.join(
	        translate(chunk) + '\n' for chunk in article['contentCN'].split("\n")
	    )
	    article['content'] = repr(translated)[1:-1].strip()
	    article['site'] = "National Financial Regulatory Administration of China"
	    article['originSite'] = "国家金融监督管理总局"
	    article['titleCN'] = article['docSubtitle']
	    article['title'] = translate(article['docSubtitle'])
	    article['link'] = pdf_url
	    article['category'] = "Policy Interpretation"
	    article['id'] = uuid.uuid5(
	        uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate']
	    )
	    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
	    article['attachment'] = ''
	    article['author'] = ''
	    article['subtitle'] = translate(summary)
	    upsert_content(article)
	    return True


	def _crawl_policy_interpretations():
	    """Walk the listing page by page until a stale article or an empty page.

	    Each requested listing URL is printed. A page is always processed to its
	    end, so newer rows that follow a stale one on the same page are still
	    handled. Per-article failures are printed and do not stop the crawl.
	    """
	    cutoff = datetime.today() - timedelta(days=_LOOKBACK_DAYS)
	    page = 1
	    reached_cutoff = False
	    while not reached_cutoff:
	        listing_url = _LISTING_URL.format(page=page)
	        page += 1
	        print(listing_url)
	        listing = json.loads(fetch_url(listing_url))
	        rows = listing['data']['rows']
	        if not rows:
	            # Past the last page: without this guard the loop would keep
	            # requesting pages forever when no stale article is ever seen.
	            break
	        for article in rows:
	            try:
	                if not _process_article(article, cutoff):
	                    reached_cutoff = True
	            except Exception as error:
	                # Best-effort crawl: report the failure, move to the next row.
	                print(error)


	# Run on execution/import, as the original flat script did.
	_crawl_policy_interpretations()


	# ssl._create_default_https_context = ssl._create_stdlib_context
	# i = 0
	# while i > -1:
	# CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
	# i = i + 1
	# urllib3.disable_warnings()
	# try:
	# req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
	# except:
	# break
	# content = req.read().decode("utf-8")
	# reportinfo = json.loads(content)
	# for article in reportinfo['searchResultAll']['searchTotal']:
	# try:
	# parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
	# if parsed_datetime < (datetime.today() - timedelta(days=183)):
	# i = -1
	# else:
	# article['originalContent'] = article['content'].replace('\\u','')
	# if len(article['originalContent']) < 10:
	# continue
	# CONTENT_ENG = ''
	# for element in article['originalContent'].split("。"):
	# CONTENT_ENG += translate(element) + ' '
	# article['content'] = CONTENT_ENG
	# article['site'] = "State Taxation Administration of China"
	# article['originalSite'] = "国家税务总局"
	# article['originalTitle'] = article['title']
	# article['title'] = translate(article['originalTitle'])
	# article['url'] = article['snapshotUrl']
	# article['category']= "Policy Interpretation"
	# article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S"))
	# article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
	# article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
	# upsert_content(article)
	# except Exception as error:
	# print(error)