"""Scrape recent policy-interpretation articles from the National Financial
Regulatory Administration (cbirc.gov.cn) and the State Taxation Administration
(chinatax.gov.cn): extract and translate the text, compute sentiment, and
upsert each article into the content store."""
import json
import ssl
import time
import urllib.request
import uuid
from datetime import datetime, timedelta

import urllib3

from utils import translate, sentiment_computation, upsert_content, fetch_url, extract_from_pdf
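
# --- NFRA (formerly CBIRC) policy interpretations ----------------------------
# Walk the paginated DocInfo endpoint until an article older than 183 days
# (~6 months) appears; that flips the page counter to -1 and ends the loop.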
i = 1
while i > -1:
    CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
    i += 1
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
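    # The endpoint returns JSON shaped like {"data": {"rows": [...]}}.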
    for article in reportinfo['data']['rows']:
        try:
            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S")
            # Results are listed newest-first (the pagination cutoff relies on
            # this), so the first stale article ends both this page and the loop.
            if parsed_datetime < datetime.today() - timedelta(days=183):
                i = -1
                break
            article['originalContent'] = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
            # Skip articles whose PDF yielded little or no extractable text.
            if len(article['originalContent']) < 10:
                continue
            # Translate sentence by sentence, splitting on the CJK full stop.
            CONTENT_ENG = ''
            for element in article['originalContent'].split("。"):
                CONTENT_ENG += translate(element) + ' '
            article['content'] = CONTENT_ENG
            article['site'] = "National Financial Regulatory Administration"
            article['originalSite'] = "国家金融监督管理总局"
            article['originalTitle'] = article['docSubtitle']
            article['title'] = translate(article['originalTitle'])
            article['url'] = "https://www.cbirc.gov.cn" + article['pdfFileUrl']
            article['category'] = "Policy Interpretation"
            article['publishDate'] = parsed_datetime.strftime("%Y-%m-%d")
            # uuid5 is deterministic: the same title + date always maps to the
            # same ID, so re-runs upsert the same record rather than duplicating.
            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
            upsert_content(article)
        except Exception as error:
            print(error)
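
# --- STA (State Taxation Administration) policy interpretations ---------------
# These pages are fetched with urllib directly. Swapping in the stdlib SSL
# context below skips certificate verification, presumably because
# chinatax.gov.cn's certificate chain fails Python's default checks.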
ssl._create_default_https_context = ssl._create_stdlib_context
urllib3.disable_warnings()
i = 0
while i > -1:
    CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
    i += 1
    req = urllib.request.urlopen(CATEGORY_URL)
    content = req.read().decode("utf-8")
    reportinfo = json.loads(content)
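    # The search API returns hits under {"searchResultAll": {"searchTotal": [...]}}.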
    for article in reportinfo['searchResultAll']['searchTotal']:
        try:
            # 'cwrq' (成文日期, date of writing) holds the document's timestamp.
            parsed_datetime = datetime.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S")
            if parsed_datetime < datetime.today() - timedelta(days=183):
                i = -1
                break
            # Strip literal '\u' escape artifacts from the search snippets.
            article['originalContent'] = article['content'].replace('\\u', '')
            if len(article['originalContent']) < 10:
                continue
            CONTENT_ENG = ''
            for element in article['originalContent'].split("。"):
                CONTENT_ENG += translate(element) + ' '
            article['content'] = CONTENT_ENG
            article['site'] = "State Taxation Administration"
            article['originalSite'] = "国家税务总局"
            article['originalTitle'] = article['title']
            article['title'] = translate(article['originalTitle'])
            article['url'] = article['snapshotUrl']
            article['category'] = "Policy Interpretation"
            article['publishDate'] = parsed_datetime.strftime("%Y-%m-%d")
            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
            upsert_content(article)
        except Exception as error:
            print(error)
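
# The two loops above duplicate the translate/score/upsert pipeline. A possible
# consolidation, sketched only (`translate_sentences` is hypothetical, not a
# utils function):
#
#     def translate_sentences(text):
#         """Translate text sentence by sentence on the CJK full stop."""
#         return ' '.join(translate(s) for s in text.split("。"))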