# chinatax.py (Data-Collection-China, OxbridgeEconomics)
# Crawls policy-interpretation articles from the State Taxation
# Administration of China (chinatax.gov.cn) and upserts them downstream.
import json
import ssl
import time
import urllib.parse
import urllib.request
import uuid
from datetime import datetime, timedelta

import urllib3
from lxml import etree

from utils import encode, translate, sentiment_computation, upsert_content

# Use the stdlib (unverified) SSL context so fetches do not fail on the
# site's certificate chain; this disables certificate verification.
ssl._create_default_https_context = ssl._create_stdlib_context
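# The helpers imported from utils are defined elsewhere in this repo.
# Assumed signatures, inferred from how they are used below:
#   translate(text: str) -> str                        # Chinese -> English
#   encode(nodes: list) -> str                         # lxml <p> elements -> plain text
#   sentiment_computation(text: str) -> (float, str)   # (score, label)
#   upsert_content(article: dict) -> None              # write to the data store
#
# The script paginates two chinatax.gov.cn endpoints, keeps articles
# published within the last 183 days, translates them sentence by
# sentence, and upserts each result.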
# Part 1: paginate the site search API for the 政策解读 (Policy
# Interpretation) column; xxgkResolveType=%E6%96%87%E5%AD%97 filters to
# text ("文字") items.
urllib3.disable_warnings()
i = 0
while i > -1:
    CATEGORY_URL = (
        "https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002"
        "&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97"
        f"&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd="
        "&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0"
        "&wordPlace=0&videoreSolveType="
    )
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    content = req.read().decode("utf-8")
    reportinfo = json.loads(content)
    for article in reportinfo['searchResultAll']['searchTotal']:
        try:
            # 'cwrq' is the publication timestamp; truncate it to the date.
            parsed_datetime = datetime.strptime(
                article['cwrq'].split(" ")[0], "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Older than ~6 months: stop paginating after this page.
                print(parsed_datetime)
                i = -1
            else:
                article['originalContent'] = article['content'].replace('\\u', '')
                if len(article['originalContent']) < 10:
                    continue
                # Translate sentence by sentence, splitting on the
                # Chinese full stop.
                CONTENT_ENG = ''
                for element in article['originalContent'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
                article['content'] = CONTENT_ENG
                article['site'] = "State Taxation Administration of China"
                article['originalSite'] = "国家税务总局"
                article['originalTitle'] = article['title']
                article['title'] = translate(article['originalTitle'])
                article['url'] = article['snapshotUrl']
                article['category'] = "Policy Interpretation"
                article['publishDate'] = parsed_datetime.strftime("%Y-%m-%d")
                # Deterministic ID so re-runs update rather than duplicate.
                article['id'] = uuid.uuid5(
                    uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = \
                    sentiment_computation(article['content'])
                upsert_content(article)
        except Exception as error:
            print(error)
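# Part 2: paginate the getFileListByCodeId endpoint via POST. The
# channelId below appears to select the same policy-interpretation
# channel, judging by the category assigned to its articles.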
CATEGORY_URL = 'https://www.chinatax.gov.cn/getFileListByCodeId'
i = 0
while i > -1:
    # Define the payload for the requested page.
    payload = {
        'channelId': '29a88b67e4b149cfa9fac7919dfb08a5',
        'page': i,
        'size': '10'
    }
    i = i + 1
    # Encode the payload; passing data= makes this a POST request.
    payload = urllib.parse.urlencode(payload).encode('utf-8')
    req = urllib.request.urlopen(CATEGORY_URL, data=payload)
    content = req.read().decode("utf-8")
    reportinfo = json.loads(content)
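    # Each hit here carries only metadata, so the body text is scraped
    # from the article page itself; the listed zcfgk URLs are rewritten
    # to the fgk.chinatax.gov.cn host before fetching.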
    for article in reportinfo['results']['data']['results']:
        # 'publishedTimeStr' is the publication timestamp; keep the date.
        parsed_datetime = datetime.strptime(
            article['publishedTimeStr'].split(" ")[0], "%Y-%m-%d")
        if parsed_datetime < (datetime.today() - timedelta(days=183)):
            # Older than ~6 months: stop paginating after this page.
            print(parsed_datetime)
            i = -1
        else:
            try:
                url = article['url'].replace(
                    "http://www.chinatax.gov.cn/zcfgk",
                    "https://fgk.chinatax.gov.cn/zcfgk")
                req = urllib.request.urlopen(url)
                html_text = req.read().decode("utf-8")
                page = etree.HTML(html_text)
                # Pull every paragraph inside the article container.
                article['originalContent'] = encode(
                    page.xpath("//div[contains(@class, 'article')]//p"))
                if len(article['originalContent']) < 10:
                    continue
                # Translate sentence by sentence, splitting on the
                # Chinese full stop.
                CONTENT_ENG = ''
                for element in article['originalContent'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
                article['content'] = CONTENT_ENG
                article['site'] = "State Taxation Administration of China"
                article['originalSite'] = "国家税务总局"
                article['originalTitle'] = article['title']
                article['title'] = translate(article['originalTitle'])
                # article['url'] already holds the original link; keep it.
                article['category'] = "Policy Interpretation"
                article['publishDate'] = parsed_datetime.strftime("%Y-%m-%d")
                # Deterministic ID so re-runs update rather than duplicate.
                article['id'] = uuid.uuid5(
                    uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = \
                    sentiment_computation(article['content'])
                upsert_content(article)
            except Exception as error:
                print(error)