"""
This script is used for data collection from the China Taxation website. It retrieves policy interpretation articles and processes them for further analysis.
The script performs the following steps:
1. Imports necessary modules and libraries.
2. Defines the base URL for retrieving policy interpretation articles.
3. Iterates through the pages of the search results.
4. Retrieves the content of each article.
5. Processes the content by translating it to English and performing sentiment analysis.
6. Stores the processed data in a database.
Note: The script also retrieves additional articles from the getFileListByCodeId endpoint and follows a similar process.
"""
import json
import ssl
import uuid
from datetime import datetime, timedelta
import time
import urllib.parse
import urllib.request
import urllib3
from lxml import etree
from utils import translate, sentiment_computation, upsert_content, encode_content
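# The helpers imported from utils are assumed (from their use below) to behave roughly as:
#   translate(text) -> English text, sentiment_computation(text) -> (score, label),
#   encode_content(nodes) -> (body_text, summary), upsert_content(article) -> writes the record to the database.

# Use the stdlib (unverified) SSL context so certificate errors do not abort the HTTPS requests below.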
ssl._create_default_https_context = ssl._create_stdlib_context
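# Page through the policy-interpretation search results, ten articles per page,
# until an article older than roughly six months (183 days) is reached.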
i = 0
while i > -1:
CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
i = i + 1
urllib3.disable_warnings()
req = urllib.request.urlopen(CATEGORY_URL)
content = req.read().decode("utf-8")
reportinfo = json.loads(content)
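    # Each search hit on the current page is cleaned, translated and stored.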
    for article in reportinfo['searchResultAll']['searchTotal']:
        try:
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
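                # Article is older than six months: stop paging after this batch.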
                print(parsed_datetime)
                i = -1
            else:
                article['category'] = "Policy Interpretation"
                contentCN = article['content'].replace('\\u','')
                article['contentCN'] = repr(contentCN)[1:-1].strip()
                if len(article['contentCN']) < 10:
                    continue
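                # Translate the Chinese body to English line by line.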
                CONTENT_ENG = ''
                for element in contentCN.split("\n"):
                    CONTENT_ENG += translate(element) + '\n'
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
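                # Fill in the normalized metadata fields expected by the database.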
                article['site'] = "State Taxation Administration of China"
                article['originalSite'] = "国家税务总局"
                article['titleCN'] = article['title']
                article['title'] = translate(article['originalTitle'])
                article['url'] = article['snapshotUrl']
                article['author'] = ""
                article['attachment'] = ""
                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S"))
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n",""))
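                # Deterministic id from the Chinese title and publish date, so repeated runs upsert the same record.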
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
                upsert_content(article)
        except Exception as error:
            print(error)
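
# Second source: page through the getFileListByCodeId file-list endpoint and process the results in the same way.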
CATEGORY_URL = 'https://www.chinatax.gov.cn/getFileListByCodeId'
i = 0
while i > -1:
    # Define the payload data
    payload = {
        'channelId': '29a88b67e4b149cfa9fac7919dfb08a5',
        'page': i,
        'size': '10'
    }
    i = i + 1
    # Encode the payload data
    payload = urllib.parse.urlencode(payload).encode('utf-8')
    req = urllib.request.urlopen(CATEGORY_URL, data=payload)
    content = req.read().decode("utf-8")
    reportinfo = json.loads(content)
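    # Each result links to a policy page that is fetched, parsed, translated and stored.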
    for article in reportinfo['results']['data']['results']:
        parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
        if parsed_datetime < (datetime.today() - timedelta(days=183)):
            print(parsed_datetime)
            i = -1
        else:
            try:
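                # Rewrite the legacy zcfgk host to fgk.chinatax.gov.cn before fetching the article page.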
                url = article['url'].replace("http://www.chinatax.gov.cn/zcfgk","https://fgk.chinatax.gov.cn/zcfgk")
                req = urllib.request.urlopen(url)
                text = req.read()
                html_text = text.decode("utf-8")
                page = etree.HTML(html_text)
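                # Extract the article paragraphs; encode_content is assumed to return the cleaned body text and a short summary.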
                contentCN, summary = encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
                article['contentCN'] = repr(contentCN)[1:-1].strip()
                if len(article['contentCN']) < 10:
                    continue
                CONTENT_ENG = ''
                for element in contentCN.split("\n"):
                    CONTENT_ENG += translate(element) + '\n'
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['site'] = "State Taxation Administration of China"
                article['originalSite'] = "国家税务总局"
                article['titleCN'] = article['title']
                article['title'] = translate(article['originalTitle'])
                article['url'] = article['url']
                article['subtitle'] = translate(summary)
                article['attachment'] = ""
                article['author'] = ""
                article['category'] = "Policy Interpretation"
                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                upsert_content(article)
            except Exception as error:
                print(error)