""" | |
This script is used for data collection from the China Taxation website. It retrieves policy interpretation articles and processes them for further analysis. | |
The script performs the following steps: | |
1. Imports necessary modules and libraries. | |
2. Defines the base URL for retrieving policy interpretation articles. | |
3. Iterates through the pages of the search results. | |
4. Retrieves the content of each article. | |
5. Processes the content by translating it to English and performing sentiment analysis. | |
6. Stores the processed data in a database. | |
Note: The script also retrieves additional articles from a different URL and follows a similar process. | |
""" | |
import json
import ssl
import time
import urllib.parse
import urllib.request
import uuid
from datetime import datetime, timedelta

import urllib3
from lxml import etree

from utils import translate, sentiment_computation, upsert_content, encode_content

# Fall back to the stdlib's default (unverified) SSL context so HTTPS requests
# to the site do not fail on certificate verification.
ssl._create_default_https_context = ssl._create_stdlib_context
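
# A minimal sketch of the search-API response shape the first loop assumes.
# The field names are taken from the code below; the values are placeholders:
#
# {
#   "searchResultAll": {
#     "searchTotal": [
#       {
#         "title": "...",
#         "originalTitle": "...",
#         "content": "...",
#         "cwrq": "2024-01-15 09:30:00",
#         "snapshotUrl": "https://www.chinatax.gov.cn/..."
#       }
#     ]
#   }
# }
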
i = 0
while i > -1:
    # Search API for the 政策解读 ("Policy Interpretation") column, 10 results per page.
    CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
    i = i + 1
    urllib3.disable_warnings()
    req = urllib.request.urlopen(CATEGORY_URL)
    content = req.read().decode("utf-8")
    reportinfo = json.loads(content)
    for article in reportinfo['searchResultAll']['searchTotal']:
        try:
            # 'cwrq' is the article's publication timestamp.
            parsed_datetime = datetime.strptime(
                time.strftime("%Y-%m-%d", time.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S")),
                "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                print(parsed_datetime)
                i = -1  # past the 183-day window; stop the outer while loop
            else:
                article['category'] = "Policy Interpretation"
                # Drop stray unicode-escape markers, then use repr() to escape
                # newlines and quotes before storing the Chinese text.
                contentCN = article['content'].replace('\\u', '')
                article['contentCN'] = repr(contentCN)[1:-1].strip()
                if len(article['contentCN']) < 10:
                    continue  # skip articles with no meaningful body
                content_eng = ''
                for element in contentCN.split("\n"):
                    content_eng += translate(element) + '\n'
                article['content'] = repr(content_eng)[1:-1].strip()
                article['site'] = "State Taxation Administration of China"
                article['originalSite'] = "国家税务总局"
                article['titleCN'] = article['title']
                article['title'] = translate(article['originalTitle'])
                article['url'] = article['snapshotUrl']
                article['author'] = ""
                article['attachment'] = ""
                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S"))
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(content_eng.replace("\n", ""))
                # Deterministic ID: the same title + publish date always maps to the
                # same UUID, so re-running the script updates rather than duplicates.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                upsert_content(article)
        except Exception as error:
            print(error)
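
# Second endpoint: a POST-based file list for the policy-interpretation channel.
# A minimal sketch of the response shape the loop below assumes; field names
# are taken from the code, values are placeholders:
#
# {
#   "results": {
#     "data": {
#       "results": [
#         {
#           "title": "...",
#           "originalTitle": "...",
#           "publishedTimeStr": "2024-01-15 09:30:00",
#           "url": "http://www.chinatax.gov.cn/zcfgk/..."
#         }
#       ]
#     }
#   }
# }
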
CATEGORY_URL = 'https://www.chinatax.gov.cn/getFileListByCodeId'
i = 0
while i > -1:
    # Define the POST payload for the current page
    payload = {
        'channelId': '29a88b67e4b149cfa9fac7919dfb08a5',
        'page': i,
        'size': '10'
    }
    i = i + 1
    # Encode the payload data
    payload = urllib.parse.urlencode(payload).encode('utf-8')
    req = urllib.request.urlopen(CATEGORY_URL, data=payload)
    content = req.read().decode("utf-8")
    reportinfo = json.loads(content)
    for article in reportinfo['results']['data']['results']:
        # 'publishedTimeStr' is the article's publication timestamp.
        parsed_datetime = datetime.strptime(
            time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")),
            "%Y-%m-%d")
        if parsed_datetime < (datetime.today() - timedelta(days=183)):
            print(parsed_datetime)
            i = -1  # past the 183-day window; stop the outer while loop
        else:
            try:
                # Point the article URL at the fgk.chinatax.gov.cn host before fetching.
                url = article['url'].replace("http://www.chinatax.gov.cn/zcfgk", "https://fgk.chinatax.gov.cn/zcfgk")
                req = urllib.request.urlopen(url)
                text = req.read()
                html_text = text.decode("utf-8")
                page = etree.HTML(html_text)
                # Extract the body paragraphs and a summary from the article page.
                contentCN, summary = encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
                article['contentCN'] = repr(contentCN)[1:-1].strip()
                if len(article['contentCN']) < 10:
                    continue  # skip articles with no meaningful body
                content_eng = ''
                for element in contentCN.split("\n"):
                    content_eng += translate(element) + '\n'
                article['content'] = repr(content_eng)[1:-1].strip()
                article['site'] = "State Taxation Administration of China"
                article['originalSite'] = "国家税务总局"
                article['titleCN'] = article['title']
                article['title'] = translate(article['originalTitle'])
                article['subtitle'] = translate(summary)
                article['attachment'] = ""
                article['author'] = ""
                article['category'] = "Policy Interpretation"
                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
                # Deterministic ID for idempotent upserts (same scheme as above).
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                upsert_content(article)
            except Exception as error:
                print(error)