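"""Scraper for "Policy Interpretation" (政策解读) articles published by the
State Taxation Administration of China. Two sources are polled: the site
search API and the getFileListByCodeId listing API. Each article is
translated to English, scored for sentiment, and upserted into the content
store, keeping only items from roughly the last six months (183 days)."""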
import json
import ssl
import time
import urllib.parse
import urllib.request
import uuid
from datetime import datetime, timedelta

from lxml import etree

from utils import translate, sentiment_computation, upsert_content, encode_content
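# utils is a project-local helper module; the signatures assumed from usage
# below are translate(text) -> str, encode_content(nodes) -> (text, summary),
# sentiment_computation(text) -> (score, label), and upsert_content(record).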

# Fall back to the stdlib SSL context so requests to chinatax.gov.cn do not
# fail on certificate verification.
ssl._create_default_https_context = ssl._create_stdlib_context

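# Source 1: the site-wide search API, filtered to the 政策解读 (Policy
# Interpretation) column. Pagination stops once an article older than the
# 183-day cutoff appears, which sets i = -1 and ends the while loop.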
i = 0
while i > -1:
    CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    content = req.read().decode("utf-8")
    reportinfo = json.loads(content)
    for article in reportinfo['searchResultAll']['searchTotal']:
        try:
            published = datetime.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S")
            if published < datetime.today() - timedelta(days=183):
                print(published)
                i = -1  # article is older than the cutoff; stop paginating
            else:
                article['category'] = "Policy Interpretation"
                # Drop stray literal "\u" sequences, then escape the text so
                # newlines and quotes survive storage as a single-line string.
                contentCN = article['content'].replace('\\u', '')
                article['contentCN'] = repr(contentCN)[1:-1].strip()
                if len(article['contentCN']) < 10:
                    continue  # skip near-empty articles
                CONTENT_ENG = ''
                for element in contentCN.split("\n"):
                    CONTENT_ENG += translate(element) + '\n'
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['site'] = "State Taxation Administration of China"
                article['originalSite'] = "国家税务总局"
                article['titleCN'] = article['title']
                article['title'] = translate(article['originalTitle'])
                article['url'] = article['snapshotUrl']
                article['author'] = ""
                article['attachment'] = ""
                article['publishDate'] = published.strftime("%Y-%m-%d")
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n", ""))
                # uuid5 is deterministic, so re-runs upsert the same record
                # instead of inserting duplicates.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                upsert_content(article)
        except Exception as error:
            print(error)


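# Source 2: the getFileListByCodeId listing for the policy-interpretation
# channel. Each hit's detail page is fetched and its paragraphs extracted
# with lxml before the same translate/sentiment/upsert pipeline runs.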
CATEGORY_URL = 'https://www.chinatax.gov.cn/getFileListByCodeId'
i = 0
while i > -1:
    # Payload for the paginated POST request
    payload = {
        'channelId': '29a88b67e4b149cfa9fac7919dfb08a5',
        'page': i,
        'size': '10'
    }
    i = i + 1
    # Form-encode the payload
    data = urllib.parse.urlencode(payload).encode('utf-8')
    req = urllib.request.urlopen(CATEGORY_URL, data=data)
    content = req.read().decode("utf-8")
    reportinfo = json.loads(content)
    for article in reportinfo['results']['data']['results']:
        published = datetime.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")
        if published < datetime.today() - timedelta(days=183):
            print(published)
            i = -1  # article is older than the cutoff; stop paginating
        else:
            try:
                # The listing links to the old host; fetch the article from
                # the current one, but keep article['url'] as published.
                url = article['url'].replace("http://www.chinatax.gov.cn/zcfgk", "https://fgk.chinatax.gov.cn/zcfgk")
                req = urllib.request.urlopen(url)
                html_text = req.read().decode("utf-8")
                page = etree.HTML(html_text)
                contentCN, summary = encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
                article['contentCN'] = repr(contentCN)[1:-1].strip()
                if len(article['contentCN']) < 10:
                    continue  # skip near-empty articles
                CONTENT_ENG = ''
                for element in contentCN.split("\n"):
                    CONTENT_ENG += translate(element) + '\n'
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['site'] = "State Taxation Administration of China"
                article['originalSite'] = "国家税务总局"
                article['titleCN'] = article['title']
                article['title'] = translate(article['originalTitle'])
                article['subtitle'] = translate(summary)
                article['attachment'] = ""
                article['author'] = ""
                article['category'] = "Policy Interpretation"
                article['publishDate'] = published.strftime("%Y-%m-%d")
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                upsert_content(article)
            except Exception as error:
                print(error)