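"""Crawler for State Council (gov.cn) policy articles.

Walks the "Policy Interpretation" (zhengce/jiedu) and "Policy Release"
(zhengce/zuixin) listings, follows each article published within roughly the
last six months, translates the body to English, computes a sentiment score,
and upserts the record through the project's utils helpers.
"""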
from datetime import datetime, timedelta
import uuid
import urllib.request
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content

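# The gov.cn listings paginate as home.htm, home_1.htm, home_2.htm, ...
# The sentinel assignment i = -1 below ends the walk once an entry older
# than 183 days appears.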
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    for item in articlelist:
        if isinstance(item, etree._Element):
            # Each <h4> carries the publish date in a <span> and the article
            # link in an <a target="_blank">.
            date = item.xpath(".//span/text()")[0]
            parsed_datetime = datetime.strptime(date.strip(), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # entry older than ~6 months: stop paginating after this page
            else:
                urls = item.xpath(".//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        # Turn the listing's relative '../' links into absolute gov.cn URLs.
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            req = urllib.request.urlopen(url)
                            text = req.read()
                            html_text = text.decode("utf-8")
                            page = etree.HTML(html_text)
                            article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
                            if len(article['originalContent']) < 10:
                                continue  # skip near-empty articles
                            # Translate sentence by sentence, splitting on the Chinese full stop.
                            CONTENT_ENG = ''
                            for element in article['originalContent'].split("。"):
                                CONTENT_ENG += translate(element) + ' '
                            article['content'] = CONTENT_ENG
                            article['site'] = "State Council"
                            article['originalSite'] = "国务院"
                            article['originalTitle'] = page.xpath("//title/text()")[0]
                            article['title'] = translate(article['originalTitle'])
                            article['url'] = url
                            article['category'] = "Policy Interpretation"
                            article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0], "%Y-%m-%d-%H:%M:%S")
                            # Deterministic ID: re-crawled articles upsert rather than duplicate.
                            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                            upsert_content(article)
                    except Exception as error:
                        print(error)

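# Second pass: the same walk over the "Policy Release" (zuixin) listing.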
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    for item in articlelist:
        if isinstance(item, etree._Element):
            date = item.xpath(".//span/text()")[0]
            parsed_datetime = datetime.strptime(date.strip(), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # stop paginating after this page
            else:
                urls = item.xpath(".//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            req = urllib.request.urlopen(url)
                            text = req.read()
                            html_text = text.decode("utf-8")
                            page = etree.HTML(html_text)
                            article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
                            if len(article['originalContent']) < 10:
                                continue  # skip near-empty articles
                            # Translate each sentence individually, not the whole text per sentence.
                            CONTENT_ENG = ''
                            for element in article['originalContent'].split("。"):
                                CONTENT_ENG += translate(element) + ' '
                            article['content'] = CONTENT_ENG
                            article['site'] = "State Council"
                            article['originalSite'] = "国务院"
                            article['originalTitle'] = page.xpath("//title/text()")[0]
                            article['title'] = translate(article['originalTitle'])
                            article['url'] = url
                            article['category'] = "Policy Release"
                            article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0], "%Y-%m-%d-%H:%M:%S")
                            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                            upsert_content(article)
                    except Exception as error:
                        print(error)