"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles."""
import json
import time
import urllib.error
import urllib.request
import uuid
from datetime import datetime, timedelta

from prefect import task, get_run_logger
from lxml import etree

from controllers.summarizer import summarize
from controllers.utils import (
    crawl_by_url,
    encode,
    fetch_url,
    sentiment_computation,
    translate,
)
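
# The helpers imported from controllers.utils above are assumed to behave as
# follows (inferred from how they are used in this module, not verified
# against that module's source):
#   crawl_by_url(url, article)   -- fetch one article page and populate/persist `article`
#   encode(nodes)                -- extract text from a list of lxml nodes
#   fetch_url(url)               -- GET a URL, returning the body as str or None on failure
#   sentiment_computation(text)  -- return a (score, label) tuple for `text`
#   translate(text)              -- translate Chinese text to English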

@task(name="Data Collection - csrc", log_prints=True)
def crawl(delta):
    """
    Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta.

    Args:
        delta (int): The number of days in the past from today to retrieve articles.

    Returns:
        None

    Raises:
        None
    """
    logger = get_run_logger()
    logger.info("csrc.gov.cn")
    # First pass: page through the "Policy Interpretation" HTML list pages
    # until an article older than the cutoff date is found.
    i = 1
    while i > -1:
        try:
            if i == 1:
                category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
            else:
                category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
            i += 1
            logger.info("Fetching from URL: %s", category_url)
            req = urllib.request.Request(
                category_url,
                headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
            )
            response = urllib.request.urlopen(req, timeout=60)
            text = response.read()
            html_text = text.decode("utf-8")
            page = etree.HTML(html_text)
            articlelist = page.xpath(
                    "//div[contains(@class, 'main-right fr common-list')]/ul/li")
            for article in articlelist:
                if isinstance(article, etree._Element):
                    subelement = etree.tostring(article).decode()
                    subpage = etree.HTML(subelement)
                    date = encode(subpage.xpath("//span[@class='date']"))
                    parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        urls = subpage.xpath("//a/@href")
                        for url in urls:
                            try:
                                # Fresh dict for the article payload; named
                                # article_data so it does not shadow the lxml
                                # element bound to `article` in the outer loop.
                                article_data = {'category': "Policy Interpretation"}
                                url = "http://www.csrc.gov.cn" + url
                                logger.info("Processing article URL: %s", url)
                                crawl_by_url(url, article_data)
                            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                                logger.error(error)
        except (urllib.error.URLError, etree.XMLSyntaxError, ValueError, TimeoutError) as error:
            i = -1
            logger.error(error)

    # Second pass: page through the CSRC JSON search endpoint for
    # "Financial News" until an article older than the cutoff date is found.
    i = 1
    while i > -1:
        category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
        i += 1
        try:
            logger.info(category_url)
            content = fetch_url(category_url)
            if content is None:
                i = -1
            else:
                reportinfo = json.loads(content)
                if len(reportinfo['data']['results']) == 0:
                    i = -1
                    logger.info("No more results returned; stopping pagination.")
                for article in reportinfo['data']['results']:
                    # Truncate the publish timestamp to midnight for the cutoff check.
                    parsed_datetime = datetime.strptime(
                            article['publishedTimeStr'],
                            "%Y-%m-%d %H:%M:%S").replace(hour=0, minute=0, second=0)
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        article['category'] = "Financial News"
                        article['site'] = "Securities Regulatory Commission of China"
                        article['originSite'] = "证监会"  # CSRC in Chinese
                        article['titleCN'] = article['title']
                        article['title'] = translate(article['titleCN'])
                        article['author'] = ''
                        # repr()[1:-1] escapes embedded newlines and other
                        # control characters in the raw text.
                        article['contentCN'] = repr(article['content'])[1:-1].strip()
                        if len(article['contentCN']) < 10:
                            continue
                        # Translate sentence by sentence, splitting on the
                        # Chinese full stop "。".
                        contenteng = ''
                        for element in article['contentCN'].split("。"):
                            contenteng += translate(element) + ' '
                        article['content'] = repr(contenteng)[1:-1].strip()
                        try:
                            article['subtitle'] = summarize(article['content'])
                        except (RuntimeError, ValueError):
                            article['subtitle'] = ""
                        article['publishDate'] = parsed_datetime.strftime("%Y-%m-%d")
                        article['link'] = article['url']
                        article['attachment'] = ""
                        article['sentimentScore'], article['sentimentLabel'] = (
                                sentiment_computation(article['content']))
                        # Deterministic ID derived from the Chinese title and
                        # date, so re-crawled articles keep the same ID.
                        article['id'] = uuid.uuid5(
                                uuid.NAMESPACE_OID,
                                article['titleCN'] + article['publishDate'])
                        logger.info(article)
                        # update_content(article)
        except (ValueError, KeyError, TypeError) as error:
            i = -1
            logger.error(error)
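

if __name__ == "__main__":
    # Usage sketch, not part of the original module: `crawl` is a Prefect task,
    # so it is wrapped in a minimal flow for a direct local run. The flow name
    # and the 7-day lookback are illustrative assumptions.
    from prefect import flow

    @flow(name="data-collection-csrc")
    def run_crawl(delta: int = 7):
        crawl(delta)

    run_crawl()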