"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles."""
import json
import logging
import urllib.request
import uuid
from datetime import datetime, timedelta

from lxml import etree

from controllers.summarizer import summarize
from controllers.utils import (
    crawl_by_url,
    encode,
    fetch_url,
    sentiment_computation,
    translate,
    update_content,
)


def crawl(delta):
    """
    Crawl http://www.csrc.gov.cn and collect articles published within the last delta days.

    Args:
        delta (int): Number of days to look back from today; articles older
            than this cutoff stop the crawl.

    Returns:
        None
    """
    logging.info("csrc.gov.cn")
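    # Phase 1: walk the paginated "Policy Interpretation" HTML list; stop once
    # a listed article is older than today - delta (i is set to -1).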
    i = 1
    while i > -1:
        try:
            # The first list page has no numeric suffix; later pages do.
            if i == 1:
                category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
            else:
                category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
            i = i + 1
            logging.info(category_url)
            req = urllib.request.urlopen(category_url)
            html_text = req.read().decode("utf-8")
            page = etree.HTML(html_text)
            articlelist = page.xpath(
                "//div[contains(@class, 'main-right fr common-list')]/ul/li")
            for article in articlelist:
                if isinstance(article, etree._Element):
                    # Re-serialize the <li> fragment so it can be queried as a
                    # standalone document.
                    subelement = etree.tostring(article).decode()
                    subpage = etree.HTML(subelement)
                    date = encode(subpage.xpath("//span[@class='date']"))
                    parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        # Older than the cutoff: flag the pagination loop to stop.
                        i = -1
                    else:
                        urls = subpage.xpath("//a/@href")
                        for url in urls:
                            try:
                                article_info = {}
                                url = "http://www.csrc.gov.cn" + url
                                article_info['category'] = "Policy Interpretation"
                                logging.info(url)
                                crawl_by_url(url, article_info)
                            except Exception as error:
                                logging.error(error)
        except Exception as error:
            i = -1
            logging.error(error)

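    # Phase 2: page through the site's JSON search endpoint; stop when a page
    # fails to load, comes back empty, or contains an article older than the cutoff.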
    i = 1
    while i > -1:
        category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
        i = i + 1
        try:
            logging.info(category_url)
            content = fetch_url(category_url)
            if content is None:
                i = -1
            else:
                reportinfo = json.loads(content)
                results = reportinfo['data']['results']
                if len(results) == 0:
                    # Empty result page: stop after this iteration.
                    i = -1
                    logging.info(len(results))
                for article in results:
                    # Truncate the publish timestamp to midnight for the
                    # cutoff comparison.
                    parsed_datetime = datetime.strptime(
                        article['publishedTimeStr'],
                        "%Y-%m-%d %H:%M:%S").replace(hour=0, minute=0, second=0)
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        article['category'] = "Financial News"
                        article['site'] = "Securities Regulatory Commission of China"
                        article['originSite'] = "证监会"
                        article['titleCN'] = article['title']
                        article['title'] = translate(article['titleCN'])
                        article['author'] = ''
                        # repr()[1:-1] escapes embedded control characters in
                        # the raw content.
                        article['contentCN'] = repr(article['content'])[1:-1].strip()
                        if len(article['contentCN']) < 10:
                            continue
                        # Translate sentence by sentence, splitting on the
                        # Chinese full stop.
                        contenteng = ''
                        for element in article['contentCN'].split("。"):
                            contenteng += translate(element) + ' '
                        article['content'] = repr(contenteng)[1:-1].strip()
                        article['subtitle'] = summarize(article['content'])
                        article['publishDate'] = parsed_datetime.strftime("%Y-%m-%d")
                        article['link'] = article['url']
                        article['attachment'] = ""
                        article['sentimentScore'], article[
                            'sentimentLabel'] = sentiment_computation(article['content'])
                        # Deterministic ID so re-crawling the same article
                        # yields the same record.
                        article['id'] = uuid.uuid5(
                            uuid.NAMESPACE_OID,
                            article['titleCN'] + article['publishDate'])
                        logging.info(article)
                        # update_content(article)
        except Exception as error:
            i = -1
            logging.error(error)
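

if __name__ == "__main__":
    # Minimal usage sketch (an assumption, not part of the original module):
    # crawl articles published in the last 7 days. The logging configuration
    # and the 7-day window are example values, not requirements of crawl().
    logging.basicConfig(level=logging.INFO)
    crawl(7)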