"""
This script is used to crawl and collect data from the website of the China Securities Regulatory Commission (CSRC).
It retrieves policy interpretation articles and financial news articles from the CSRC website.
The collected data is then processed and stored in a database.

The script consists of two main parts:
1. Crawl and process policy interpretation articles from the CSRC website.
2. Crawl and process financial news articles from the CSRC website.

The script uses various libraries and functions to handle web scraping, data processing, and database operations.

Note: This script assumes the presence of the following dependencies:
- urllib
- lxml
- json
- datetime
- time
- utils (custom module)

Please make sure to install these dependencies before running the script.
"""
import uuid
import json
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, sentiment_computation, upsert_content, fetch_url, crawl
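
# Assumed interfaces of the custom `utils` module, inferred from how the helpers
# are called below (the module itself is not defined in this file):
#   encode(nodes)               -> str: text content of an lxml node list
#   translate(text)             -> str: Chinese-to-English translation
#   sentiment_computation(text) -> (score, label) tuple
#   fetch_url(url)              -> str: raw response body for a URL
#   crawl(url, article)         fetches one article page and stores the result
#   upsert_content(article)     inserts or updates the article in the database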

# Part 1: crawl policy interpretation articles from the CSRC listing pages.
# The page counter "i" doubles as a sentinel: it is set to -1 once an article
# older than roughly six months (183 days) is reached, which ends the loop
# after the current page has been processed.
i = 1
while i > -1:
    # The first listing page has no numeric suffix; subsequent pages do.
    if i == 1:
        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
    else:
        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
    i = i + 1
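    # Download and parse the HTML of the current listing page.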
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
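    # Each <li> entry carries the article's publication date and link(s).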
    for item in articlelist:
        if isinstance(item, etree._Element):
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            date = encode(subpage.xpath("//span[@class='date']"))
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            # Stop paginating once an article falls outside the ~6-month window.
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1
            else:
                urls = subpage.xpath("//a/@href")
                for url in urls:
                    try:
                        article = {}
                        # Listing links are site-relative, so prepend the host.
                        url = "http://www.csrc.gov.cn" + url
                        article['category'] = "Policy Interpretation"
                        crawl(url, article)
                    except Exception as error:
                        print(error)

# Part 2: crawl financial news articles from the CSRC JSON search API,
# using the same page-counter / sentinel pattern as Part 1.
i = 1
while i > -1:
    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
    i = i + 1
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
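    # Each entry in data.results is one news article returned by the search API.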
    for article in reportinfo['data']['results']:
        try:
            # publishedTimeStr looks like "YYYY-MM-DD HH:MM:SS"; keep only the date part.
            publish_date = article['publishedTimeStr'].split(" ")[0]
            parsed_datetime = datetime.strptime(publish_date, "%Y-%m-%d")
            # Stop paginating once an article falls outside the ~6-month window.
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1
            else:
                article['category'] = "Financial News"
                article['site'] = "Securities Regulatory Commission of China"
                article['originSite'] = "证监会"  # "CSRC" in Chinese
                # Keep the original Chinese title and add an English translation.
                article['titleCN'] = article['title']
                article['title'] = translate(article['titleCN'])
                article['author'] = ''
                # repr()[1:-1] escapes newlines and quotes in the raw body text.
                article['contentCN'] = repr(article['content'])[1:-1].strip()
                # Skip entries whose body is too short to be a real article.
                if len(article['contentCN']) < 10:
                    continue
                # Translate the body sentence by sentence, splitting on the Chinese full stop.
                CONTENT_ENG = ''
                for element in article['contentCN'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['subtitle'] = article['memo']
                article['publishDate'] = publish_date
                article['link'] = article['url']
                article['attachment'] = ""
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                # Deterministic ID derived from the Chinese title and publish date.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                upsert_content(article)
        except Exception as error:
            print(error)