"""
This script is used to crawl and collect data from the website of the China Securities Regulatory Commission (CSRC).
It retrieves policy interpretation articles and financial news articles from the CSRC website.
The collected data is then processed and stored in a database.

The script consists of two main parts:
1. Crawl and process policy interpretation articles from the CSRC website.
2. Crawl and process financial news articles from the CSRC website.

The script uses various libraries and functions to handle web scraping, data processing, and database operations.

Note: This script assumes the presence of the following dependencies:
- urllib
- lxml
- json
- datetime
- time
- utils (custom module)

Please make sure to install these dependencies before running the script.
"""
import uuid
import json
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, sentiment_computation, upsert_content, fetch_url, crawl
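
# Assumed interfaces of the custom `utils` module, inferred from how the helpers
# are called below (the module itself is not defined in this file):
#   encode(nodes)               -> str: text content of an lxml node list
#   translate(text)             -> str: Chinese-to-English translation
#   sentiment_computation(text) -> (score, label) tuple
#   fetch_url(url)              -> str: raw response body for a URL
#   crawl(url, article)         fetches one article page and stores the result
#   upsert_content(article)     inserts or updates the article in the database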

# Part 1: crawl policy interpretation articles from the CSRC listing pages.
# The page counter "i" doubles as a sentinel: it is set to -1 once an article
# older than roughly six months (183 days) is reached, which ends the loop
# after the current page has been processed.
i = 1
while i > -1:
    # The first listing page has no numeric suffix; subsequent pages do.
    if i == 1:
        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
    else:
        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
    i = i + 1
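    # Download and parse the HTML of the current listing page.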
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
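    # Each <li> entry carries the article's publication date and link(s).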
    for item in articlelist:
        if isinstance(item, etree._Element):
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            date = encode(subpage.xpath("//span[@class='date']"))
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            # Stop paginating once an article falls outside the ~6-month window.
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1
            else:
                urls = subpage.xpath("//a/@href")
                for url in urls:
                    try:
                        article = {}
                        # Listing links are site-relative, so prepend the host.
                        url = "http://www.csrc.gov.cn" + url
                        article['category'] = "Policy Interpretation"
                        crawl(url, article)
                    except Exception as error:
                        print(error)

# Part 2: crawl financial news articles from the CSRC JSON search API,
# using the same page-counter / sentinel pattern as Part 1.
i = 1
while i > -1:
    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
    i = i + 1
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
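    # Each entry in data.results is one news article returned by the search API.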
    for article in reportinfo['data']['results']:
        try:
            # publishedTimeStr looks like "YYYY-MM-DD HH:MM:SS"; keep only the date part.
            publish_date = article['publishedTimeStr'].split(" ")[0]
            parsed_datetime = datetime.strptime(publish_date, "%Y-%m-%d")
            # Stop paginating once an article falls outside the ~6-month window.
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1
            else:
                article['category'] = "Financial News"
                article['site'] = "Securities Regulatory Commission of China"
                article['originSite'] = "证监会"  # "CSRC" in Chinese
                # Keep the original Chinese title and add an English translation.
                article['titleCN'] = article['title']
                article['title'] = translate(article['titleCN'])
                article['author'] = ''
                # repr()[1:-1] escapes newlines and quotes in the raw body text.
                article['contentCN'] = repr(article['content'])[1:-1].strip()
                # Skip entries whose body is too short to be a real article.
                if len(article['contentCN']) < 10:
                    continue
                # Translate the body sentence by sentence, splitting on the Chinese full stop.
                CONTENT_ENG = ''
                for element in article['contentCN'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['subtitle'] = article['memo']
                article['publishDate'] = publish_date
                article['link'] = article['url']
                article['attachment'] = ""
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                # Deterministic ID derived from the Chinese title and publish date.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                upsert_content(article)
        except Exception as error:
            print(error)