"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles."""
import json
import logging
import urllib.request
import uuid
from datetime import datetime, timedelta

from lxml import etree

from controllers.summarizer import summarize
from controllers.utils import (
    crawl_by_url,
    encode,
    fetch_url,
    sentiment_computation,
    translate,
    update_content,
)


def crawl(delta):
    """
    Crawl http://www.csrc.gov.cn and collect articles published within the last delta days.

    Args:
        delta (int): Number of days to look back from today; articles older
            than this cutoff stop the crawl.

    Returns:
        None
    """
    logging.info("csrc.gov.cn")
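    # Phase 1: walk the paginated "Policy Interpretation" HTML list; stop once
    # a listed article is older than today - delta (i is set to -1).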
    i = 1
    while i > -1:
        try:
            # The first list page has no numeric suffix; later pages do.
            if i == 1:
                category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
            else:
                category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
            i = i + 1
            logging.info(category_url)
            req = urllib.request.urlopen(category_url)
            html_text = req.read().decode("utf-8")
            page = etree.HTML(html_text)
            articlelist = page.xpath(
                "//div[contains(@class, 'main-right fr common-list')]/ul/li")
            for article in articlelist:
                if isinstance(article, etree._Element):
                    # Re-serialize the <li> fragment so it can be queried as a
                    # standalone document.
                    subelement = etree.tostring(article).decode()
                    subpage = etree.HTML(subelement)
                    date = encode(subpage.xpath("//span[@class='date']"))
                    parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        # Older than the cutoff: flag the pagination loop to stop.
                        i = -1
                    else:
                        urls = subpage.xpath("//a/@href")
                        for url in urls:
                            try:
                                article_info = {}
                                url = "http://www.csrc.gov.cn" + url
                                article_info['category'] = "Policy Interpretation"
                                logging.info(url)
                                crawl_by_url(url, article_info)
                            except Exception as error:
                                logging.error(error)
        except Exception as error:
            i = -1
            logging.error(error)

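    # Phase 2: page through the site's JSON search endpoint; stop when a page
    # fails to load, comes back empty, or contains an article older than the cutoff.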
    i = 1
    while i > -1:
        category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
        i = i + 1
        try:
            logging.info(category_url)
            content = fetch_url(category_url)
            if content is None:
                i = -1
            else:
                reportinfo = json.loads(content)
                results = reportinfo['data']['results']
                if len(results) == 0:
                    # Empty result page: stop after this iteration.
                    i = -1
                    logging.info(len(results))
                for article in results:
                    # Truncate the publish timestamp to midnight for the
                    # cutoff comparison.
                    parsed_datetime = datetime.strptime(
                        article['publishedTimeStr'],
                        "%Y-%m-%d %H:%M:%S").replace(hour=0, minute=0, second=0)
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        article['category'] = "Financial News"
                        article['site'] = "Securities Regulatory Commission of China"
                        article['originSite'] = "证监会"
                        article['titleCN'] = article['title']
                        article['title'] = translate(article['titleCN'])
                        article['author'] = ''
                        # repr()[1:-1] escapes embedded control characters in
                        # the raw content.
                        article['contentCN'] = repr(article['content'])[1:-1].strip()
                        if len(article['contentCN']) < 10:
                            continue
                        # Translate sentence by sentence, splitting on the
                        # Chinese full stop.
                        contenteng = ''
                        for element in article['contentCN'].split("。"):
                            contenteng += translate(element) + ' '
                        article['content'] = repr(contenteng)[1:-1].strip()
                        article['subtitle'] = summarize(article['content'])
                        article['publishDate'] = parsed_datetime.strftime("%Y-%m-%d")
                        article['link'] = article['url']
                        article['attachment'] = ""
                        article['sentimentScore'], article[
                            'sentimentLabel'] = sentiment_computation(article['content'])
                        # Deterministic ID so re-crawling the same article
                        # yields the same record.
                        article['id'] = uuid.uuid5(
                            uuid.NAMESPACE_OID,
                            article['titleCN'] + article['publishDate'])
                        logging.info(article)
                        # update_content(article)
        except Exception as error:
            i = -1
            logging.error(error)
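

if __name__ == "__main__":
    # Minimal usage sketch (an assumption, not part of the original module):
    # crawl articles published in the last 7 days. The logging configuration
    # and the 7-day window are example values, not requirements of crawl().
    logging.basicConfig(level=logging.INFO)
    crawl(7)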