"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles."""
import json
import time
import urllib.error
import urllib.request
import uuid
from datetime import datetime, timedelta
from prefect import task, get_run_logger
from lxml import etree
from controllers.summarizer import summarize
from controllers.utils import (
    crawl_by_url,
    encode,
    fetch_url,
    sentiment_computation,
    translate,
)


@task(name="Data Collection - csrc", log_prints=True)
def crawl(delta):
"""
Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta.
Args:
delta (int): The number of days in the past from today to retrieve articles.
Returns:
None
Raises:
None
"""
    logger = get_run_logger()
    logger.info("Crawling csrc.gov.cn")
    i = 1
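    # First pass: paginate the "Policy Interpretation" HTML listing pages.
    # i is the page counter; it is set to -1 to stop paginating once an
    # article older than the delta window is seen or a fetch fails.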
    while i > -1:
        try:
            if i == 1:
                category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
            else:
                category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
            i = i + 1
            logger.info("Fetching from URL: %s", category_url)
            req = urllib.request.Request(
                category_url,
                headers={
                    'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/58.0.3029.110 Safari/537.3'
                })
            response = urllib.request.urlopen(req, timeout=60)
            text = response.read()
            html_text = text.decode("utf-8")
            page = etree.HTML(html_text)
            articlelist = page.xpath(
                "//div[contains(@class, 'main-right fr common-list')]/ul/li")
            for article in articlelist:
                if isinstance(article, etree._Element):
                    subelement = etree.tostring(article).decode()
                    subpage = etree.HTML(subelement)
                    date = encode(subpage.xpath("//span[@class='date']"))
                    parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        urls = subpage.xpath("//a/@href")
                        for url in urls:
                            try:
                                # Fresh dict for the payload; do not reuse the
                                # loop variable `article`.
                                article_info = {}
                                url = "http://www.csrc.gov.cn" + url
                                article_info['category'] = "Policy Interpretation"
                                logger.info("Processing article URL: %s", url)
                                crawl_by_url(url, article_info)
                            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                                logger.error(error)
        except (urllib.error.URLError, etree.XMLSyntaxError, ValueError, TimeoutError) as error:
            i = -1
            logger.error(error)
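    # Second pass: page through the site's JSON search endpoint for
    # "Financial News" items, translating and enriching each result.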
    i = 1
    while i > -1:
        category_url = ("http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8"
                        "?_isAgg=true&_isJson=true&_pageSize=18&_template=index"
                        f"&_rangeTimeGte=&_channelName=&page={i}")
        i = i + 1
        try:
            logger.info("Fetching from URL: %s", category_url)
            content = fetch_url(category_url)
            if content is None:
                i = -1
            else:
                reportinfo = json.loads(content)
                if len(reportinfo['data']['results']) == 0:
                    i = -1
                logger.info("Results on page: %d", len(reportinfo['data']['results']))
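                # Results older than the delta window end the pagination;
                # newer ones are translated, summarized, and scored below.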
                for article in reportinfo['data']['results']:
                    # publishedTimeStr carries a full timestamp; truncate it
                    # to a date before comparing against the delta window.
                    parsed_datetime = datetime.strptime(
                        time.strftime(
                            "%Y-%m-%d",
                            time.strptime(article['publishedTimeStr'],
                                          "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        article['category'] = "Financial News"
                        article['site'] = "Securities Regulatory Commission of China"
                        article['originSite'] = "证监会"
                        article['titleCN'] = article['title']
                        article['title'] = translate(article['titleCN'])
                        article['author'] = ''
                        # repr() escapes control characters; [1:-1] strips the
                        # quotes repr() adds around the string.
                        article['contentCN'] = repr(article['content'])[1:-1].strip()
                        if len(article['contentCN']) < 10:
                            continue
                        # Translate sentence by sentence, splitting on the
                        # Chinese full stop.
                        contenteng = ''
                        for element in article['contentCN'].split("。"):
                            contenteng += translate(element) + ' '
                        article['content'] = repr(contenteng)[1:-1].strip()
                        try:
                            article['subtitle'] = summarize(article['content'])
                        except (RuntimeError, ValueError):
                            article['subtitle'] = ""
                        article['publishDate'] = time.strftime(
                            "%Y-%m-%d",
                            time.strptime(article['publishedTimeStr'],
                                          "%Y-%m-%d %H:%M:%S"))
                        article['link'] = article['url']
                        article['attachment'] = ""
                        article['sentimentScore'], article['sentimentLabel'] = \
                            sentiment_computation(article['content'])
                        # Deterministic ID from the Chinese title and publish
                        # date, so re-crawls yield the same UUID.
                        article['id'] = uuid.uuid5(
                            uuid.NAMESPACE_OID,
                            article['titleCN'] + article['publishDate'])
                        logger.info(article)
                        # update_content(article)
        except (ValueError, KeyError, TypeError) as error:
            i = -1
            logger.error(error)
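

# A minimal usage sketch, assuming this task is invoked from a Prefect flow.
# The flow name and the 7-day lookback are illustrative assumptions, not
# part of this module.
if __name__ == "__main__":
    from prefect import flow

    @flow(name="data-collection-csrc")
    def data_collection_flow():
        """Run the csrc crawl with a one-week lookback."""
        crawl(delta=7)

    data_collection_flow()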