Spaces:

Oxbridge-Economics
/

Data-Collection-China

Sleeping

File size: 3,688 Bytes

"""Module to crawl the website 'https://www.cbirc.gov.cn' to fetch and process articles."""
import json
import time
import uuid
from datetime import datetime, timedelta
from prefect import task, get_run_logger

from controllers.summarizer import summarize
from controllers.utils import (
        extract_from_pdf,
        fetch_url,
        sentiment_computation,
        translate,
        update_content,
)
from controllers.vectorizer import vectorize

@task(name="Data Collection - cbirc", log_prints=True)
def crawl(delta):
    """
    Crawls the website 'https://www.cbirc.gov.cn' to fetch and process articles.

    Listing pages are requested one after another (18 rows per page). Every
    article published within the last ``delta`` days is downloaded as a PDF,
    translated, scored for sentiment, summarised, stored through
    ``update_content`` and finally passed to ``vectorize``.

    The crawl stops when any of the following happens:
        * an article older than the cutoff is seen (the rest of that page is
          still processed, but no further page is requested);
        * a listing page comes back with no rows (end of pagination);
        * several listing pages in a row cannot be fetched.

    Args:
        delta (int): The number of days to consider for article publication date.

    Returns:
        None

    Raises:
        TypeError: If ``delta`` is not accepted by ``timedelta(days=...)``.
        ValueError: If a listing page is not valid JSON (from ``json.loads``).
        KeyError: If a listing page lacks the ``data``/``rows`` structure.

        ``ValueError``, ``KeyError`` and ``TypeError`` raised while handling a
        single article are logged and that article is skipped.
    """
    logger = get_run_logger()
    logger.info("cbirc.gov.cn")
    base_url = "https://www.cbirc.gov.cn"
    # The cutoff does not depend on the article, so compute it once.
    cutoff = datetime.today() - timedelta(days=delta)
    # A single failed page is skipped (best effort), but an unreachable site
    # must not keep the task requesting new page numbers forever.
    max_consecutive_failures = 3
    failed_fetches = 0
    reached_cutoff = False
    page = 1
    while not reached_cutoff:
        category_url = (
            f"{base_url}/cn/static/data/DocInfo/SelectDocByItemIdAndChild/"
            f"data_itemId=917,pageIndex={page},pageSize=18.json")
        page += 1
        content = fetch_url(category_url)
        if content is None:
            failed_fetches += 1
            if failed_fetches >= max_consecutive_failures:
                logger.error(
                    "Giving up after %d consecutive failed fetches (last: %s)",
                    failed_fetches, category_url)
                break
            continue
        failed_fetches = 0
        rows = json.loads(content)['data']['rows']
        if not rows:
            # Past the last listing page: nothing more to crawl.
            break
        for article in rows:
            try:
                published = datetime.strptime(
                    article['publishDate'], "%Y-%m-%d %H:%M:%S")
                article['publishDate'] = published.strftime("%Y-%m-%d")
                # Compare on the calendar day only (midnight of publication).
                published_day = published.replace(
                    hour=0, minute=0, second=0, microsecond=0)
                if published_day < cutoff:
                    # Finish this page, then stop requesting further pages.
                    reached_cutoff = True
                    continue
                pdf_url = base_url + article['pdfFileUrl']
                contentcn, summary = extract_from_pdf(pdf_url)
                article['contentCN'] = repr(contentcn)[1:-1].strip()
                if len(contentcn) < 10:
                    # Too little text extracted to be a real article.
                    continue
                # NOTE(review): repr() above escapes real newlines, so this
                # split normally yields a single element and the whole text
                # is translated in one call - confirm whether line-by-line
                # translation of the raw text was intended.
                contenteng = ''.join(
                    translate(element) + '\n'
                    for element in article['contentCN'].split("\n"))
                article['content'] = repr(contenteng)[1:-1].strip()
                article['site'] = "National Financial Regulatory Administration of China"
                article['originSite'] = "国家金融监督管理总局"
                article['titleCN'] = article['docSubtitle']
                article['title'] = translate(article['docSubtitle'])
                article['link'] = pdf_url
                article['category'] = "Policy Interpretation"
                # Name-based UUID derived from title and publication date.
                article['id'] = uuid.uuid5(
                    uuid.NAMESPACE_OID,
                    article['titleCN'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = (
                    sentiment_computation(article['content']))
                article['attachment'] = ''
                article['author'] = ''
                try:
                    article['subtitle'] = summarize(article['content'])
                except (RuntimeError, ValueError):
                    # Fall back to the translated summary from the PDF helper.
                    article['subtitle'] = translate(summary)
                article['referenceid'] = None
                update_content(article)
                vectorize(article)
            except (ValueError, KeyError, TypeError) as error:
                logger.error(error)