"""Module to crawl the website 'https://www.cbirc.gov.cn' to fetch and process articles.""" import json import time import uuid from datetime import datetime, timedelta from prefect import task, get_run_logger from controllers.summarizer import summarize from controllers.utils import ( extract_from_pdf, fetch_url, sentiment_computation, translate, update_content, ) from controllers.vectorizer import vectorize @task(name = "Data Collection - cbirc", log_prints = True) def crawl(delta): """ Crawls the website 'https://www.cbirc.gov.cn' to fetch and process articles. Args: delta (int): The number of days to consider for article publication date. Returns: None Raises: None """ logger = get_run_logger() logger.info("cbirc.gov.cn") i = 1 while i > -1: category_url= f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json" i = i + 1 content = fetch_url(category_url) if content is not None: reportinfo = json.loads(content) for article in reportinfo['data']['rows']: try: article['publishDate'] = time.strftime( "%Y-%m-%d", time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S")) parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d") if parsed_datetime < (datetime.today() - timedelta(days=delta)): i = -1 else: contentcn, summary = extract_from_pdf( "https://www.cbirc.gov.cn" + article['pdfFileUrl']) article['contentCN'] = repr(contentcn)[1:-1].strip() if len(contentcn) < 10: continue contenteng = '' for element in article['contentCN'].split("\n"): contenteng += translate(element) + '\n' article['content'] = repr(contenteng)[1:-1].strip() article[ 'site'] = "National Financial Regulatory Administration of China" article['originSite'] = "国家金融监督管理总局" article['titleCN'] = article['docSubtitle'] article['title'] = translate(article['docSubtitle']) article['link'] = "https://www.cbirc.gov.cn" + str( article['pdfFileUrl']) article['category'] = "Policy Interpretation" article['id'] = uuid.uuid5( uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate']) article['sentimentScore'], article[ 'sentimentLabel'] = sentiment_computation(article['content']) article['attachment'] = '' article['author'] = '' try: article['subtitle'] = summarize(article['content']) except (RuntimeError, ValueError): article['subtitle'] = translate(summary) article['referenceid'] = None update_content(article) vectorize(article) except (ValueError, KeyError, TypeError) as error: logger.error(error)