"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles.""" import json import logging import time import urllib.request import uuid from datetime import datetime, timedelta from prefect import flow from lxml import etree from controllers.summarizer import summarize from controllers.utils import ( crawl_by_url, encode, fetch_url, sentiment_computation, translate, update_content, ) @flow(name = "Data Collection - csrc") def crawl(delta): """ Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta. Args: delta (int): The number of days in the past from today to retrieve articles. Returns: None Raises: None """ logging.info("csrc.gov.cn") i = 1 while i > -1: try: if i == 1: category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml" else: category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml" i = i + 1 logging.info(category_url) req = urllib.request.urlopen(category_url) text = req.read() html_text = text.decode("utf-8") page = etree.HTML(html_text) articlelist = page.xpath( "//div[contains(@class, 'main-right fr common-list')]/ul/li") for article in articlelist: if isinstance(article, etree._Element): subelement = etree.tostring(article).decode() subpage = etree.HTML(subelement) date = encode(subpage.xpath("//span[@class='date']")) parsed_datetime = datetime.strptime( time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d") if parsed_datetime < (datetime.today() - timedelta(days=delta)): i = -1 else: urls = subpage.xpath("//a/@href") for url in urls: try: article = {} url = "http://www.csrc.gov.cn" + url article['category'] = "Policy Interpretation" logging.info(url) crawl_by_url(url, article) except Exception as error: logging.error(error) except Exception as error: i = -1 logging.error(error) i = 1 while i > -1: category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}" i = i + 1 try: logging.info(category_url) content = fetch_url(category_url) if content is None: i = -1 else: reportinfo = json.loads(content) if len(reportinfo['data']['results']) == 0: i = -1 logging.info(len(reportinfo['data']['results'])) for article in reportinfo['data']['results']: parsed_datetime = datetime.strptime( time.strftime( "%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d") if parsed_datetime < (datetime.today() - timedelta(days=delta)): i = -1 else: article['category'] = "Financial News" article['site'] = "Securities Regulatory Commission of China" article['originSite'] = "θ―η›‘δΌš" article['titleCN'] = article['title'] article['title'] = translate(article['titleCN']) article['author'] = '' article['contentCN'] = repr(article['content'])[1:-1].strip() if len(article['contentCN']) < 10: continue contenteng = '' for element in article['contentCN'].split("。"): contenteng += translate(element) + ' ' article['content'] = repr(contenteng)[1:-1].strip() article['subtitle'] = summarize(article['content']) article['publishDate'] = time.strftime( "%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")) article['link'] = article['url'] article['attachment'] = "" article['sentimentScore'], article[ 'sentimentLabel'] = sentiment_computation(article['content']) article['id'] = uuid.uuid5( uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate']) logging.info(article) # update_content(article) except Exception as error: i = -1 
logging.error(error)
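

# A minimal local usage sketch, assuming the flow is invoked directly rather
# than through a deployed Prefect schedule (a Prefect @flow remains callable
# as a plain function). The delta value of 7 is illustrative.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Fetch and process articles published within the last 7 days.
    crawl(7)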