"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles.""" import json import time import urllib.request import uuid from datetime import datetime, timedelta from prefect import task, get_run_logger from lxml import etree from controllers.summarizer import summarize from controllers.utils import ( crawl_by_url, encode, fetch_url, sentiment_computation, translate, ) @task(name = "Data Collection - csrc", log_prints = True) def crawl(delta): """ Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta. Args: delta (int): The number of days in the past from today to retrieve articles. Returns: None Raises: None """ logger = get_run_logger() logger.info("csrc.gov.cn") i = 1 while i > -1: try: if i == 1: category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml" else: category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml" i = i + 1 logger.info(f"Fetching from URL: {category_url}") logger.info(category_url) req = urllib.request.Request( category_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} ) response = urllib.request.urlopen(req, timeout=60) text = response.read() html_text = text.decode("utf-8") page = etree.HTML(html_text) articlelist = page.xpath( "//div[contains(@class, 'main-right fr common-list')]/ul/li") for article in articlelist: if isinstance(article, etree._Element): subelement = etree.tostring(article).decode() subpage = etree.HTML(subelement) date = encode(subpage.xpath("//span[@class='date']")) parsed_datetime = datetime.strptime( time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d") if parsed_datetime < (datetime.today() - timedelta(days=delta)): i = -1 else: urls = subpage.xpath("//a/@href") for url in urls: try: article = {} url = "http://www.csrc.gov.cn" + url article['category'] = "Policy Interpretation" logger.info("Processing article URL: %s", url) crawl_by_url(url, article) except (urllib.error.URLError, etree.XMLSyntaxError) as error: logger.error(error) except (urllib.error.URLError, etree.XMLSyntaxError, ValueError, TimeoutError) as error: i = -1 logger.error(error) i = 1 while i > -1: category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}" i = i + 1 try: logger.info(category_url) content = fetch_url(category_url) if content is None: i = -1 else: reportinfo = json.loads(content) if len(reportinfo['data']['results']) == 0: i = -1 logger.info(len(reportinfo['data']['results'])) for article in reportinfo['data']['results']: parsed_datetime = datetime.strptime( time.strftime( "%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d") if parsed_datetime < (datetime.today() - timedelta(days=delta)): i = -1 else: article['category'] = "Financial News" article['site'] = "Securities Regulatory Commission of China" article['originSite'] = "θ―η›‘δΌš" article['titleCN'] = article['title'] article['title'] = translate(article['titleCN']) article['author'] = '' article['contentCN'] = repr(article['content'])[1:-1].strip() if len(article['contentCN']) < 10: continue contenteng = '' for element in article['contentCN'].split("。"): contenteng += translate(element) + ' ' article['content'] = repr(contenteng)[1:-1].strip() try: article['subtitle'] = summarize(article['content']) except (RuntimeError, ValueError): 
    i = 1
    while i > -1:
        category_url = (
            "http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8"
            "?_isAgg=true&_isJson=true&_pageSize=18&_template=index"
            f"&_rangeTimeGte=&_channelName=&page={i}"
        )
        i += 1
        try:
            logger.info(category_url)
            content = fetch_url(category_url)
            if content is None:
                i = -1
            else:
                reportinfo = json.loads(content)
                if len(reportinfo['data']['results']) == 0:
                    i = -1
                logger.info("Results on page: %s", len(reportinfo['data']['results']))
                for article in reportinfo['data']['results']:
                    # Truncate the publish timestamp to midnight before
                    # comparing against the delta window.
                    published = datetime.strptime(
                        article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")
                    parsed_datetime = published.replace(
                        hour=0, minute=0, second=0, microsecond=0)
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        article['category'] = "Financial News"
                        article['site'] = "Securities Regulatory Commission of China"
                        article['originSite'] = "θ―η›‘δΌš"
                        article['titleCN'] = article['title']
                        article['title'] = translate(article['titleCN'])
                        article['author'] = ''
                        # repr(...)[1:-1] escapes control characters and strips
                        # the surrounding quotes.
                        article['contentCN'] = repr(article['content'])[1:-1].strip()
                        if len(article['contentCN']) < 10:
                            continue
                        # Translate sentence by sentence, splitting on the
                        # Chinese full stop.
                        contenteng = ''
                        for element in article['contentCN'].split("。"):
                            contenteng += translate(element) + ' '
                        article['content'] = repr(contenteng)[1:-1].strip()
                        try:
                            article['subtitle'] = summarize(article['content'])
                        except (RuntimeError, ValueError):
                            article['subtitle'] = ""
                        article['publishDate'] = published.strftime("%Y-%m-%d")
                        article['link'] = article['url']
                        article['attachment'] = ""
                        article['sentimentScore'], article[
                            'sentimentLabel'] = sentiment_computation(article['content'])
                        # Deterministic ID so re-crawls yield the same UUID
                        # for the same article.
                        article['id'] = uuid.uuid5(
                            uuid.NAMESPACE_OID,
                            article['titleCN'] + article['publishDate'])
                        logger.info(article)
                        # update_content(article)
        except (ValueError, KeyError, TypeError) as error:
            i = -1
            logger.error(error)
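

# --- Usage sketch (illustrative; not part of the original module) ---
# Prefect tasks need a run context for get_run_logger() to resolve, so the
# task is invoked from inside a flow here. The flow name and the 7-day delta
# are assumptions chosen for demonstration only.
if __name__ == "__main__":
    from prefect import flow

    @flow
    def csrc_demo_flow(delta: int = 7):
        crawl(delta)

    csrc_demo_flow()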