""" This script is used to crawl and collect data from the website of the China Securities Regulatory Commission (CSRC). It retrieves policy interpretation articles and financial news articles from the CSRC website. The collected data is then processed and stored in a database. The script consists of two main parts: 1. Crawl and process policy interpretation articles from the CSRC website. 2. Crawl and process financial news articles from the CSRC website. The script uses various libraries and functions to handle web scraping, data processing, and database operations. Note: This script assumes the presence of the following dependencies: - urllib - lxml - json - datetime - time - utils (custom module) Please make sure to install these dependencies before running the script. """ import uuid import json import time import urllib.request from datetime import datetime, timedelta from lxml import etree from utils import encode, translate, sentiment_computation, upsert_content, fetch_url, crawl i = 1 while i > -1: if i == 1: CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml" else: CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml" i = i + 1 req = urllib.request.urlopen(CATEGORY_URL) text = req.read() html_text = text.decode("utf-8") page = etree.HTML(html_text) articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li") for article in articlelist: if isinstance(article, etree._Element): subelement = etree.tostring(article).decode() subpage = etree.HTML(subelement) date = encode(subpage.xpath("//span[@class='date']")) parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d") if parsed_datetime < (datetime.today() - timedelta(days=183)): i = -1 else: urls = subpage.xpath("//a/@href") for url in urls: try: article = {} url = "http://www.csrc.gov.cn" + url article['category']= "Policy Interpretation" crawl(url, article) except Exception as error: print(error) i = 1 while i > -1: CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}" i = i + 1 content = fetch_url(CATEGORY_URL) reportinfo = json.loads(content) for article in reportinfo['data']['results']: try: parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d") if parsed_datetime < (datetime.today() - timedelta(days=183)): i = -1 else: article['category']= "Financial News" article['site'] = "Securities Regulatory Commission of China" article['originSite'] = "θ―η›‘δΌš" article['titleCN'] = article['title'] article['title'] = translate(article['titleCN']) article['author'] = '' article['contentCN'] = repr(article['content'])[1:-1].strip() if len(article['contentCN']) < 10: continue CONTENT_ENG = '' for element in article['contentCN'].split("。"): CONTENT_ENG += translate(element) + ' ' article['content'] = repr(CONTENT_ENG)[1:-1].strip() article['subtitle'] = article['memo'] article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")) article['link'] = article['url'] article['attachment'] = "" article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content']) article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate']) upsert_content(article) except Exception as error: print(error)