"""Crawl recent articles from the State Administration of Foreign Exchange.

This module crawls and collects data from the website of the State
Administration of Foreign Exchange (SAFE) of China. It covers two sections
of the site: Policy Interpretation and Data Interpretation.

Policy Interpretation:
    - Walks the paginated article listing under ``/safe/zcfgjd/``.
    - Reads each entry's publication date and checks whether it falls
      within the last 183 days.
    - For entries inside that window, builds the absolute article URL and
      passes it, together with a dictionary holding the category, to the
      ``crawl`` function for further processing.

Data Interpretation:
    - Same procedure, applied to the listing under ``/safe/sjjd/``.

Pagination for a section stops as soon as a listing page contains an entry
older than the window, contains no entries at all, or (past the first page)
does not exist.

Note:
    The ``crawl`` function is imported from the ``utils`` module.
"""
import time  # noqa: F401  (no longer used below; kept so no existing import is dropped)
import urllib.error
import urllib.request
from datetime import datetime, timedelta
from typing import Optional

from lxml import etree

from utils import crawl

BASE_URL = "https://www.safe.gov.cn"
MAX_AGE_DAYS = 183
DATE_FORMAT = "%Y-%m-%d"
ENTRY_XPATH = "//div[contains(@class, 'list_conr')]/ul/li"


def _listing_url(section: str, page_number: int) -> str:
    """Return the URL of listing page *page_number* (1-based) of *section*.

    The first page is ``index.html``; later pages are ``index_<n>.html``.
    """
    if page_number == 1:
        return f"{BASE_URL}/safe/{section}/index.html"
    return f"{BASE_URL}/safe/{section}/index_{page_number}.html"


def _publication_date(entry) -> Optional[datetime]:
    """Return the publication date found in a listing entry, or ``None``.

    Args:
        entry: The lxml element of one ``<li>`` of the listing.

    Returns:
        The date parsed from the text of the first ``<dd>`` inside the
        entry, or ``None`` when the entry has no such text or the text is
        not a ``YYYY-MM-DD`` date. Both problems are reported on stdout.
    """
    date_texts = entry.xpath(".//dd/text()")
    if not date_texts:
        print("Skipping listing entry without a publication date")
        return None
    try:
        return datetime.strptime(date_texts[0].strip(), DATE_FORMAT)
    except ValueError as error:
        print(error)
        return None


def _crawl_category(section: str, category: str) -> None:
    """Crawl every recent article listed in one section of the SAFE site.

    Args:
        section: Path segment of the section under ``/safe/`` (for example
            ``"zcfgjd"``).
        category: Label stored under the ``'category'`` key of the
            dictionary handed to ``crawl`` for each article.

    Raises:
        urllib.error.URLError: If a listing page cannot be fetched, except
            for a 404 on a page after the first, which is treated as the
            end of the listing.
    """
    cutoff = datetime.today() - timedelta(days=MAX_AGE_DAYS)
    page_number = 1
    while True:
        listing_url = _listing_url(section, page_number)
        try:
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(listing_url) as response:
                html_text = response.read().decode("utf-8")
        except urllib.error.HTTPError as error:
            # Running past the last listing page is the normal way to reach
            # the end when every listed article is still inside the window.
            if error.code == 404 and page_number > 1:
                break
            raise

        entries = etree.HTML(html_text).xpath(ENTRY_XPATH)
        if not entries:
            # A page without entries gives no date to stop on; without this
            # guard the pagination would never terminate.
            break

        reached_cutoff = False
        for entry in entries:
            published = _publication_date(entry)
            if published is None:
                continue
            if published < cutoff:
                # Finish this page (remaining entries may still be recent)
                # but do not request any further pages.
                reached_cutoff = True
                continue
            for href in entry.xpath(".//a/@href"):
                try:
                    crawl(BASE_URL + href, {"category": category})
                except Exception as error:  # best effort: one bad article must not stop the crawl
                    print(error)

        if reached_cutoff:
            break
        page_number += 1


# The crawl runs at module level, exactly as before, so that executing or
# importing this module triggers it.
_crawl_category("zcfgjd", "Policy Interpretation")
_crawl_category("sjjd", "Data Interpretation")