# Provenance: commit 39fe3d1 by gavinzli — "chore: Add script descriptions
# and improve code readability" (3.81 kB). Web-viewer header converted to a
# comment so the file is valid Python.
"""
This script is used to crawl and collect financial news and policy interpretation articles from the website of the Ministry of Finance of China (https://www.mof.gov.cn/).
The script iterates through the pages of the "Financial News" and "Policy Interpretation" categories on the website and extracts the articles' URLs. It then calls the `crawl` function from the `utils` module to crawl and collect the article data.
The script uses the `lxml` library to parse the HTML content of the website and extract the necessary information.
Note: The script assumes the existence of a `crawl` function in the `utils` module.
"""
import time
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import crawl
# Crawl Financial News articles.
#
# Listing pages are ordered newest-first. Page 0 lives at the category
# root; page N at index_{N}.htm. As soon as an article older than 183
# days (~6 months) is seen, the sentinel `i = -1` ends the pagination
# loop once the current page has been processed.
i = 0
while i > -1:
    if i == 0:
        category_url = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
    else:
        category_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
    i = i + 1
    print(category_url)
    # Fetch the listing page; the context manager closes the connection.
    with urllib.request.urlopen(category_url) as response:
        html_text = response.read().decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            # Parse the YYYY-MM-DD listing date directly; the original
            # round-trip through time.strptime/strftime was redundant.
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # too old: stop paginating after this page
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        # Resolve the site's relative hrefs against the
                        # appropriate base. '../' must be handled before
                        # './' so the longer prefix wins.
                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                        print(url)
                        # Fresh dict per article; don't shadow the loop
                        # element named `article`.
                        article_data = {}
                        article_data['category'] = "Financial News"
                        crawl(url, article_data)
                    except Exception as error:
                        # Best-effort: a failed article must not abort the run.
                        print(error)
# Crawl Policy Interpretation articles.
#
# Same pagination scheme as the Financial News section: page 0 at the
# category root, page N at index_{N}.htm; `i = -1` stops the loop once
# an article older than 183 days is encountered.
i = 0
while i > -1:
    if i == 0:
        category_url = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
    else:
        category_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
    i = i + 1
    print(category_url)
    # Fetch the listing page; the context manager closes the connection.
    with urllib.request.urlopen(category_url) as response:
        html_text = response.read().decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            # Parse the YYYY-MM-DD listing date directly; the original
            # round-trip through time.strptime/strftime was redundant.
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # too old: stop paginating after this page
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        # BUG FIX: the original replaced './' with the full
                        # page URL (e.g. .../index_1.htm), producing broken
                        # links on every page after the first. Resolve
                        # against the category base directory instead,
                        # handling '../' first as in the Financial News
                        # section.
                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/")
                        print(url)
                        # Fresh dict per article; don't shadow the loop
                        # element named `article`.
                        article_data = {}
                        article_data['category'] = "Policy Interpretation"
                        crawl(url, article_data)
                    except Exception as error:
                        # Best-effort: a failed article must not abort the run.
                        print(error)