# (extraction metadata, not part of the script: file size 2,117 bytes;
#  revisions 39fe3d1 / 57c4050 / 42ba1cc)
"""
This script is used to crawl and collect data from the Ministry of Commerce of the People's Republic of China (MOFCOM) website.
It retrieves articles from different categories and extracts relevant information such as date and URL.
The collected data is then passed to the 'crawl' function for further processing.
"""
import time
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import crawl
# Category slugs under /article/zcjd/ on the MOFCOM site.
categories = ['jdzhsw', 'jdgnmy', 'jddwmy', 'jdtzhz']

# Articles published before this cutoff (~6 months back) end pagination
# for a category.  Computed once per run.
_CUTOFF = datetime.today() - timedelta(days=183)


def _fetch_page(url):
    """Download *url* and return the document parsed as an lxml HTML tree."""
    # Context manager closes the HTTP response promptly instead of leaking it.
    with urllib.request.urlopen(url) as resp:
        return etree.HTML(resp.read().decode("utf-8"))


def _crawl_category(category):
    """Walk the paginated listing for one MOFCOM *category*.

    Each article URL is labelled with its category ("Policy Interpretation"
    for links inside /article/zcjd, "Policy Release" otherwise) and handed
    to utils.crawl.  Pagination stops as soon as an article older than
    ``_CUTOFF`` is encountered (listings are newest-first — TODO confirm).
    """
    page_no = 1
    while True:
        # Page 1 has no query string; later pages are addressed as "?2", "?3", ...
        if page_no == 1:
            listing_url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
        else:
            listing_url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{page_no}"
        page_no += 1

        page = _fetch_page(listing_url)
        items = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
        for item in items:
            if not isinstance(item, etree._Element):
                continue
            # First <span> inside the list item holds the publish timestamp,
            # e.g. "2023-05-01 10:30:00".
            date_text = item.xpath(".//span/text()")[0]
            # Truncate to midnight so the comparison matches date granularity.
            published = datetime.strptime(date_text, "%Y-%m-%d %H:%M:%S").replace(
                hour=0, minute=0, second=0, microsecond=0
            )
            if published < _CUTOFF:
                # Everything further back is older still — stop this category.
                return
            for url in item.xpath(".//a/@href"):
                try:
                    meta = {}
                    if '/article/zcjd' in url:
                        # Site-relative link into the interpretation section.
                        url = "http://www.mofcom.gov.cn" + url
                        meta['category'] = "Policy Interpretation"
                    else:
                        meta['category'] = "Policy Release"
                    crawl(url, meta)
                except Exception as error:
                    # Best-effort: log the failure and keep crawling.
                    print(error)


for category in categories:
    _crawl_category(category)