"""
This script is used to crawl and collect data from the National Development and Reform Commission (NDRC) website.
It retrieves articles from the website and categorizes them as either "Policy Release" or "Policy Interpretation".
The script starts by iterating through the pages of the website, starting from the first page.
For each page, it retrieves the HTML content and parses it using lxml library.
It then extracts the article list from the parsed HTML and iterates through each article.
For each article, it extracts the publication date, converts it to a datetime object, and checks if it is within the last 183 days.
If the article is older than 183 days, the script stops iterating through the pages.
Otherwise, it extracts the URL of the article and categorizes it based on the URL pattern.
The script then calls the 'crawl' function from the 'utils' module to crawl the article and collect data.
Any exceptions that occur during the crawling process are caught and printed.
"""
import urllib.request
from datetime import datetime, timedelta

from lxml import etree

from utils import crawl
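# The NDRC listing is paginated: the first page is index.html and subsequent
# pages are index_1.html, index_2.html, and so on. The loop walks these pages
# until it encounters an article older than the collection window, at which
# point i is set to -1 and the while loop exits.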
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
    else:
        CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
    i += 1
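    # Download the listing page and parse the HTML into an element tree.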
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
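    # Each non-empty <li> is one article entry: a link plus a <span> holding
    # the publication date in YYYY/MM/DD form.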
    for article in articlelist:
        if isinstance(article, etree._Element):
            # Re-serialize the <li> fragment and parse it on its own so the
            # XPath queries below are scoped to this single entry.
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y/%m/%d")
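            # Keep only articles published within the last 183 days (roughly
            # six months); the first older article ends the crawl.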
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Article is outside the window: set i to -1 so the while
                # loop stops requesting further pages.
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
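                    # Rewrite relative hrefs to absolute URLs and classify
                    # each link by where it points.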
                    try:
                        # Use a fresh dict rather than reusing the loop
                        # variable 'article', which holds the <li> element.
                        article_data = {}
                        if "www.gov.cn" in url:
                            article_data['category'] = "Policy Release"
                        elif "../../zcfb/" in url:
                            url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
                            article_data['category'] = "Policy Release"
                        else:
                            url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                            url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                            article_data['category'] = "Policy Interpretation"
                        crawl(url, article_data)
                    except Exception as error:
                        print(error)