"""
This script is used to crawl and collect data from the National Development and Reform Commission (NDRC) website.
It retrieves articles from the website and categorizes them as either "Policy Release" or "Policy Interpretation".
The script starts by iterating through the pages of the website, starting from the first page.
For each page, it retrieves the HTML content and parses it using lxml library.
It then extracts the article list from the parsed HTML and iterates through each article.
For each article, it extracts the publication date, converts it to a datetime object, and checks if it is within the last 183 days.
If the article is older than 183 days, the script stops iterating through the pages.
Otherwise, it extracts the URL of the article and categorizes it based on the URL pattern.
The script then calls the 'crawl' function from the 'utils' module to crawl the article and collect data.
Any exceptions that occur during the crawling process are caught and printed.
"""

from datetime import datetime, timedelta
import urllib.request

from lxml import etree

from utils import crawl

# Articles older than this cutoff (roughly six months) are not collected.
CUTOFF = datetime.today() - timedelta(days=183)

i = 0
while i > -1:
    # The first listing page is index.html; later pages are index_1.html,
    # index_2.html, and so on.
    if i == 0:
        CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
    else:
        CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
    i += 1
    with urllib.request.urlopen(CATEGORY_URL) as response:
        html_text = response.read().decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        # Dates on the listing page look like "2024/01/31".
        date = item.xpath(".//span/text()")[0]
        parsed_datetime = datetime.strptime(date.strip(), "%Y/%m/%d")
        if parsed_datetime < CUTOFF:
            # Listings are newest-first, so every remaining article is older
            # still; stop paginating.
            i = -1
        else:
            urls = item.xpath(".//a[contains(@target, '_blank')]/@href")
            for url in urls:
                try:
                    article = {}
                    if "www.gov.cn" in url:
                        article['category'] = "Policy Release"
                    elif "../../zcfb/" in url:
                        # Relative link into the policy-release (zcfb) section.
                        url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
                        article['category'] = "Policy Release"
                    else:
                        # Relative link within the interpretation (jd) section.
                        url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                        url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                        article['category'] = "Policy Interpretation"
                    crawl(url, article)
                except Exception as error:
                    print(error)
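# Illustrative examples of the href normalization above (the paths are made
# up for illustration; only the prefixes matter):
#   "../../zcfb/202401/t20240101_0001.html"
#       -> "https://www.ndrc.gov.cn/xxgk/zcfb/202401/t20240101_0001.html"   (Policy Release)
#   "./202401/t20240101_0002.html"
#       -> "https://www.ndrc.gov.cn/xxgk/jd/jd/202401/t20240101_0002.html"  (Policy Interpretation)
#   "https://www.gov.cn/zhengce/..."  -> left unchanged                     (Policy Release)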