# (extraction metadata, not part of the script: file size 2,117 bytes;
#  revisions 39fe3d1 / 57c4050 / 42ba1cc)
"""
This script is used to crawl and collect data from the Ministry of Commerce of the People's Republic of China (MOFCOM) website.
It retrieves articles from different categories and extracts relevant information such as date and URL.
The collected data is then passed to the 'crawl' function for further processing.
"""
import time
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import crawl
# Category slugs under /article/zcjd/ on the MOFCOM site.
categories = ['jdzhsw', 'jdgnmy', 'jddwmy', 'jdtzhz']

# Articles published before this cutoff (~6 months back) end pagination
# for a category.  Computed once per run.
_CUTOFF = datetime.today() - timedelta(days=183)


def _fetch_page(url):
    """Download *url* and return the document parsed as an lxml HTML tree."""
    # Context manager closes the HTTP response promptly instead of leaking it.
    with urllib.request.urlopen(url) as resp:
        return etree.HTML(resp.read().decode("utf-8"))


def _crawl_category(category):
    """Walk the paginated listing for one MOFCOM *category*.

    Each article URL is labelled with its category ("Policy Interpretation"
    for links inside /article/zcjd, "Policy Release" otherwise) and handed
    to utils.crawl.  Pagination stops as soon as an article older than
    ``_CUTOFF`` is encountered (listings are newest-first — TODO confirm).
    """
    page_no = 1
    while True:
        # Page 1 has no query string; later pages are addressed as "?2", "?3", ...
        if page_no == 1:
            listing_url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
        else:
            listing_url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{page_no}"
        page_no += 1

        page = _fetch_page(listing_url)
        items = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
        for item in items:
            if not isinstance(item, etree._Element):
                continue
            # First <span> inside the list item holds the publish timestamp,
            # e.g. "2023-05-01 10:30:00".
            date_text = item.xpath(".//span/text()")[0]
            # Truncate to midnight so the comparison matches date granularity.
            published = datetime.strptime(date_text, "%Y-%m-%d %H:%M:%S").replace(
                hour=0, minute=0, second=0, microsecond=0
            )
            if published < _CUTOFF:
                # Everything further back is older still — stop this category.
                return
            for url in item.xpath(".//a/@href"):
                try:
                    meta = {}
                    if '/article/zcjd' in url:
                        # Site-relative link into the interpretation section.
                        url = "http://www.mofcom.gov.cn" + url
                        meta['category'] = "Policy Interpretation"
                    else:
                        meta['category'] = "Policy Release"
                    crawl(url, meta)
                except Exception as error:
                    # Best-effort: log the failure and keep crawling.
                    print(error)


for category in categories:
    _crawl_category(category)