File size: 3,383 Bytes
57c4050
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import uuid
import time
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, sentiment_computation, upsert_content

# Crawl MOFCOM policy-interpretation listings, category by category, walking
# paginated listing pages until articles older than ~6 months (183 days) are
# reached, then translate, score and upsert each article.
categories = ['jdzhsw', 'jdgnmy', 'jddwmy', 'jdtzhz']
for category in categories:
    page_no = 1
    keep_paging = True
    while keep_paging:
        # The first listing page has no query suffix; later pages use "?<n>".
        if page_no == 1:
            listing_url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
        else:
            listing_url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{page_no}"
        page_no += 1
        # Context manager closes the response (original leaked it); timeout
        # keeps a dead server from hanging the crawl indefinitely.
        with urllib.request.urlopen(listing_url, timeout=30) as resp:
            listing_html = resp.read().decode("utf-8")
        listing_page = etree.HTML(listing_html)
        articlelist = listing_page.xpath(
            "//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
        for item in articlelist:
            if not isinstance(item, etree._Element):
                continue
            # First <span> under the <li> carries the publish timestamp
            # (queried directly — no need to re-serialize and re-parse).
            date_text = item.xpath(".//span/text()")[0]
            # Single parse, truncated to midnight — replaces the original
            # strptime -> strftime -> strptime round trip.
            published = datetime.strptime(
                date_text, "%Y-%m-%d %H:%M:%S").replace(hour=0, minute=0, second=0)
            if published < (datetime.today() - timedelta(days=183)):
                # Too old: stop paging this category after the current page.
                keep_paging = False
                continue
            for url in item.xpath(".//a/@href"):
                try:
                    article = {}
                    if '/article/zcjd' in url:
                        url = "http://www.mofcom.gov.cn" + url
                        article['category'] = "Policy Interpretation"
                    else:
                        article['category'] = "Policy Release"
                    with urllib.request.urlopen(url, timeout=30) as resp:
                        article_html = resp.read().decode("utf-8")
                    article_page = etree.HTML(article_html)
                    article['originalContent'] = encode(article_page.xpath(
                        "//div[contains(@class, 'art-con art-con-bottonmLine')]//p"))
                    # Skip stubs with effectively no body text.
                    if len(article['originalContent']) < 10:
                        continue
                    # Translate sentence by sentence, splitting on the Chinese
                    # full stop; each piece keeps a trailing space (as before).
                    article['content'] = ''.join(
                        translate(sentence) + ' '
                        for sentence in article['originalContent'].split("。"))
                    article['site'] = "Ministry of Commerce"
                    article['originalSite'] = "商务部"
                    article['originalTitle'] = article_page.xpath(
                        "//meta[@name = 'ArticleTitle']/@content")[0]
                    article['title'] = translate(article['originalTitle'])
                    article['url'] = url
                    pub_raw = article_page.xpath(
                        "//meta[@name = 'PubDate']/@content")[0]
                    article['publishDate'] = datetime.strptime(
                        pub_raw, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d")
                    # Deterministic id: same title+date always maps to the
                    # same UUID, so re-runs upsert rather than duplicate.
                    article['id'] = uuid.uuid5(
                        uuid.NAMESPACE_OID,
                        article['title'] + article['publishDate'])
                    article['sentimentScore'], article['sentimentLabel'] = \
                        sentiment_computation(article['content'])
                    upsert_content(article)
                except Exception as error:
                    # Best effort: one bad article must not abort the crawl,
                    # but log which URL failed instead of the bare error.
                    print(f"Failed to process {url}: {error}")