from datetime import datetime, timedelta
import urllib.request
from lxml import etree
from utils import crawl
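
# Crawl policy articles from gov.cn category listing pages. Each category is
# paginated (home.htm, home_1.htm, ...) and is walked until an article older
# than 183 days (~six months) is found; every qualifying link is handed to
# the crawl helper from utils, which fetches, translates, and upserts it.

# The commented block below appears to be an earlier in-file version of
# crawl, kept for reference; the live implementation is imported from utils.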
# with open('xpath.json', 'r', encoding='UTF-8') as f:
#     xpath_dict = json.load(f)

# def crawl(url, article):
#     domain = urlparse(url).netloc
#     req = urllib.request.urlopen(url)
#     text = req.read()
#     html_text = text.decode("utf-8")
#     page = etree.HTML(html_text)
#     originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
#     article['originTitle'] = encode(page.xpath(xpath_dict[domain]['title']))
#     article['title'] = translate(article['originTitle'])
#     article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
#     article['originContent'] = repr(originContent)
#     if len(article['originContent']) < 10:
#         return None
#     CONTENT_ENG = ''
#     for element in originContent.split("\n"):
#         CONTENT_ENG += translate(element) + '\n'
#     article['content'] = repr(CONTENT_ENG)
#     article['subtitle'] = translate(summary)
#     article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime']['format_string'])
#     article['link'] = url
#     article['attachment'] = ""
#     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n", ""))
#     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
#     upsert_content(article)
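
# Category 1: Policy Interpretation (/zhengce/jiedu/).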
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    for item in articlelist:
        if isinstance(item, etree._Element):
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Stop paginating once the listing reaches articles older
                # than the 183-day window.
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        # Relative links start with '../'; rewrite them to
                        # absolute URLs under /zhengce/.
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            article['category'] = "Policy Interpretation"
                            crawl(url, article)
                    except Exception as error:
                        print(error)
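
# Category 2: Policy Release (/zhengce/zuixin/), stamped with the source site.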
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    for item in articlelist:
        if isinstance(item, etree._Element):
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Same cutoff as above: stop once articles fall outside 183 days.
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            article['category'] = "Policy Release"
                            article['originSite'] = "国务院"
                            article['site'] = "State Council of China"
                            crawl(url, article)
                    except Exception as error:
                        print(error)
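
# The two loops above differ only in the category path and the metadata
# stamped onto each article. A possible consolidation is sketched below;
# it is not called anywhere, it assumes crawl keeps the signature imported
# from utils, and the helper name crawl_category is hypothetical.
def crawl_category(category_path, category_label, extra_fields=None):
    """Paginate one gov.cn category until articles fall outside 183 days."""
    page_index = 0
    while page_index > -1:
        if page_index == 0:
            category_url = f"https://www.gov.cn/zhengce/{category_path}/home.htm"
        else:
            category_url = f"https://www.gov.cn/zhengce/{category_path}/home_{page_index}.htm"
        page_index += 1
        listing = etree.HTML(urllib.request.urlopen(category_url).read().decode("utf-8"))
        for element in listing.xpath("//div[contains(@class, 'news_box')]//h4"):
            if not isinstance(element, etree._Element):
                continue
            subpage = etree.HTML(etree.tostring(element).decode())
            date = subpage.xpath("//span/text()")[0]
            if datetime.strptime(date, "%Y-%m-%d") < (datetime.today() - timedelta(days=183)):
                page_index = -1  # stop paginating past the 183-day window
                continue
            for url in subpage.xpath("//a[contains(@target, '_blank')]/@href"):
                try:
                    entry = dict(extra_fields or {}, category=category_label)
                    url = url.replace('../', 'https://www.gov.cn/zhengce/')
                    if "https://www.gov.cn" in url:
                        crawl(url, entry)
                except Exception as error:
                    print(error)

# Usage would then reduce to:
# crawl_category("jiedu", "Policy Interpretation")
# crawl_category("zuixin", "Policy Release",
#                {'originSite': "国务院", 'site': "State Council of China"})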