Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

File size: 2,954 Bytes

e47a6a0
57c4050
09317bd
57c4050
 
 
e47a6a0
0fc522e
 
 
57c4050
0fc522e
 
57c4050
ec13f7a
42ba1cc
e47a6a0
42ba1cc
 
 
 
 
 
 
 
 
57c4050
42ba1cc
ec13f7a
 
42ba1cc

import time
import uuid
from datetime import datetime, timedelta
import requests
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content

def _parse_listing_date(texts):
    """Return the first entry of *texts* that parses as ``YYYY-MM-DD``, else None.

    lxml's ``xpath(".../text()")`` always returns a list of strings, so the
    date must be picked out of that list; passing the list itself to
    ``strptime`` raises ``TypeError``.
    """
    for text in texts:
        try:
            return datetime.strptime(text.strip(), "%Y-%m-%d")
        except ValueError:
            continue
    return None


# Crawl the PBC "Policy Interpretation" listing pages, newest first, and store
# every article published within the last 183 days.
CUTOFF = datetime.today() - timedelta(days=183)

i = 0
while i > -1:
    # The first listing page is index.html; later ones are index_2.html,
    # index_3.html, ... i.e. the page number is i + 1.
    if i == 0:
        CATEGORY_URL = "http://www.pbc.gov.cn/rmyh/3963412/3963426/index.html"
    else:
        CATEGORY_URL = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{i + 1}.html"
    i = i + 1
    listing = etree.HTML(requests.get(CATEGORY_URL, timeout=30).text)
    rows = listing.xpath("//td[contains(@height, '22')]") if listing is not None else []
    if not rows:
        # No listing rows at all: we ran past the last page (or received an
        # error page). Stop instead of requesting ever-higher page numbers.
        break
    for row in rows:
        published = _parse_listing_date(row.xpath(".//span/text()"))
        if published is None:
            # Row without a recognisable date (e.g. a layout cell); skip it.
            continue
        if published < CUTOFF:
            # Too old: finish scanning this page, then stop paging.
            i = -1
            continue
        for href in row.xpath(".//a[contains(@target, '_blank')]/@href"):
            # Best-effort per article: a failure on one article is reported
            # and must not abort the rest of the crawl.
            try:
                article = {}
                url = "http://www.pbc.gov.cn" + href
                response = requests.get(url, timeout=20)
                response.encoding = 'utf-8'
                page = etree.HTML(response.text)
                # NOTE(review): encode() presumably flattens the <p> nodes
                # into one text string -- confirm against utils.
                article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
                if len(article['originalContent']) < 10:
                    # Effectively empty body; nothing worth storing.
                    continue
                # Translate sentence by sentence (split on the Chinese full
                # stop); each translated piece is followed by one space.
                article['content'] = ''.join(
                    translate(sentence) + ' '
                    for sentence in article['originalContent'].split("。")
                )
                article['site'] = "The People's Bank of China"
                article['originalSite'] = "中国人民银行"
                article['originalTitle'] = page.xpath("//title/text()")[0]
                article['title'] = translate(article['originalTitle'])
                article['url'] = url
                article['category']= "Policy Interpretation"
                article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
                # Deterministic id derived from title + publish date, so the
                # same article maps to the same id on every run.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                upsert_content(article)
            except Exception as error:
                print(error)