File size: 2,387 Bytes
e47a6a0
57c4050
09317bd
57c4050
 
 
e47a6a0
0fc522e
 
 
57c4050
0fc522e
 
57c4050
ec13f7a
57c4050
e47a6a0
 
 
 
 
 
 
57c4050
e47a6a0
 
 
57c4050
 
 
e47a6a0
57c4050
 
e47a6a0
 
 
57c4050
e47a6a0
 
57c4050
e47a6a0
9176677
ec13f7a
 
 
57c4050
ec13f7a
e47a6a0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import time
import uuid
from datetime import datetime, timedelta
import requests
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content

# Crawler loop: walks the PBoC "Policy Interpretation" category listing
# page by page, scrapes each linked article, translates it, and upserts
# it into storage. Pagination stops once an article older than the
# cutoff (~6 months / 183 days) is seen on a listing page.

BASE_URL = "http://www.pbc.gov.cn"
CATEGORY_BASE = "http://www.pbc.gov.cn/rmyh/3963412/3963426"
# Articles published before this moment end the crawl.
CUTOFF = datetime.today() - timedelta(days=183)

page_number = 0
crawling = True
while crawling:
    # First listing page is index.html; later pages are index_2.html, index_3.html, ...
    if page_number == 0:
        category_url = f"{CATEGORY_BASE}/index.html"
    else:
        category_url = f"{CATEGORY_BASE}/index_{page_number + 1}.html"
    page_number += 1

    listing_response = requests.get(category_url, timeout=20)
    listing = etree.HTML(listing_response.text)
    hrefs = listing.xpath("//td[contains(@height,'22')]//a[contains(@target, '_blank')]/@href")
    # Keep only site-relative article links; drops external/other-section hrefs.
    hrefs = [href for href in hrefs if href.startswith("/rmyh/")]

    for href in hrefs:
        try:
            url = BASE_URL + href
            response = requests.get(url, timeout=20)
            response.encoding = 'utf-8'
            page = etree.HTML(response.text)

            article = {}
            article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
            # Skip stubs / pages whose body text failed to extract.
            if len(article['originalContent']) < 10:
                continue

            # Translate sentence by sentence (split on the Chinese full stop);
            # each translated segment is followed by a single space, matching
            # the original accumulation format.
            translated = ''
            for sentence in article['originalContent'].split("。"):
                translated += translate(sentence) + ' '
            article['content'] = translated

            article['site'] = "The People's Bank of China"
            article['originalSite'] = "中国人民银行"
            article['originalTitle'] = page.xpath("//title/text()")[0]
            article['title'] = translate(article['originalTitle'])
            article['url'] = url
            article['category'] = "Policy Interpretation"
            article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")

            # `datemodifier` is expected to yield a "%Y-%m-%d" string (the
            # original parsed it with exactly that format); parse it directly
            # instead of round-tripping through time.strptime/strftime.
            published = datetime.strptime(article['publishDate'], "%Y-%m-%d")
            if published < CUTOFF:
                # Too old: finish the remaining links on this listing page,
                # then stop paginating (same behavior as the old i = -1 sentinel).
                crawling = False
            else:
                # Deterministic id (title + publish date) so re-runs update
                # existing rows instead of duplicating them.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                upsert_content(article)
        except Exception as error:
            # Best-effort crawl: log the failure and move on to the next article.
            print(error)