File size: 3,727 Bytes
39fe3d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e47a6a0
57c4050
09317bd
57c4050
 
 
e47a6a0
0fc522e
 
 
57c4050
0fc522e
 
57c4050
ec13f7a
42ba1cc
e47a6a0
42ba1cc
 
 
 
 
 
 
 
 
57c4050
42ba1cc
ec13f7a
 
42ba1cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""
This module contains code to scrape the People's Bank of China website and collect policy interpretation articles. It iterates through the pages of the website, extracts relevant information from each article, and stores the data in a database.

The main functionality of this module includes:
- Scraping the website for policy interpretation articles
- Parsing the HTML content of each article
- Extracting relevant information such as title, content, publish date, and URL
- Translating the content from Chinese to English
- Computing sentiment scores for the content
- Storing the collected data in a database

Note: This code assumes the existence of the following helper functions: encode, translate, datemodifier, sentiment_computation, and upsert_content.

"""

import time
import uuid
from datetime import datetime, timedelta
import requests
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content

# Crawl the PBOC "Policy Interpretation" listing pages newest-first, fetch each
# article, translate/score it, and upsert it into the database.  Paging stops
# once a listed article is older than ~6 months (183 days).
i = 0
while i > -1:
    # First listing page has no index suffix; later pages are index_1.html, index_2.html, ...
    if i == 0:
        CATEGORY_URL = "http://www.pbc.gov.cn/rmyh/3963412/3963426/index.html"
    else:
        CATEGORY_URL = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{i + 1}.html"
    i += 1
    response = requests.get(CATEGORY_URL, timeout=30)
    response.encoding = 'utf-8'  # site serves Chinese text; keep consistent with article fetch below
    page = etree.HTML(response.text)
    articlelist = page.xpath("//td[contains(@height, '22')]")
    for row in articlelist:  # 'row' is the listing <td>; distinct from the 'article' dict built below
        if not isinstance(row, etree._Element):
            continue
        subelement = etree.tostring(row).decode()
        subpage = etree.HTML(subelement)
        date = subpage.xpath("//span/text()")
        # xpath returns a LIST of text nodes; rows without a date span are skipped.
        # (Passing the list itself to strptime was the original bug: it raised
        # TypeError on every row, so the 183-day cutoff was never reached.)
        if not date:
            continue
        try:
            parsed_datetime = datetime.strptime(date[0].strip(), "%Y-%m-%d")
        except ValueError:
            # Malformed/unexpected date text — skip this row.
            continue
        if parsed_datetime < (datetime.today() - timedelta(days=183)):
            i = -1  # article older than ~6 months: stop paging after this listing
        else:
            urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
            for url in urls:
                try:
                    article = {}
                    url = "http://www.pbc.gov.cn" + url  # hrefs on the listing are site-relative
                    response = requests.get(url, timeout=20)
                    response.encoding = 'utf-8'
                    page = etree.HTML(response.text)
                    article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
                    # Skip stubs / empty pages with no meaningful body text.
                    if len(article['originalContent']) < 10:
                        continue
                    # Translate sentence-by-sentence (split on the Chinese full stop).
                    CONTENT_ENG = ''
                    for element in article['originalContent'].split("。"):
                        CONTENT_ENG += translate(element) + ' '
                    article['content'] = CONTENT_ENG
                    article['site'] = "The People's Bank of China"
                    article['originalSite'] = "中国人民银行"
                    article['originalTitle'] = page.xpath("//title/text()")[0]
                    article['title'] = translate(article['originalTitle'])
                    article['url'] = url
                    article['category'] = "Policy Interpretation"
                    # '页面生成时间' = "page generation time" meta tag supplies the publish timestamp.
                    article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
                    # Deterministic id from title+date so re-runs upsert rather than duplicate.
                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                    upsert_content(article)
                except Exception as error:
                    # Best-effort per-article: log the failure and move on to the next URL.
                    print(error)