"""Scrape policy interpretation articles from the People's Bank of China website.

This module iterates through the listing pages of the site, extracts the
relevant information from each article, and stores the data in a database.

The main functionality of this module includes:

- Scraping the website for policy interpretation articles
- Parsing the HTML content of each article
- Extracting relevant information such as title, content, publish date, and URL
- Translating the content from Chinese to English
- Computing sentiment scores for the content
- Storing the collected data in a database

Note: This code assumes the existence of the following helper functions:
encode, translate, datemodifier, sentiment_computation, and upsert_content.
"""

import uuid
from datetime import datetime, timedelta

import requests
from lxml import etree

from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
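
# The helpers imported from utils are assumed to behave roughly as follows.
# These are hypothetical signatures inferred from the call sites below, not
# the actual utils implementation:
#   encode(elements) -> str                  join the text of matched <p> nodes
#   translate(text) -> str                   Chinese-to-English translation
#   datemodifier(raw, fmt) -> str            normalise a timestamp parsed with fmt
#   sentiment_computation(text) -> (score, label)
#   upsert_content(article)                  insert or update the article record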

i = 0
# Keep paginating until a listing entry older than the cutoff sets i to -1 below.
while i > -1:
    # Page 1 of the listing is index.html; later pages follow the site's
    # index_{n}.html naming.
    if i == 0:
        CATEGORY_URL = "http://www.pbc.gov.cn/rmyh/3963412/3963426/index.html"
    else:
        j = i + 1
        CATEGORY_URL = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{j}.html"
    i = i + 1
    response = requests.get(CATEGORY_URL, timeout=30)
    page = etree.HTML(response.text)
    articlelist = page.xpath("//td[contains(@height, '22')]")
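    # Each matched <td height="22"> cell is expected to hold one listing row:
    # an <a target="_blank"> link to the article plus a <span> with its date.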
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")
            try:
                # xpath() returns a list of strings; parse the first match.
                parsed_datetime = datetime.strptime(date[0].strip(), "%Y-%m-%d")
            except (IndexError, ValueError):
                continue
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Entry is older than roughly six months: stop paginating.
                i = -1
            else:
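                # The entry is recent: visit every article link in the row.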
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        item = {}
                        url = "http://www.pbc.gov.cn" + url
                        response = requests.get(url, timeout=20)
                        response.encoding = 'utf-8'
                        page = etree.HTML(response.text)
                        item['originalContent'] = encode(page.xpath(
                            "//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
                        # Skip pages whose extracted body is too short to be a real article.
                        if len(item['originalContent']) < 10:
                            continue
                        # Translate sentence by sentence, splitting on the Chinese full stop.
                        CONTENT_ENG = ''
                        for element in item['originalContent'].split("。"):
                            CONTENT_ENG += translate(element) + ' '
                        item['content'] = CONTENT_ENG
                        item['site'] = "The People's Bank of China"
                        item['originalSite'] = "中国人民银行"
                        item['originalTitle'] = page.xpath("//title/text()")[0]
                        item['title'] = translate(item['originalTitle'])
                        item['url'] = url
                        item['category'] = "Policy Interpretation"
                        # '页面生成时间' is the site's "page generation time" meta tag.
                        item['publishDate'] = datemodifier(
                            page.xpath("//meta[@name = '页面生成时间']/@content")[0],
                            "%Y-%m-%d %H:%M:%S")
                        # Deterministic ID from title + publish date, so re-runs update
                        # rather than duplicate a record (cast to str for storage).
                        item['id'] = str(uuid.uuid5(uuid.NAMESPACE_OID,
                                                    item['title'] + item['publishDate']))
                        item['sentimentScore'], item['sentimentLabel'] = sentiment_computation(item['content'])
                        upsert_content(item)
                    except Exception as error:
                        print(error)
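
# For reference, each record passed to upsert_content carries the fields built
# above: originalContent, content, site, originalSite, originalTitle, title,
# url, category, publishDate, id, sentimentScore, and sentimentLabel. How they
# map onto the database schema is handled by the assumed upsert_content helper.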
|