# NOTE: removed hosting-page UI residue ("raw / history blame / file size")
# that was captured along with this file and is not valid Python.
"""
This module contains code to scrape the People's Bank of China website and collect policy interpretation articles. It iterates through the pages of the website, extracts relevant information from each article, and stores the data in a database.
The main functionality of this module includes:
- Scraping the website for policy interpretation articles
- Parsing the HTML content of each article
- Extracting relevant information such as title, content, publish date, and URL
- Translating the content from Chinese to English
- Computing sentiment scores for the content
- Storing the collected data in a database
Note: This code assumes the existence of the following helper functions: encode, translate, datemodifier, sentiment_computation, and upsert_content.
"""
import time
import uuid
from datetime import datetime, timedelta
import requests
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
# Crawl the "Policy Interpretation" listing pages until an article older than
# ~183 days is seen, then stop. `i` doubles as the page counter and the loop
# sentinel (-1 means "done paging").
i = 0
while i > -1:
    if i == 0:
        # The first listing page has no numeric suffix.
        CATEGORY_URL = "http://www.pbc.gov.cn/rmyh/3963412/3963426/index.html"
    else:
        # Subsequent pages are index_2.html, index_3.html, ...
        CATEGORY_URL = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{i + 1}.html"
    i = i + 1
    response = requests.get(CATEGORY_URL, timeout=30)
    page = etree.HTML(response.text)
    articlelist = page.xpath("//td[contains(@height, '22')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")
            try:
                # BUG FIX: xpath() returns a *list* of text nodes; the original
                # passed that list straight to time.strptime, which always
                # raised TypeError and (via the bare except) silently skipped
                # every single row. Join the fragments into one "YYYY-MM-DD"
                # string first. The redundant strptime -> strftime -> strptime
                # round-trip is collapsed into a single parse.
                parsed_datetime = datetime.strptime("".join(date).strip(), "%Y-%m-%d")
            except ValueError:
                # Not a date cell (empty or unexpected format) - skip this row.
                continue
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Listings are newest-first; once an article is older than the
                # cutoff, signal the outer while-loop to stop fetching pages.
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = "http://www.pbc.gov.cn" + url
                        response = requests.get(url, timeout=20)
                        response.encoding = 'utf-8'
                        page = etree.HTML(response.text)
                        article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
                        if len(article['originalContent']) < 10:
                            # Too short to be a real article body - skip it.
                            continue
                        # Translate sentence-by-sentence, splitting on the
                        # Chinese full stop.
                        CONTENT_ENG = ''
                        for element in article['originalContent'].split("。"):
                            CONTENT_ENG += translate(element) + ' '
                        article['content'] = CONTENT_ENG
                        article['site'] = "The People's Bank of China"
                        article['originalSite'] = "中国人民银行"
                        article['originalTitle'] = page.xpath("//title/text()")[0]
                        article['title'] = translate(article['originalTitle'])
                        article['url'] = url
                        article['category'] = "Policy Interpretation"
                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
                        # Deterministic id: the same title+date always maps to
                        # the same UUID, so re-runs upsert rather than duplicate.
                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                        upsert_content(article)
                    except Exception as error:
                        # Best-effort per-article scraping: log and move on so
                        # one bad article doesn't abort the whole crawl.
                        print(error)