# NOTE: removed hosting-page UI residue ("raw / history blame / file size")
# that was captured along with this file and is not valid Python.
"""
This module contains code to scrape the People's Bank of China website and collect policy interpretation articles. It iterates through the pages of the website, extracts relevant information from each article, and stores the data in a database.
The main functionality of this module includes:
- Scraping the website for policy interpretation articles
- Parsing the HTML content of each article
- Extracting relevant information such as title, content, publish date, and URL
- Translating the content from Chinese to English
- Computing sentiment scores for the content
- Storing the collected data in a database
Note: This code assumes the existence of the following helper functions: encode, translate, datemodifier, sentiment_computation, and upsert_content.
"""
import time
import uuid
from datetime import datetime, timedelta
import requests
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
# Crawl the "Policy Interpretation" listing pages until an article older than
# ~183 days is seen, then stop. `i` doubles as the page counter and the loop
# sentinel (-1 means "done paging").
i = 0
while i > -1:
    if i == 0:
        # The first listing page has no numeric suffix.
        CATEGORY_URL = "http://www.pbc.gov.cn/rmyh/3963412/3963426/index.html"
    else:
        # Subsequent pages are index_2.html, index_3.html, ...
        CATEGORY_URL = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{i + 1}.html"
    i = i + 1
    response = requests.get(CATEGORY_URL, timeout=30)
    page = etree.HTML(response.text)
    articlelist = page.xpath("//td[contains(@height, '22')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")
            try:
                # BUG FIX: xpath() returns a *list* of text nodes; the original
                # passed that list straight to time.strptime, which always
                # raised TypeError and (via the bare except) silently skipped
                # every single row. Join the fragments into one "YYYY-MM-DD"
                # string first. The redundant strptime -> strftime -> strptime
                # round-trip is collapsed into a single parse.
                parsed_datetime = datetime.strptime("".join(date).strip(), "%Y-%m-%d")
            except ValueError:
                # Not a date cell (empty or unexpected format) - skip this row.
                continue
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Listings are newest-first; once an article is older than the
                # cutoff, signal the outer while-loop to stop fetching pages.
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = "http://www.pbc.gov.cn" + url
                        response = requests.get(url, timeout=20)
                        response.encoding = 'utf-8'
                        page = etree.HTML(response.text)
                        article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
                        if len(article['originalContent']) < 10:
                            # Too short to be a real article body - skip it.
                            continue
                        # Translate sentence-by-sentence, splitting on the
                        # Chinese full stop.
                        CONTENT_ENG = ''
                        for element in article['originalContent'].split("。"):
                            CONTENT_ENG += translate(element) + ' '
                        article['content'] = CONTENT_ENG
                        article['site'] = "The People's Bank of China"
                        article['originalSite'] = "中国人民银行"
                        article['originalTitle'] = page.xpath("//title/text()")[0]
                        article['title'] = translate(article['originalTitle'])
                        article['url'] = url
                        article['category'] = "Policy Interpretation"
                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
                        # Deterministic id: the same title+date always maps to
                        # the same UUID, so re-runs upsert rather than duplicate.
                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                        upsert_content(article)
                    except Exception as error:
                        # Best-effort per-article scraping: log and move on so
                        # one bad article doesn't abort the whole crawl.
                        print(error)