Spaces:

Oxbridge-Economics
/

Data-Collection-China

Sleeping

App Files Files Community

Data-Collection-China / source /cbirc.py

gavinzli

Remove openai_vectorize calls from crawl function and retain vectorize for article processing

60a0016 4 months ago

raw

history blame contribute delete

3.69 kB

	"""Module to crawl the website 'https://www.cbirc.gov.cn' to fetch and process articles."""
	import json
	import time
	import uuid
	from datetime import datetime, timedelta
	from prefect import task, get_run_logger

	from controllers.summarizer import summarize
	from controllers.utils import (
	extract_from_pdf,
	fetch_url,
	sentiment_computation,
	translate,
	update_content,
	)
	from controllers.vectorizer import vectorize

	@task(name="Data Collection - cbirc", log_prints=True)
	def crawl(delta):
	    """
	    Crawl 'https://www.cbirc.gov.cn' listing pages and process recent articles.

	    Listing pages are requested in order, starting at page 1. For every
	    article whose publication date falls within the last ``delta`` days, the
	    PDF text is extracted, translated, scored for sentiment and summarised,
	    then handed to ``update_content`` and ``vectorize``. Paging stops after
	    the first page that contains an article older than the cut-off, or after
	    several consecutive pages that could not be fetched or had no rows.

	    Args:
	        delta (int): The number of days to consider for article publication date.

	    Returns:
	        None

	    Raises:
	        Nothing explicitly. ValueError, KeyError and TypeError raised while
	        handling a single article are logged and that article is skipped.
	        Errors raised while decoding a listing page, or while building the
	        cut-off from ``delta``, propagate to the caller.
	    """
	    logger = get_run_logger()
	    logger.info("cbirc.gov.cn")
	    base_url = "https://www.cbirc.gov.cn"
	    # Computed once so every article in this run is compared to the same cut-off.
	    cutoff = datetime.today() - timedelta(days=delta)
	    # Bound the number of consecutive unusable pages so the loop terminates
	    # when the site stops returning data before an old article is seen.
	    max_consecutive_misses = 3
	    consecutive_misses = 0
	    reached_cutoff = False
	    page_index = 1
	    while not reached_cutoff and consecutive_misses < max_consecutive_misses:
	        category_url = f"{base_url}/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={page_index},pageSize=18.json"
	        page_index += 1
	        content = fetch_url(category_url)
	        if content is None:
	            consecutive_misses += 1
	            logger.warning("No content returned for %s", category_url)
	            continue
	        reportinfo = json.loads(content)
	        rows = reportinfo['data']['rows']
	        if not rows:
	            consecutive_misses += 1
	            logger.warning("No rows returned for %s", category_url)
	            continue
	        consecutive_misses = 0
	        for article in rows:
	            try:
	                # Normalise the timestamp to a plain date string before comparing.
	                article['publishDate'] = time.strftime(
	                    "%Y-%m-%d",
	                    time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"))
	                parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
	                if parsed_datetime < cutoff:
	                    # Finish the current page, but do not request another one.
	                    reached_cutoff = True
	                    continue
	                pdf_url = base_url + article['pdfFileUrl']
	                contentcn, summary = extract_from_pdf(pdf_url)
	                # repr()[1:-1] stores the text with escape sequences spelled out.
	                article['contentCN'] = repr(contentcn)[1:-1].strip()
	                if len(contentcn) < 10:
	                    # Too little extracted text to be worth processing.
	                    continue
	                # NOTE(review): repr() has already escaped real newlines, so this
	                # split on "\n" normally yields a single element -- confirm
	                # whether per-line translation was intended.
	                contenteng = ''.join(
	                    translate(element) + '\n'
	                    for element in article['contentCN'].split("\n"))
	                article['content'] = repr(contenteng)[1:-1].strip()
	                article['site'] = "National Financial Regulatory Administration of China"
	                article['originSite'] = "国家金融监督管理总局"
	                article['titleCN'] = article['docSubtitle']
	                article['title'] = translate(article['docSubtitle'])
	                article['link'] = pdf_url
	                article['category'] = "Policy Interpretation"
	                # Deterministic id: the same title and date always map to the same UUID.
	                article['id'] = uuid.uuid5(
	                    uuid.NAMESPACE_OID,
	                    article['titleCN'] + article['publishDate'])
	                article['sentimentScore'], article['sentimentLabel'] = (
	                    sentiment_computation(article['content']))
	                article['attachment'] = ''
	                article['author'] = ''
	                try:
	                    article['subtitle'] = summarize(article['content'])
	                except (RuntimeError, ValueError):
	                    # Fall back to the summary returned alongside the PDF text.
	                    article['subtitle'] = translate(summary)
	                article['referenceid'] = None
	                update_content(article)
	                vectorize(article)
	            except (ValueError, KeyError, TypeError) as error:
	                logger.error(error)