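"""Crawl Eastmoney macro research reports, translate them to English,
compute a sentiment score, and upsert each record via utils.upsert_content."""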
import uuid
import json
import urllib.request
from urllib.parse import urlparse
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content

# Per-domain XPath configuration: which nodes hold the report body, the
# attachment link, and how the publish date is formatted.
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
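# Illustrative shape of one xpath.json entry. The keys are inferred from the
# lookups in crawl() below; the XPath strings here are placeholders, not the
# real expressions used in production:
# {
#   "data.eastmoney.com": {
#     "content": "//div[@class='report-body']//p/text()",
#     "attachment": "//a[contains(@href, '.pdf')]/@href",
#     "datetime": {"format_string": "%Y-%m-%d %H:%M:%S"}
#   }
# }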
def crawl(url, article):
    """Download one report page, fill in the article record, and upsert it."""
    domain = urlparse(url).netloc
    req = urllib.request.urlopen(url)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)

    # Extract the report body and a summary using the site-specific XPaths.
    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url

    # Prefer the short institution name; fall back to the full name when the
    # API returns the empty placeholder "''".
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])

    article['originTitle'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['originContent'] = repr(originContent)
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"

    # Skip reports whose extracted body is essentially empty.
    if len(article['originContent']) < 10:
        return None

    # Translate the body paragraph by paragraph.
    content_eng = ''
    for element in originContent.split("\n"):
        content_eng += translate(element) + '\n'
    article['content'] = repr(content_eng)

    # Deterministic IDs derived from the author and from title + publish date,
    # plus a sentiment score over the untranslated body.
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime']['format_string'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n", ""))

    upsert_content(article)
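# Page through the Eastmoney report API (qType=3, macro research) for roughly
# the last six months and crawl every report it returns.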
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')

i = 0
while i > -1:
    URL = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(URL)
    content = fetch_url(URL)
    if content:
        # The response is JSONP ("datatable8544623({...})"); strip the callback
        # wrapper before parsing the JSON payload.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1:-1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            # This page has reports: crawl each one, then move to the next page.
            i = i + 1
            for article in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                    crawl(url, article)
                except Exception as error:
                    print(error)
        else:
            # Empty page: we have paged past the last result, stop the loop.
            print(reportinfo)
            i = -1
    else:
        # Fetch failed; the same page number is requested again on the next pass.
        print("Failed to fetch URL:", URL)