Data-Collection-China / eastmoney.py
"""
This script is used to crawl a webpage and extract relevant information from it. It defines a function `crawl` that takes a URL and a dictionary to store the extracted information. The function crawls the webpage, extracts the content, translates it to English, and stores it in the dictionary.
The script also includes a main loop that fetches data from a specific URL and calls the `crawl` function for each article in the fetched data.
"""
import uuid
import json
import urllib.request
from urllib.parse import urlparse
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content
# Load XPath dictionary from a JSON file
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
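
# xpath.json is expected to map each page domain to the XPaths used for that
# site, keyed as in the lookups below (values here are illustrative only):
# {"data.eastmoney.com": {"content": "...", "attachment": "...", "datetime_format": "..."}}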
def crawl(url, article):
"""
Crawls the given URL and extracts relevant information from the webpage.
Args:
url (str): The URL of the webpage to crawl.
article (dict): A dictionary to store the extracted information.
Returns:
None: If the length of the extracted content is less than 10 characters.
str: The extracted content in English if successful.
Raises:
None
"""
    domain = urlparse(url).netloc
    req = urllib.request.urlopen(url)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url
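    # Use the organization's short name as the site; fall back to the full
    # name when the short-name field holds the literal string "''".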
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['contentCN'] = repr(contentCN)[1:-1].strip()
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"
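    # Treat fewer than 10 characters of extracted Chinese content as a failed extraction.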
    if len(article['contentCN']) < 10:
        return None
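    # Translate the Chinese content to English one line at a time.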
    content_eng = ''
    for element in contentCN.split("\n"):
        content_eng += translate(element) + '\n'
    article['content'] = repr(content_eng)[1:-1].strip()
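    # Deterministic UUIDv5 identifiers (from the author, and from title +
    # publish date) make repeated crawls upsert the same records.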
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n", ""))
    upsert_content(article)
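
# Page through EastMoney's research-report listing API over the past six months
# (183 days) and crawl every article returned, stopping on an empty page.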
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
i = 0
while i > -1:
URL = "https://reportapi.eastmoney.com/report/jg"
params = {
"cb": "datatable8544623",
"pageSize": "100",
"beginTime": beginDate,
"endTime": today,
"pageNo": i,
"qType": "3",
}
URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
print(URL)
    content = fetch_url(URL)
    if content:
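        # The API returns JSONP, e.g. datatable8544623({...}); strip the
        # callback wrapper to get bare JSON.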
        start_index = content.find("(")
        end_index = content.rfind(")")
        if start_index != -1 and end_index != -1:
            result = content[start_index + 1:end_index]
        else:
            result = content
        reportinfo = json.loads(result)
if reportinfo["size"] > 0:
i = i + 1
for article in reportinfo['data']:
try:
url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
crawl(url,article)
except Exception as error:
print(error)
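        # An empty result set means all pages have been consumed; stop paging.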
        else:
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", URL)
        i = -1