"""Eastmoney macroeconomic research-report crawler.

Part of OxbridgeEconomics / Data-Collection-China (eastmoney.py).
"""
import uuid
import json
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url
# Crawl Eastmoney macro-research reports from the last ~6 months, page by
# page; for each article: scrape the body, translate it, compute sentiment,
# and upsert the enriched record via `upsert_content`.
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')

i = 0  # pageNo for the report-list API; set to -1 (or break) to stop
while i > -1:
    URL = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",  # JSONP callback name; wrapper stripped below
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(URL)
    listing = fetch_url(URL)
    if not listing:
        # BUG FIX: the old code printed the stale article `url` (NameError on
        # the very first iteration) and left `i` unchanged, retrying the same
        # page forever. Report the list URL and stop instead.
        print("Failed to fetch URL:", URL)
        break
    # Strip the JSONP wrapper `datatable8544623(...)` to obtain raw JSON.
    start_index = listing.find("(")
    result = listing[start_index + 1:-1] if start_index != -1 else listing
    reportinfo = json.loads(result)
    if reportinfo["size"] <= 0:
        # No more results: dump the final payload for inspection and stop.
        print(reportinfo)
        break
    i += 1
    for article in reportinfo['data']:
        try:
            url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
            # BUG FIX: close the HTTP response deterministically (it was
            # previously leaked — urlopen() was never closed).
            with urllib.request.urlopen(url) as response:
                html_text = response.read().decode("utf-8")
            page = etree.HTML(html_text)
            # Renamed from `content` to stop shadowing the list-API response.
            article_body = encode(page.xpath("//div[contains(@class, 'ctx-content')]//p"))
            reporturl = encode(page.xpath("//a[contains(@class, 'pdf-link')]/@href"))
            article['url'] = url
            # NOTE(review): the API appears to return the literal string "''"
            # when the org short name is absent — confirm against live data.
            if article['orgSName'] == "''":
                article['site'] = translate(article['orgSName'])
            else:
                article['site'] = translate(article['orgName'])
            article['originalSite'] = article['orgSName']
            article['reporturl'] = reporturl
            article['originalTitle'] = article['title']
            article['title'] = translate(article['title'])
            article['author'] = translate(article['researcher'])
            article['originalAuthor'] = article['researcher']
            article['originalContent'] = article_body
            article['category'] = "Macroeconomic Research"
            if len(article['originalContent']) < 10:
                continue  # skip near-empty articles
            # Translate sentence by sentence (split on the Chinese full stop).
            CONTENT_ENG = ''
            for element in article['originalContent'].split("。"):
                CONTENT_ENG += translate(element) + ' '
            article['content'] = CONTENT_ENG
            article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
            article['publishDate'] = datemodifier(article['publishDate'], "%Y-%m-%d %H:%M:%S.%f")
            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
            upsert_content(article)
        except Exception as error:
            # Best-effort crawl: log the failure and continue with the
            # remaining articles on this page.
            print(error)