# Data-Collection-China / eastmoney.py
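"""Crawl macro research reports from Eastmoney, translate them to English,
score their sentiment and upsert each article into the content store via
the shared utils helpers."""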
import uuid
import json
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url
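# Query window: reports from the last six months (183 days) up to today.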
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
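
# Page through the listing API until it returns an empty page or a fetch
# fails; each page holds up to 100 report summaries.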
i = 0
while i > -1:
URL = "https://reportapi.eastmoney.com/report/jg"
params = {
"cb": "datatable8544623",
"pageSize": "100",
"beginTime": beginDate,
"endTime": today,
"pageNo": i,
"qType": "3",
}
URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
print(URL)
content = fetch_url(URL)
    if content:
        # The response is JSONP ("datatable8544623({...});"); strip the
        # callback wrapper so the remainder parses as plain JSON.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: content.rfind(")")]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
            for article in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                    with urllib.request.urlopen(url) as req:
                        html_text = req.read().decode("utf-8")
                    page = etree.HTML(html_text)
                    content = encode(page.xpath("//div[contains(@class, 'ctx-content')]//p"))
                    reporturl = encode(page.xpath("//a[contains(@class, 'pdf-link')]/@href"))
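                    # Keep the Chinese originals alongside translated copies
                    # of each field.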
                    article['url'] = url
                    if article['orgSName'] != "''":  # "''" is the API's empty marker
                        article['site'] = translate(article['orgSName'])
                        article['originalSite'] = article['orgSName']
                    else:
                        # Fall back to the full organisation name when the
                        # short name is empty.
                        article['site'] = translate(article['orgName'])
                        article['originalSite'] = article['orgName']
                    article['reporturl'] = reporturl
                    article['originalTitle'] = article['title']
                    article['title'] = translate(article['title'])
                    article['author'] = translate(article['researcher'])
                    article['originalAuthor'] = article['researcher']
                    article['originalContent'] = content
                    if len(article['originalContent']) < 10:
                        continue  # skip articles whose scraped body is too short to be useful
                    # Translate sentence by sentence, splitting on the Chinese
                    # full stop "。".
                    CONTENT_ENG = ''
                    for element in article['originalContent'].split("。"):
                        CONTENT_ENG += translate(element) + ' '
                    article['content'] = CONTENT_ENG
                    # Deterministic IDs derived from the author and from the
                    # title plus publish date, stored as strings.
                    article['authorid'] = str(uuid.uuid5(uuid.NAMESPACE_OID, article['author']))
                    article['publishDate'] = datemodifier(article['publishDate'], "%Y-%m-%d %H:%M:%S.%f")
                    article['id'] = str(uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate']))
                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                    upsert_content(article)
                except Exception as error:
                    print(error)
        else:
            # An empty page means we have paged past the last report.
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", URL)
        i = -1  # stop paging instead of retrying the same page forever