"""eastmoney.py (Data-Collection-China, OxbridgeEconomics): crawl macroeconomic research reports from eastmoney.com."""
import uuid
import json
import urllib.request
from urllib.parse import urlparse
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content
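
# Domain-specific XPath selectors and datetime format strings, keyed by the page's netloc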
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)

def crawl(url, article):
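    """Crawl a single report page: extract and translate its content, enrich the article dict, and upsert it."""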
    domain = urlparse(url).netloc
    req = urllib.request.urlopen(url)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
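    # Extract the report body and attachment with the domain-specific XPath expressions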
    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url
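    # The feed uses the literal string "''" as a placeholder for a missing short organisation name;
    # fall back to the full organisation name in that case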
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['originTitle'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['originContent'] = repr(originContent)
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"
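    # Skip articles whose extracted content is effectively empty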
    if len(article['originContent']) < 10:
        return None
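    # Translate the body into English line by line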
    CONTENT_ENG = ''
    for element in originContent.split("\n"):
        CONTENT_ENG += translate(element) + '\n'
    article['content'] = repr(CONTENT_ENG)
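    # Deterministic UUIDs, a normalised publish date and a sentiment score complete the record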
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime']['format_string'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n", ""))
    upsert_content(article)
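
# Query the report index for the last six months (183 days), one page per request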
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
i = 0
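# i is the pageNo counter; it is set to -1 to stop the loop once the API returns an empty page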
while i > -1:
URL = "https://reportapi.eastmoney.com/report/jg"
params = {
"cb": "datatable8544623",
"pageSize": "100",
"beginTime": beginDate,
"endTime": today,
"pageNo": i,
"qType": "3",
}
URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(URL)
    content = fetch_url(URL)
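    # The "cb" parameter makes the endpoint return JSONP; strip the callback wrapper to get plain JSON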
    if content:
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1:-1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
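            # Crawl every report listed on this page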
            for article in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                    crawl(url, article)
                except Exception as error:
                    print(error)
        else:
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", URL)