import uuid
import json
import urllib.request
from urllib.parse import urlparse
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content
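# This script crawls Eastmoney macro research reports, translates the Chinese fields,
# computes a sentiment score, and upserts each article via utils.upsert_content.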
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
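# xpath_dict is assumed to map a domain to its selectors, roughly (illustrative shape only):
# {"data.eastmoney.com": {"content": "<xpath>", "attachment": "<xpath>", "datetime_format": "<strptime format>"}}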
def crawl(url, article):
    """Fetch one report page, extract and translate its fields, then upsert it."""
    domain = urlparse(url).netloc
    req = urllib.request.urlopen(url)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    # Extract the Chinese content and summary using the domain-specific XPaths.
    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url
    # Fall back to the full organisation name when the short name is empty ("''").
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['contentCN'] = repr(contentCN)[1:-1].strip()
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"
    # Skip reports whose extracted content is too short to be meaningful.
    if len(article['contentCN']) < 10:
        return None
    # Translate the content line by line.
    CONTENT_ENG = ''
    for element in contentCN.split("\n"):
        CONTENT_ENG += translate(element) + '\n'
    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
    # Deterministic IDs derived from the author name and from title + publish date.
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n", ""))
    upsert_content(article)
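# Page through the Eastmoney report API, covering roughly the last six months (183 days).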
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
i = 0
while i > -1:
    URL = "https://reportapi.eastmoney.com/report/jg"
    # The "cb" parameter makes the API return JSONP; the wrapper is stripped below.
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(URL)
    content = fetch_url(URL)
    if content:
        # Strip the JSONP wrapper, e.g. datatable8544623(...), to get the JSON payload.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: -1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
            for article in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                    crawl(url, article)
                except Exception as error:
                    print(error)
        else:
            # No more results: print the final response and stop paging.
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", URL)