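"""Crawler for Eastmoney macro research reports.

Pages through the report listing API, fetches each report page, translates
the title/author/body via helpers from the project-local utils module,
computes a sentiment score, and upserts the enriched article record.
"""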
import uuid
import json
import urllib.request
from urllib.parse import urlparse
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
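# Assumed shape of xpath.json, inferred from the lookups in crawl() below
# (the real file may contain more domains and keys, and the format string
# shown here is illustrative):
# {
#   "data.eastmoney.com": {
#     "content":    "<xpath for the report body>",
#     "attachment": "<xpath for attachment links>",
#     "datetime":   {"format_string": "%Y-%m-%d %H:%M:%S"}
#   }
# }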
def crawl(url, article):
    """Fetch one report page, enrich the article dict, and upsert it."""
    domain = urlparse(url).netloc
    with urllib.request.urlopen(url) as req:
        html_text = req.read().decode("utf-8")
    page = etree.HTML(html_text)
    # Extract the original-language body and summary via the domain's XPaths.
    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url
    # Prefer the short organisation name; the API marks an empty field with
    # the placeholder string "''", in which case fall back to the full name.
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['originTitle'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['originContent'] = repr(originContent)
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"
    # Skip near-empty pages before paying for line-by-line translation.
    if len(article['originContent']) < 10:
        return None
    content_eng = ''
    for element in originContent.split("\n"):
        content_eng += translate(element) + '\n'
    article['content'] = repr(content_eng)
    # Deterministic IDs so re-crawling the same report upserts, not duplicates.
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime']['format_string'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n", ""))
    upsert_content(article)
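# The listing endpoint below returns JSONP rather than bare JSON: the payload
# arrives wrapped in the callback named by the "cb" parameter, roughly
#   datatable8544623({"size": ..., "data": [...]})
# (an illustrative shape, not a verbatim response). The loop strips that
# wrapper before handing the body to json.loads().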
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')

# Page through the report listing API until an empty page is returned.
i = 0
while i > -1:
    URL = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(URL)
    content = fetch_url(URL)
    if content:
        # Strip the JSONP callback wrapper down to the JSON body.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1:content.rfind(")")]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i += 1
            for article in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                    crawl(url, article)
                except Exception as error:
                    print(error)
        else:
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", URL)
        break  # stop rather than retrying the same page forever