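"""Crawler for the China Securities Regulatory Commission (CSRC, 证监会) website.

Collects two feeds from www.csrc.gov.cn covering roughly the last six months:
  1. "Policy Interpretation" articles from the HTML category listing.
  2. "Financial News" items from the site's JSON search endpoint.
Each item is translated to English, scored for sentiment, and upserted via
helpers from the project's utils module.
"""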
import uuid
import json
import time
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
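# Project helpers (defined in utils.py, not shown here). Their behaviour, as
# assumed from how they are used below: encode() extracts text from lxml nodes,
# translate() renders Chinese text in English, sentiment_computation() returns
# a (score, label) pair, upsert_content() stores an article record, fetch_url()
# returns a response body, and crawl() fetches and stores one article page.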
from utils import encode, translate, sentiment_computation, upsert_content, fetch_url, crawl
# --- Section 1: "Policy Interpretation" category pages (HTML listing) ---
# Walk the paginated category list until an article older than 183 days
# (about six months) is reached, then stop.
i = 1
while i > -1:
    if i == 1:
        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
    else:
        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = encode(subpage.xpath("//span[@class='date']"))
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Listing is ordered by date; once an entry falls outside the
                # 183-day window, stop paging.
                i = -1
            else:
                urls = subpage.xpath("//a/@href")
                for url in urls:
                    try:
                        article = {}
                        url = "http://www.csrc.gov.cn" + url
                        article['category'] = "Policy Interpretation"
                        crawl(url, article)
                    except Exception as error:
                        print(error)
# --- Section 2: "Financial News" via the CSRC JSON search endpoint ---
# Page through the search API, translate each article, compute sentiment,
# and upsert the record; stop once results fall outside the 183-day window.
i = 1
while i > -1:
    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
    i = i + 1
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
    for article in reportinfo['data']['results']:
        try:
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1
            else:
                article['category'] = "Financial News"
                article['site'] = "Securities Regulatory Commission of China"
                article['originSite'] = "证监会"
                article['originTitle'] = article['title']
                article['title'] = translate(article['originTitle'])
                article['author'] = ''
                article['originContent'] = repr(article['content'])
                if len(article['originContent']) < 10:
                    continue
                # Translate the Chinese body sentence by sentence ("。" is the
                # full-stop delimiter).
                CONTENT_ENG = ''
                for element in article['originContent'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
                article['content'] = repr(CONTENT_ENG)
                article['subtitle'] = article['memo']
                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
                article['link'] = article['url']
                article['attachment'] = ""
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                # Deterministic ID from title + publish date, so re-runs update
                # existing records instead of duplicating them.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                upsert_content(article)
        except Exception as error:
            print(error)