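"""Scrape recent articles from the China Securities Regulatory Commission
(CSRC, 证监会) website.

Two passes: the c100039 "Policy Interpretation" category listing, then the
site's JSON search endpoint for financial news. Each pass stops once articles
fall more than 183 days behind today; Chinese text is translated to English
sentence by sentence, scored for sentiment, and upserted via utils helpers."""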
import uuid
import json
import time
import urllib.request
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, sentiment_computation, upsert_content, fetch_url
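# The helpers imported from utils are not defined in this file. Assumed
# behavior, inferred from how they are called below:
#   encode(nodes)                -> str   joined text content of lxml elements
#   translate(text)              -> str   Chinese-to-English translation
#   sentiment_computation(text)  -> (score, label) tuple
#   upsert_content(article)      -> persists one article record
#   fetch_url(url)               -> str   response body, roughly equivalent to:
#       def fetch_url(url):
#           with urllib.request.urlopen(url) as resp:
#               return resp.read().decode("utf-8")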
# First pass: walk the c100039 "Policy Interpretation" category listing page
# by page. i doubles as the page counter and the stop flag (-1 ends the loop
# once listings fall behind the 183-day cutoff).
i = 1
while i > -1:
    if i == 1:
        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
    else:
        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
    i += 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
    for item in articlelist:
        if isinstance(item, etree._Element):
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            date = encode(subpage.xpath("//span[@class='date']"))
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # article is older than ~6 months: stop paginating
            else:
                urls = subpage.xpath("//a/@href")
                for url in urls:
                    try:
                        article = {}
                        url = "http://www.csrc.gov.cn" + url
                        req = urllib.request.urlopen(url)
                        text = req.read()
                        html_text = text.decode("utf-8")
                        page = etree.HTML(html_text)
                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'detail-news')]//p"))
                        if len(article['originalContent']) < 10:
                            continue  # skip near-empty pages
                        # Translate sentence by sentence, splitting on the
                        # Chinese full stop.
                        CONTENT_ENG = ''
                        for element in article['originalContent'].split("。"):
                            CONTENT_ENG += translate(element) + ' '
                        article['content'] = CONTENT_ENG
                        article['site'] = "Securities Regulatory Commission"
                        article['originalSite'] = "证监会"
                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
                        article['title'] = translate(article['originalTitle'])
                        article['url'] = url
                        article['category'] = "Policy Interpretation"
                        article['publishDate'] = date
                        # Deterministic id: re-runs update instead of duplicating.
                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                        upsert_content(article)
                    except Exception as error:
                        print(error)
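# Second pass: page through the CSRC full-text search endpoint, which returns
# JSON. Each result is assumed to carry 'title', 'content', and
# 'publishedTimeStr' fields (and, presumably, its own link; see the note on
# article['url'] below).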
i = 1
while i > -1:
    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
    i += 1
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
    for article in reportinfo['data']['results']:
        try:
            # publishedTimeStr looks like "YYYY-MM-DD HH:MM:SS"; compare on
            # the date part only.
            parsed_datetime = datetime.strptime(article['publishedTimeStr'].split(" ")[0], "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # result is older than ~6 months: stop paginating
            else:
                article['originalContent'] = article['content']
                if len(article['originalContent']) < 10:
                    continue  # skip near-empty results
                # Translate sentence by sentence, splitting on the Chinese
                # full stop.
                CONTENT_ENG = ''
                for element in article['originalContent'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
                article['content'] = CONTENT_ENG
                article['site'] = "Securities Regulatory Commission"
                article['originalSite'] = "证监会"
                article['originalTitle'] = article['title']
                article['title'] = translate(article['originalTitle'])
                # The original code reused a stale `url` from the first loop
                # here; keep whatever link the search result itself carries
                # (field name is an assumption).
                article['url'] = article.get('url', '')
                article['category'] = "Financial News"
                article['publishDate'] = parsed_datetime.strftime("%Y-%m-%d")
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                upsert_content(article)
        except Exception as error:
            print(error)
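# Both passes emit records of the same shape to upsert_content: id, url,
# title, originalTitle, content (English), originalContent (Chinese), site,
# originalSite, category, publishDate, sentimentScore, sentimentLabel.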