"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles."""
import json
import logging
import time
import urllib.request
import uuid
from datetime import datetime, timedelta
from prefect import flow
from lxml import etree
from controllers.summarizer import summarize
from controllers.utils import (
crawl_by_url,
encode,
fetch_url,
sentiment_computation,
translate,
update_content,
)


@flow(name="Data Collection - csrc")
def crawl(delta):
"""
Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta.
Args:
delta (int): The number of days in the past from today to retrieve articles.
Returns:
None
Raises:
None
"""
logging.info("csrc.gov.cn")
    # "i" is the 1-based listing page number; setting it to -1 ends the loop.
    i = 1
    while i > -1:
        try:
            # The first listing page has no numeric suffix; later pages do.
            if i == 1:
                category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
            else:
                category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
            i += 1
logging.info(category_url)
            with urllib.request.urlopen(category_url) as req:
                html_text = req.read().decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath(
"//div[contains(@class, 'main-right fr common-list')]/ul/li")
            for article in articlelist:
                if isinstance(article, etree._Element):
                    subelement = etree.tostring(article).decode()
                    subpage = etree.HTML(subelement)
                    date = encode(subpage.xpath("//span[@class='date']"))
                    parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
                    # Listings appear newest first, so the first article older
                    # than the cutoff ends pagination.
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        urls = subpage.xpath("//a/@href")
                        for url in urls:
                            try:
                                # Use a separate dict so the loop variable
                                # "article" (an lxml element) is not clobbered.
                                article_info = {}
                                url = "http://www.csrc.gov.cn" + url
                                article_info['category'] = "Policy Interpretation"
                                logging.info(url)
                                crawl_by_url(url, article_info)
                            except Exception as error:
                                logging.error(error)
except Exception as error:
i = -1
logging.error(error)
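    # Phase 2: page through the CSRC JSON search API and enrich each result
    # with a translation, summary, and sentiment score.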
    i = 1
    while i > -1:
        category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
        i += 1
try:
logging.info(category_url)
content = fetch_url(category_url)
if content is None:
i = -1
else:
reportinfo = json.loads(content)
if len(reportinfo['data']['results']) == 0:
i = -1
logging.info(len(reportinfo['data']['results']))
                for article in reportinfo['data']['results']:
                    # Compare on the date part only; the time of day is
                    # dropped, matching the "%Y-%m-%d" granularity used below.
                    parsed_datetime = datetime.strptime(
                        article['publishedTimeStr'].split(" ")[0], "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
else:
article['category'] = "Financial News"
article['site'] = "Securities Regulatory Commission of China"
article['originSite'] = "证监会"
article['titleCN'] = article['title']
article['title'] = translate(article['titleCN'])
article['author'] = ''
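                        # repr()[1:-1] escapes embedded newlines and quotes;
                        # items shorter than 10 characters are treated as empty.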
article['contentCN'] = repr(article['content'])[1:-1].strip()
if len(article['contentCN']) < 10:
continue
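                        # Translate sentence by sentence, splitting on the
                        # Chinese full stop ("。").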
contenteng = ''
for element in article['contentCN'].split("。"):
contenteng += translate(element) + ' '
article['content'] = repr(contenteng)[1:-1].strip()
article['subtitle'] = summarize(article['content'])
                        article['publishDate'] = parsed_datetime.strftime(
                            "%Y-%m-%d")
article['link'] = article['url']
article['attachment'] = ""
article['sentimentScore'], article[
'sentimentLabel'] = sentiment_computation(article['content'])
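                        # Deterministic ID: the same title and publish date
                        # always map to the same UUID, so a re-crawled article
                        # keeps its identity.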
article['id'] = uuid.uuid5(
uuid.NAMESPACE_OID,
article['titleCN'] + article['publishDate'])
logging.info(article)
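                        # Persisting via update_content is currently disabled.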
# update_content(article)
except Exception as error:
i = -1
logging.error(error)
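

# A minimal local-run sketch (an assumption, not part of the deployed flow):
# it presumes the controllers.* helpers are importable and Prefect is
# configured; the 7-day window is an arbitrary example value.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    crawl(delta=7)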