"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles."""
import json
import time
import urllib.error
import urllib.request
import uuid
from datetime import datetime, timedelta
from prefect import task, get_run_logger
from lxml import etree
from controllers.summarizer import summarize
from controllers.utils import (
    crawl_by_url,
    encode,
    fetch_url,
    sentiment_computation,
    translate,
)


@task(name="Data Collection - csrc", log_prints=True)
def crawl(delta):
"""
Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta.
Args:
delta (int): The number of days in the past from today to retrieve articles.
Returns:
None
Raises:
None
"""
    logger = get_run_logger()
    logger.info("Crawling csrc.gov.cn")
    i = 1
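    # First pass: paginate the "Policy Interpretation" HTML listing pages.
    # i is the page counter; it is set to -1 to stop paginating once an
    # article older than the delta window is seen or a fetch fails.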
    while i > -1:
        try:
            if i == 1:
                category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
            else:
                category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
            i = i + 1
            logger.info("Fetching from URL: %s", category_url)
            req = urllib.request.Request(
                category_url,
                headers={
                    'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/58.0.3029.110 Safari/537.3'
                })
            response = urllib.request.urlopen(req, timeout=60)
            text = response.read()
            html_text = text.decode("utf-8")
            page = etree.HTML(html_text)
            articlelist = page.xpath(
                "//div[contains(@class, 'main-right fr common-list')]/ul/li")
            for article in articlelist:
                if isinstance(article, etree._Element):
                    subelement = etree.tostring(article).decode()
                    subpage = etree.HTML(subelement)
                    date = encode(subpage.xpath("//span[@class='date']"))
                    parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        urls = subpage.xpath("//a/@href")
                        for url in urls:
                            try:
                                # Fresh dict for the payload; do not reuse the
                                # loop variable `article`.
                                article_info = {}
                                url = "http://www.csrc.gov.cn" + url
                                article_info['category'] = "Policy Interpretation"
                                logger.info("Processing article URL: %s", url)
                                crawl_by_url(url, article_info)
                            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                                logger.error(error)
        except (urllib.error.URLError, etree.XMLSyntaxError, ValueError, TimeoutError) as error:
            i = -1
            logger.error(error)
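    # Second pass: page through the site's JSON search endpoint for
    # "Financial News" items, translating and enriching each result.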
    i = 1
    while i > -1:
        category_url = ("http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8"
                        "?_isAgg=true&_isJson=true&_pageSize=18&_template=index"
                        f"&_rangeTimeGte=&_channelName=&page={i}")
        i = i + 1
        try:
            logger.info("Fetching from URL: %s", category_url)
            content = fetch_url(category_url)
            if content is None:
                i = -1
            else:
                reportinfo = json.loads(content)
                if len(reportinfo['data']['results']) == 0:
                    i = -1
                logger.info("Results on page: %d", len(reportinfo['data']['results']))
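                # Results older than the delta window end the pagination;
                # newer ones are translated, summarized, and scored below.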
                for article in reportinfo['data']['results']:
                    # publishedTimeStr carries a full timestamp; truncate it
                    # to a date before comparing against the delta window.
                    parsed_datetime = datetime.strptime(
                        time.strftime(
                            "%Y-%m-%d",
                            time.strptime(article['publishedTimeStr'],
                                          "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
                    if parsed_datetime < (datetime.today() - timedelta(days=delta)):
                        i = -1
                    else:
                        article['category'] = "Financial News"
                        article['site'] = "Securities Regulatory Commission of China"
                        article['originSite'] = "证监会"
                        article['titleCN'] = article['title']
                        article['title'] = translate(article['titleCN'])
                        article['author'] = ''
                        # repr() escapes control characters; [1:-1] strips the
                        # quotes repr() adds around the string.
                        article['contentCN'] = repr(article['content'])[1:-1].strip()
                        if len(article['contentCN']) < 10:
                            continue
                        # Translate sentence by sentence, splitting on the
                        # Chinese full stop.
                        contenteng = ''
                        for element in article['contentCN'].split("。"):
                            contenteng += translate(element) + ' '
                        article['content'] = repr(contenteng)[1:-1].strip()
                        try:
                            article['subtitle'] = summarize(article['content'])
                        except (RuntimeError, ValueError):
                            article['subtitle'] = ""
                        article['publishDate'] = time.strftime(
                            "%Y-%m-%d",
                            time.strptime(article['publishedTimeStr'],
                                          "%Y-%m-%d %H:%M:%S"))
                        article['link'] = article['url']
                        article['attachment'] = ""
                        article['sentimentScore'], article['sentimentLabel'] = \
                            sentiment_computation(article['content'])
                        # Deterministic ID from the Chinese title and publish
                        # date, so re-crawls yield the same UUID.
                        article['id'] = uuid.uuid5(
                            uuid.NAMESPACE_OID,
                            article['titleCN'] + article['publishDate'])
                        logger.info(article)
                        # update_content(article)
        except (ValueError, KeyError, TypeError) as error:
            i = -1
            logger.error(error)
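

# A minimal usage sketch, assuming this task is invoked from a Prefect flow.
# The flow name and the 7-day lookback are illustrative assumptions, not
# part of this module.
if __name__ == "__main__":
    from prefect import flow

    @flow(name="data-collection-csrc")
    def data_collection_flow():
        """Run the csrc crawl with a one-week lookback."""
        crawl(delta=7)

    data_collection_flow()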