"""Module to crawl the website 'https://www.cbirc.gov.cn' to fetch and process articles."""
import json
import time
import uuid
from datetime import datetime, timedelta
from prefect import task, get_run_logger
from controllers.summarizer import summarize
from controllers.utils import (
extract_from_pdf,
fetch_url,
sentiment_computation,
translate,
update_content,
)
from controllers.vectorizer import vectorize
@task(name = "Data Collection - cbirc", log_prints = True)
def crawl(delta):
    """
    Crawl 'https://www.cbirc.gov.cn' and process policy-interpretation articles.

    Pages through the site's JSON listing endpoint and, for every article
    published within the last ``delta`` days, extracts the PDF text,
    translates it, computes a sentiment score, persists it via
    ``update_content`` and indexes it via ``vectorize``.

    Args:
        delta (int): Articles older than this many days stop the crawl
            (the listing is assumed newest-first — TODO confirm).

    Returns:
        None

    Raises:
        None — per-article failures are logged and skipped.
    """
    logger = get_run_logger()
    logger.info("cbirc.gov.cn")
    # Hoisted out of the loop: one consistent cutoff for the whole run.
    cutoff = datetime.today() - timedelta(days=delta)
    page = 1
    while True:
        category_url = (
            "https://www.cbirc.gov.cn/cn/static/data/DocInfo/"
            "SelectDocByItemIdAndChild/"
            f"data_itemId=917,pageIndex={page},pageSize=18.json")
        content = fetch_url(category_url)
        if content is None:
            # BUG FIX: the original kept incrementing the page index forever
            # when fetch_url failed; treat a fetch failure as end of crawl.
            break
        reportinfo = json.loads(content)
        rows = reportinfo['data']['rows']
        if not rows:
            # Robustness: an empty page means we ran past the last listing.
            break
        reached_cutoff = False
        for article in rows:
            try:
                # Normalize the publish timestamp to a plain date string.
                article['publishDate'] = time.strftime(
                    "%Y-%m-%d",
                    time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"))
                parsed_datetime = datetime.strptime(
                    article['publishDate'], "%Y-%m-%d")
                if parsed_datetime < cutoff:
                    # Article is too old; finish this page, then stop
                    # (matches the original i = -1 sentinel behavior).
                    reached_cutoff = True
                    continue
                _process_article(article)
            except (ValueError, KeyError, TypeError) as error:
                # Best-effort crawl: log the bad article and keep going.
                logger.error(error)
        if reached_cutoff:
            break
        page += 1


def _process_article(article):
    """Enrich a single article dict in place, then persist and vectorize it.

    Args:
        article (dict): One row from the listing JSON; must contain
            'pdfFileUrl', 'docSubtitle' and a normalized 'publishDate'.

    Returns:
        None
    """
    contentcn, summary = extract_from_pdf(
        "https://www.cbirc.gov.cn" + article['pdfFileUrl'])
    # repr()[1:-1] escapes control characters (newlines etc.) into the text.
    article['contentCN'] = repr(contentcn)[1:-1].strip()
    if len(contentcn) < 10:
        # PDF extraction yielded too little text to be useful — skip.
        return
    # Translate the Chinese content line by line.
    contenteng = ''
    for element in article['contentCN'].split("\n"):
        contenteng += translate(element) + '\n'
    article['content'] = repr(contenteng)[1:-1].strip()
    article[
        'site'] = "National Financial Regulatory Administration of China"
    article['originSite'] = "国家金融监督管理总局"
    article['titleCN'] = article['docSubtitle']
    article['title'] = translate(article['docSubtitle'])
    article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
    article['category'] = "Policy Interpretation"
    # Deterministic UUID so re-crawling the same article updates one record.
    article['id'] = uuid.uuid5(
        uuid.NAMESPACE_OID,
        article['titleCN'] + article['publishDate'])
    article['sentimentScore'], article[
        'sentimentLabel'] = sentiment_computation(article['content'])
    article['attachment'] = ''
    article['author'] = ''
    try:
        article['subtitle'] = summarize(article['content'])
    except (RuntimeError, ValueError):
        # Summarizer failed — fall back to translating the PDF's own summary.
        article['subtitle'] = translate(summary)
    article['referenceid'] = None
    update_content(article)
    vectorize(article)