"""Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles.""" import json import logging import time import urllib.request import uuid from datetime import datetime, timedelta from prefect import flow from lxml import etree from controllers.summarizer import summarize from controllers.utils import ( crawl_by_url, encode, fetch_url, sentiment_computation, translate, update_content, ) @flow(name = "Data Collection - csrc") def crawl(delta): """ Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta. Args: delta (int): The number of days in the past from today to retrieve articles. Returns: None Raises: None """ logging.info("csrc.gov.cn") i = 1 while i > -1: try: if i == 1: category_url = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml" else: category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml" i = i + 1 logging.info(category_url) req = urllib.request.urlopen(category_url) text = req.read() html_text = text.decode("utf-8") page = etree.HTML(html_text) articlelist = page.xpath( "//div[contains(@class, 'main-right fr common-list')]/ul/li") for article in articlelist: if isinstance(article, etree._Element): subelement = etree.tostring(article).decode() subpage = etree.HTML(subelement) date = encode(subpage.xpath("//span[@class='date']")) parsed_datetime = datetime.strptime( time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d") if parsed_datetime < (datetime.today() - timedelta(days=delta)): i = -1 else: urls = subpage.xpath("//a/@href") for url in urls: try: article = {} url = "http://www.csrc.gov.cn" + url article['category'] = "Policy Interpretation" logging.info(url) crawl_by_url(url, article) except Exception as error: logging.error(error) except Exception as error: i = -1 logging.error(error) i = 1 while i > -1: category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}" i = i + 1 try: logging.info(category_url) content = fetch_url(category_url) if content is None: i = -1 else: reportinfo = json.loads(content) if len(reportinfo['data']['results']) == 0: i = -1 logging.info(len(reportinfo['data']['results'])) for article in reportinfo['data']['results']: parsed_datetime = datetime.strptime( time.strftime( "%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d") if parsed_datetime < (datetime.today() - timedelta(days=delta)): i = -1 else: article['category'] = "Financial News" article['site'] = "Securities Regulatory Commission of China" article['originSite'] = "θ―η›‘δΌš" article['titleCN'] = article['title'] article['title'] = translate(article['titleCN']) article['author'] = '' article['contentCN'] = repr(article['content'])[1:-1].strip() if len(article['contentCN']) < 10: continue contenteng = '' for element in article['contentCN'].split("。"): contenteng += translate(element) + ' ' article['content'] = repr(contenteng)[1:-1].strip() article['subtitle'] = summarize(article['content']) article['publishDate'] = time.strftime( "%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")) article['link'] = article['url'] article['attachment'] = "" article['sentimentScore'], article[ 'sentimentLabel'] = sentiment_computation(article['content']) article['id'] = uuid.uuid5( uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate']) logging.info(article) # update_content(article) except Exception as error: i = -1 
logging.error(error)
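

# A minimal local usage sketch, assuming the flow is invoked directly rather
# than through a deployed Prefect schedule (a Prefect @flow remains callable
# as a plain function). The delta value of 7 is illustrative.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Fetch and process articles published within the last 7 days.
    crawl(7)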