File size: 4,044 Bytes
74475ac
 
 
 
 
 
4259f95
74475ac
 
 
5d719e2
74475ac
 
 
 
 
 
 
 
 
 
4259f95
 
74475ac
 
 
 
 
 
 
cc76656
74475ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d705151
4259f95
74475ac
 
 
 
 
 
 
 
cc76656
74475ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d705151
4259f95
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""Module to crawl the website 'https://www.mof.gov.cn' to fetch and process articles."""
import time
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timedelta

from lxml import etree
from prefect import task, get_run_logger

from controllers.utils import crawl_by_url

# XPath selecting the article entries (<li> items) of a listing page.
_LISTING_XPATH = "//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]"

# (listing base URL, category label stored on each article) pairs to crawl.
_CATEGORIES = (
    ("https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/", "Financial News"),
    ("https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/", "Policy Interpretation"),
)


def _crawl_category(logger, base_url, category, delta):
    """
    Crawl one paginated listing and hand every recent article to crawl_by_url.

    Pages are requested in order (``base_url``, then ``index_1.htm``,
    ``index_2.htm``, ...) until a page contains an entry older than the cutoff,
    a page has no entries, or a page cannot be fetched.

    Args:
        logger: Logger used to report fetch and parse problems.
        base_url (str): URL of the first listing page; must end with "/".
        category (str): Value stored under the 'category' key of each article.
        delta (int): The number of days in the past to retrieve articles from.

    Returns:
        None
    """
    cutoff = datetime.today() - timedelta(days=delta)
    page_index = 0
    reached_cutoff = False
    while not reached_cutoff:
        if page_index == 0:
            listing_url = base_url
        else:
            listing_url = f"{base_url}index_{page_index}.htm"
        page_index += 1

        try:
            # The context manager closes the HTTP response even if decoding fails.
            with urllib.request.urlopen(listing_url, timeout=60) as response:
                html_text = response.read().decode("utf-8")
        except urllib.error.URLError as error:
            # Covers HTTPError too (e.g. a 404 after the last listing page).
            # Stop this category instead of aborting the whole task.
            logger.error("Failed to fetch listing %s: %s", listing_url, error)
            break

        page = etree.HTML(html_text)
        entries = page.xpath(_LISTING_XPATH) if page is not None else []
        if not entries:
            # Nothing to inspect on this page, so later pages cannot help either.
            break

        for entry in entries:
            if not isinstance(entry, etree._Element):
                continue
            # Relative XPath keeps the lookup scoped to this <li> only.
            date_texts = entry.xpath(".//span/text()")
            if not date_texts:
                logger.warning("Listing entry without a date on %s", listing_url)
                continue
            try:
                published = datetime.strptime(date_texts[0].strip(), "%Y-%m-%d")
            except ValueError as error:
                logger.warning("Unparseable date on %s: %s", listing_url, error)
                continue

            if published < cutoff:
                # Finish scanning this page (remaining entries may still be
                # recent), but do not request any further pages.
                reached_cutoff = True
                continue

            for href in entry.xpath(".//a[contains(@target, '_blank')]/@href"):
                # urljoin resolves "./" and "../" against the listing page's
                # directory and leaves absolute URLs untouched.
                article_url = urllib.parse.urljoin(listing_url, href)
                article = {'category': category}
                try:
                    crawl_by_url(article_url, article)
                except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                    logger.error(error)


@task(name="Data Collection - mof", log_prints=True)
def crawl(delta):
    """
    Crawls the website to retrieve articles based on the specified delta.

    Walks the "Financial News" and "Policy Interpretation" listings of
    mof.gov.cn in turn and passes each article published within the last
    ``delta`` days to ``crawl_by_url`` together with its category.

    Args:
        delta (int): The number of days in the past to retrieve articles from.

    Returns:
        None
    """
    logger = get_run_logger()
    logger.info("mof.gov.cn")
    for base_url, category in _CATEGORIES:
        _crawl_category(logger, base_url, category, delta)