Muhammad Abdur Rahman Saad committed
Commit 4259f95 · Parent: 91fadcf

fix logging issue
Changed files:
- source/cbirc.py     +4 -4
- source/csrc.py      +11 -11
- source/eastmoney.py +8 -8
- source/gov.py       +5 -5
- source/mof.py       +5 -5
- source/mofcom.py    +5 -5
- source/ndrc.py      +4 -4
- source/safe.py      +5 -5
- source/stats.py     +4 -4
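The change is the same across all nine crawlers: drop the stdlib `logging` module and emit through Prefect's run logger, so records are attached to the task run and surface in the Prefect UI. A minimal sketch of the pattern, assuming Prefect 2.x; the `demo` flow and the simulated failure are illustrative, not part of this repo:

from prefect import flow, task, get_run_logger

@task
def crawl(delta: int) -> None:
    # get_run_logger() must be called inside a flow/task run; outside a run
    # context it raises an error (MissingContextError in Prefect 2.x). That is
    # why every crawl task in this commit creates its logger at the top of
    # the function body rather than at module level.
    logger = get_run_logger()
    logger.info("crawling articles from the last %s day(s)", delta)
    try:
        raise ValueError("simulated fetch failure")  # stand-in for crawl work
    except Exception as error:
        logger.error(error)

@flow
def demo() -> None:
    crawl(7)

if __name__ == "__main__":
    demo()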
source/cbirc.py CHANGED
@@ -1,10 +1,9 @@
 """Module to crawl the website 'https://www.cbirc.gov.cn' to fetch and process articles."""
 import json
-import logging
 import time
 import uuid
 from datetime import datetime, timedelta
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.summarizer import summarize
 from controllers.utils import (
@@ -29,7 +28,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("cbirc.gov.cn")
+    logger = get_run_logger()
+    logger.info("cbirc.gov.cn")
     i = 1
     while i > -1:
         category_url= f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
@@ -73,4 +73,4 @@ def crawl(delta):
                 article['subtitle'] = summarize(article['content'])
                 update_content(article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/csrc.py CHANGED
@@ -1,12 +1,11 @@
 """Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles."""
 import json
-import logging
 import time
 import urllib.request
 import uuid
 from datetime import datetime, timedelta
 
-from prefect import task
+from prefect import task, get_run_logger
 from lxml import etree
 
 from controllers.summarizer import summarize
@@ -33,7 +32,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("csrc.gov.cn")
+    logger = get_run_logger()
+    logger.info("csrc.gov.cn")
     i = 1
     while i > -1:
         try:
@@ -42,7 +42,7 @@ def crawl(delta):
             else:
                 category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
             i = i + 1
-            logging.info(category_url)
+            logger.info(category_url)
             req = urllib.request.urlopen(category_url)
             text = req.read()
             html_text = text.decode("utf-8")
@@ -66,20 +66,20 @@ def crawl(delta):
                     article = {}
                     url = "http://www.csrc.gov.cn" + url
                     article['category'] = "Policy Interpretation"
-                    logging.info(url)
+                    logger.info(url)
                     crawl_by_url(url, article)
                 except Exception as error:
-                    logging.error(error)
+                    logger.error(error)
         except Exception as error:
             i = -1
-            logging.error(error)
+            logger.error(error)
 
     i = 1
     while i > -1:
         category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
         i = i + 1
         try:
-            logging.info(category_url)
+            logger.info(category_url)
             content = fetch_url(category_url)
             if content is None:
                 i = -1
@@ -87,7 +87,7 @@ def crawl(delta):
             reportinfo = json.loads(content)
             if len(reportinfo['data']['results']) == 0:
                 i = -1
-            logging.info(len(reportinfo['data']['results']))
+            logger.info(len(reportinfo['data']['results']))
             for article in reportinfo['data']['results']:
                 parsed_datetime = datetime.strptime(
                     time.strftime(
@@ -122,8 +122,8 @@ def crawl(delta):
                 article['id'] = uuid.uuid5(
                     uuid.NAMESPACE_OID,
                     article['titleCN'] + article['publishDate'])
-                logging.info(article)
+                logger.info(article)
                 # update_content(article)
         except Exception as error:
             i = -1
-            logging.error(error)
+            logger.error(error)
source/eastmoney.py CHANGED
@@ -1,12 +1,11 @@
 """Module to crawl the website 'eastmoney.com' to fetch and process articles."""
 import json
-import logging
 import urllib.request
 import uuid
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
 
-from prefect import task
+from prefect import task, get_run_logger
 from lxml import etree
 
 from controllers.summarizer import summarize
@@ -55,7 +54,7 @@ def _crawl(url, article):
         article['site'] = translate(article['orgSName'])
     else:
         article['site'] = translate(article['orgName'])
-    logging.info(article)
+    print(f'INFO - {article}')
     article['titleCN'] = article['title']
     article['title'] = translate(article['title'])
     article['author'] = translate(article['researcher'])
@@ -68,7 +67,7 @@ def _crawl(url, article):
     for element in contentcn.split("\n"):
         contenteng += translate(element) + '\n'
     article['content'] = repr(contenteng)[1:-1].strip()
-    logging.info(article)
+    print(f'INFO - {article}')
     article['subtitle'] = summarize(article['content'])
     article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
     article['publishDate'] = datemodifier(
@@ -77,7 +76,7 @@ def _crawl(url, article):
         article['titleCN'] + article['publishDate'])
     article['sentimentScore'], article[
         'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
-    logging.info(article)
+    print(f'INFO - {article}')
     extract_reference(article)
     update_content(article)
 
@@ -95,7 +94,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("data.eastmoney.com")
+    logger = get_run_logger()
+    logger.info("data.eastmoney.com")
     today = datetime.today().strftime('%Y-%m-%d')
     i = 0
     while i > -1:
@@ -123,8 +123,8 @@ def crawl(delta):
                     url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                     _crawl(url, article)
                 except Exception as error:
-                    logging.error(error)
+                    logger.error(error)
             else:
                 i = -1
         else:
-            logging.error("Failed to fetch URL: %s", category_url)
+            logger.error("Failed to fetch URL: %s", category_url)
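Note the outlier above: `_crawl` is a plain helper rather than a `@task`, and the commit falls back to `print(f'INFO - ...')` there instead of the run logger. A hedged alternative sketch, assuming `_crawl` is only ever invoked from inside the running `crawl` task, in which case the run context is still active and `get_run_logger()` resolves; the names and URL below are illustrative, not from this repo:

from prefect import flow, task, get_run_logger

def _crawl(url: str, article: dict) -> None:
    # Plain helper, but called from within the crawl() task run, so the
    # Prefect run context is still active and get_run_logger() works here
    # without resorting to print().
    logger = get_run_logger()
    logger.info("processing %s (%s)", url, article.get("category"))

@task
def crawl() -> None:
    logger = get_run_logger()
    logger.info("data.eastmoney.com")
    _crawl("https://data.eastmoney.com/report/example", {"category": "Macro"})

@flow
def demo() -> None:
    crawl()

if __name__ == "__main__":
    demo()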
source/gov.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -20,7 +19,8 @@ def crawl(delta):
     Returns:
         None
     """
-    logging.info("gov.cn")
+    logger = get_run_logger()
+    logger.info("gov.cn")
     i = 0
     while i > -1:
         if i == 0:
@@ -53,7 +53,7 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
     i = 0
     while i > -1:
         if i == 0:
@@ -86,4 +86,4 @@ def crawl(delta):
                 article['site'] = "State Council of China"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/mof.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.mof.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -20,7 +19,8 @@ def crawl(delta):
     Returns:
         None
     """
-    logging.info("mof.gov.cn")
+    logger = get_run_logger()
+    logger.info("mof.gov.cn")
     i = 0
     while i > -1:
         if i == 0:
@@ -56,7 +56,7 @@ def crawl(delta):
                 article['category'] = "Financial News"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
 
     i = 0
     while i > -1:
@@ -91,4 +91,4 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/mofcom.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.mofcom.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -20,7 +19,8 @@ def crawl(delta):
     Returns:
         None
     """
-    logging.info("mofcom.gov.cn")
+    logger = get_run_logger()
+    logger.info("mofcom.gov.cn")
     categories = ['jdzhsw', 'jdgnmy', 'jddwmy', 'jdtzhz']
     for category in categories:
         i = 1
@@ -60,7 +60,7 @@ def crawl(delta):
                     article['category'] = "Policy Release"
                     crawl_by_url(url, article)
                 except Exception as error:
-                    logging.error(error)
+                    logger.error(error)
             except Exception as error:
                 i = -1
-                logging.error(error)
+                logger.error(error)
source/ndrc.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.ndrc.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -23,7 +22,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("ndrc.gov.cn")
+    logger = get_run_logger()
+    logger.info("ndrc.gov.cn")
     i = 0
     while i > -1:
         if i == 0:
@@ -65,4 +65,4 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/safe.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.safe.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -20,7 +19,8 @@ def crawl(delta):
     Returns:
         None
     """
-    logging.info("safe.gov.cn")
+    logger = get_run_logger()
+    logger.info("safe.gov.cn")
     i = 1
     while i > -1:
         if i == 1:
@@ -52,7 +52,7 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
 
     i = 1
     while i > -1:
@@ -85,4 +85,4 @@ def crawl(delta):
                 article['category'] = "Data Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/stats.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.stats.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url, encode
 
@@ -23,7 +22,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("stats.gov.cn")
+    logger = get_run_logger()
+    logger.info("stats.gov.cn")
     i = 0
     while i > -1:
         if i == 0:
@@ -55,4 +55,4 @@ def crawl(delta):
                 article['category'] = "Data Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)