Refactor exception handling in multiple files to specify exception types and improve logging
Files changed:
- .gitignore +1 -1
- controllers/utils.py +8 -6
- source/cbirc.py +2 -2
- source/csrc.py +5 -7
- source/eastmoney.py +2 -2
- source/gov.py +2 -2
- source/mof.py +2 -2
- source/mofcom.py +2 -2
- source/ndrc.py +1 -1
- source/safe.py +2 -2
- source/stats.py +1 -1
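
The common change across these files is the same: bare "except:" clauses are narrowed to explicit exception tuples, and the caught error is logged rather than silently swallowed. A self-contained sketch of the pattern, modeled on the encode_content change in controllers/utils.py (the surrounding code is simplified, not copied verbatim from the repo):

import logging

def first_two_lines(text):
    # Narrowed handler: catch only the errors this string handling can
    # realistically raise (e.g. AttributeError when text is None) and log
    # the failure instead of hiding it behind a bare except.
    try:
        summary = '\n'.join(text.split('\n')[:2])
    except (IndexError, AttributeError) as error:
        logging.error(error)
        summary = text
    return text, summary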
.gitignore (CHANGED)

@@ -1,4 +1,4 @@
-.
+.venv
 data
 venv
 __pycache__
controllers/utils.py (CHANGED)

@@ -115,7 +115,7 @@ def encode_content(content):
     text = text[:index]
     try:
         summary = '\n'.join(text.split('\n')[:2])
-    except Exception as e:
+    except (IndexError, AttributeError) as e:
         logging.error(e)
         summary = text
     return text, summary

@@ -382,7 +382,8 @@ def extract_from_pdf_by_pattern(url, pattern):
             else:
                 text = text.strip()
                 extracted_text += text
-    except:
+    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout) as e:
+        logging.error(e)
         extracted_text = ''
     return extracted_text.replace('?\n', '?-\n').replace(
         '!\n', '!-\n').replace('。\n', '。-\n').replace('\n', ' ').replace(

@@ -446,7 +447,7 @@ def extract_reference(row):
     for title, date in zip(reference_titles, reference_dates):
         try:
             date = datetime.strptime(date, pattern['date_format'])
-        except:
+        except ValueError:
             date = datetime(2006, 1, 1)
         dates = []
         if 'date_range' in pattern:

@@ -590,8 +591,9 @@ def extract_from_pdf(url):
             extracted_text += text
     try:
         summary = '\n'.join(extracted_text.split('\n')[:2])
-    except:
-
+    except (ValueError, KeyError, TypeError) as e:
+        logging.error(e)
+        summary = extracted_text
     return extracted_text, summary

@@ -651,7 +653,7 @@ def crawl_by_url(url, article):
     article['content'] = repr(contenteng)[1:-1].strip()
     try:
         article['subtitle'] = summarize(article['content'])
-    except:
+    except (ValueError, KeyError, TypeError):
         article['subtitle'] = translate(summary)
     article['publishDate'] = datemodifier(
         encode(page.xpath(xpath_dict[domain]['publishdate'])),
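
One detail worth noting in the extract_from_pdf_by_pattern hunk: requests.exceptions.ReadTimeout is already a subclass of requests.exceptions.RequestException, so listing both is harmless but redundant. A hedged sketch of the fetch step that handler protects (the URL handling and empty-bytes fallback are illustrative assumptions, not code from this repo):

import logging
import requests

def fetch_pdf_bytes(url, timeout=30):
    # Catch only requests' own network errors and log them; anything else
    # (programming errors, keyboard interrupts) still propagates.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout) as error:
        logging.error(error)
        return b''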
source/cbirc.py (CHANGED)

@@ -72,8 +72,8 @@ def crawl(delta):
             article['author'] = ''
             try:
                 article['subtitle'] = summarize(article['content'])
-            except:
+            except (RuntimeError, ValueError):
                 article['subtitle'] = translate(summary)
             update_content(article)
-        except Exception as error:
+        except (ValueError, KeyError, TypeError) as error:
             logger.error(error)
source/csrc.py (CHANGED)

@@ -15,7 +15,6 @@ from controllers.utils import (
     fetch_url,
     sentiment_computation,
     translate,
-    update_content,
 )

 @task(name = "Data Collection - csrc", log_prints = True)

@@ -73,9 +72,9 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 logger.info(f"Processing article URL: {url}")
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
-        except Exception as error:
+        except (urllib.error.URLError, etree.XMLSyntaxError, ValueError) as error:
             i = -1
             logger.error(error)

@@ -117,8 +116,8 @@ def crawl(delta):
             article['content'] = repr(contenteng)[1:-1].strip()
             try:
                 article['subtitle'] = summarize(article['content'])
-            except:
-                article['subtitle'] =
+            except (RuntimeError, ValueError):
+                article['subtitle'] = ""
             article['publishDate'] = time.strftime(
                 "%Y-%m-%d",
                 time.strptime(article['publishedTimeStr'],

@@ -132,7 +131,6 @@ def crawl(delta):
                 article['titleCN'] + article['publishDate'])
             logger.info(article)
             # update_content(article)
-        except Exception as error:
+        except (ValueError, KeyError, TypeError) as error:
             i = -1
             logger.error(error)
-
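
The crawler modules (csrc, gov, mof, mofcom, ndrc, safe, stats) converge on the same handler: urllib.error.URLError for network failures plus lxml's etree.XMLSyntaxError for unparseable pages. A minimal sketch of that fetch-and-parse shape, assuming urllib and lxml as in the diff (the URL, XPath, and helper name below are placeholders, not from the repo):

import logging
import urllib.error
import urllib.request

from lxml import etree

logger = logging.getLogger(__name__)

def fetch_link_titles(url):
    # Network and parse failures are logged and turned into an empty result,
    # so one bad listing page does not abort the whole crawl loop.
    try:
        with urllib.request.urlopen(url, timeout=30) as response:
            page = etree.HTML(response.read())
        if page is None:
            return []
        return [t.strip() for t in page.xpath('//a/text()')]
    except (urllib.error.URLError, etree.XMLSyntaxError) as error:
        logger.error(error)
        return []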
source/eastmoney.py (CHANGED)

@@ -70,7 +70,7 @@ def _crawl(url, article):
     print(f'INFO - {article}')
     try:
         article['subtitle'] = summarize(article['content'])
-    except:
+    except (RuntimeError, ValueError):
         article['subtitle'] = translate(summary)
     article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
     article['publishDate'] = datemodifier(

@@ -127,7 +127,7 @@ def crawl(delta):
             try:
                 url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                 _crawl(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, json.JSONDecodeError, KeyError) as error:
                 logger.error(error)
         else:
             i = -1
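
eastmoney.py additionally lists json.JSONDecodeError and KeyError, which fits a JSON API endpoint where the response body may fail to decode or lack an expected field. A hedged sketch of that shape (the endpoint handling, field name, and helper are illustrative, not taken from the repo):

import json
import logging
import urllib.error
import urllib.request

logger = logging.getLogger(__name__)

def fetch_report_entries(url):
    # Treat network errors, malformed JSON, and a missing 'data' key as a
    # logged, recoverable miss rather than letting them crash the crawl.
    try:
        with urllib.request.urlopen(url, timeout=30) as response:
            payload = json.loads(response.read().decode('utf-8'))
        return payload['data']
    except (urllib.error.URLError, json.JSONDecodeError, KeyError) as error:
        logger.error(error)
        return []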
source/gov.py (CHANGED)

@@ -52,7 +52,7 @@ def crawl(delta):
                 if "https://www.gov.cn" in url:
                     article['category'] = "Policy Interpretation"
                     crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
     i = 0
     while i > -1:

@@ -85,5 +85,5 @@ def crawl(delta):
                 if "https://www.gov.cn" in url:
                     article['site'] = "State Council of China"
                     crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
source/mof.py (CHANGED)

@@ -55,7 +55,7 @@ def crawl(delta):
                     "./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                 article['category'] = "Financial News"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)

     i = 0

@@ -90,5 +90,5 @@ def crawl(delta):
                 url = url.replace("./", category_url)
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
source/mofcom.py (CHANGED)

@@ -59,8 +59,8 @@ def crawl(delta):
                 else:
                     article['category'] = "Policy Release"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
-        except Exception as error:
+        except (urllib.error.URLError, etree.XMLSyntaxError) as error:
             i = -1
             logger.error(error)
source/ndrc.py (CHANGED)

@@ -64,5 +64,5 @@ def crawl(delta):
                 url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
source/safe.py (CHANGED)

@@ -51,7 +51,7 @@ def crawl(delta):
                 url = "https://www.safe.gov.cn" + url
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)

     i = 1

@@ -84,5 +84,5 @@ def crawl(delta):
                 url = "https://www.safe.gov.cn" + url
                 article['category'] = "Data Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
source/stats.py (CHANGED)

@@ -54,5 +54,5 @@ def crawl(delta):
                 url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
                 article['category'] = "Data Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.info(error)