gavinzli committed
Commit d705151 · 1 Parent(s): c39d841

Refactor exception handling in multiple files to specify exception types and improve logging

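Every change in this commit follows the same pattern: a bare "except:" (or a blanket "except Exception") is narrowed to the exception types the guarded call is expected to raise, and the caught error is logged instead of being silently swallowed. A minimal sketch of the before/after shape, assuming a hypothetical fetch_page helper that is not taken from this repo:

import logging
import urllib.error
import urllib.request

logger = logging.getLogger(__name__)

def fetch_page(url):
    # Illustrative helper; the function name and the empty-string fallback
    # are assumptions, not code from this repository.
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            return response.read().decode('utf-8', errors='replace')
    # Before this commit the handler read "except:", which also swallows
    # SystemExit and KeyboardInterrupt and hides the failure reason.
    # Narrowing to the errors urlopen can actually raise, and logging them,
    # keeps unexpected bugs loud while still handling network failures.
    except (urllib.error.URLError, TimeoutError) as error:
        logger.error(error)
        return ''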
.gitignore CHANGED
@@ -1,4 +1,4 @@
-.env
+.venv
 data
 venv
 __pycache__
controllers/utils.py CHANGED
@@ -115,7 +115,7 @@ def encode_content(content):
         text = text[:index]
     try:
         summary = '\n'.join(text.split('\n')[:2])
-    except Exception as e:
+    except (IndexError, AttributeError) as e:
         logging.error(e)
         summary = text
     return text, summary
@@ -382,7 +382,8 @@ def extract_from_pdf_by_pattern(url, pattern):
             else:
                 text = text.strip()
             extracted_text += text
-    except:
+    except (requests.exceptions.RequestException, requests.exceptions.ReadTimeout) as e:
+        logging.error(e)
         extracted_text = ''
     return extracted_text.replace('?\n', '?-\n').replace(
         '!\n', '!-\n').replace('。\n', '。-\n').replace('\n', ' ').replace(
@@ -446,7 +447,7 @@ def extract_reference(row):
         for title, date in zip(reference_titles, reference_dates):
             try:
                 date = datetime.strptime(date, pattern['date_format'])
-            except:
+            except ValueError:
                 date = datetime(2006, 1, 1)
         dates = []
         if 'date_range' in pattern:
@@ -590,8 +591,9 @@ def extract_from_pdf(url):
             extracted_text += text
     try:
         summary = '\n'.join(extracted_text.split('\n')[:2])
-    except:
-        summary = text
+    except (ValueError, KeyError, TypeError) as e:
+        logging.error(e)
+        summary = extracted_text
     return extracted_text, summary


@@ -651,7 +653,7 @@ def crawl_by_url(url, article):
         article['content'] = repr(contenteng)[1:-1].strip()
         try:
             article['subtitle'] = summarize(article['content'])
-        except:
+        except (ValueError, KeyError, TypeError):
             article['subtitle'] = translate(summary)
         article['publishDate'] = datemodifier(
             encode(page.xpath(xpath_dict[domain]['publishdate'])),
source/cbirc.py CHANGED
@@ -72,8 +72,8 @@ def crawl(delta):
             article['author'] = ''
             try:
                 article['subtitle'] = summarize(article['content'])
-            except:
+            except (RuntimeError, ValueError):
                 article['subtitle'] = translate(summary)
             update_content(article)
-        except Exception as error:
+        except (ValueError, KeyError, TypeError) as error:
             logger.error(error)
source/csrc.py CHANGED
@@ -15,7 +15,6 @@ from controllers.utils import (
     fetch_url,
     sentiment_computation,
     translate,
-    update_content,
 )

 @task(name = "Data Collection - csrc", log_prints = True)
@@ -73,9 +72,9 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 logger.info(f"Processing article URL: {url}")
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
-        except Exception as error:
+        except (urllib.error.URLError, etree.XMLSyntaxError, ValueError) as error:
             i = -1
             logger.error(error)

@@ -117,8 +116,8 @@ def crawl(delta):
                 article['content'] = repr(contenteng)[1:-1].strip()
                 try:
                     article['subtitle'] = summarize(article['content'])
-                except:
-                    article['subtitle'] = translate(summary)
+                except (RuntimeError, ValueError):
+                    article['subtitle'] = ""
                 article['publishDate'] = time.strftime(
                     "%Y-%m-%d",
                     time.strptime(article['publishedTimeStr'],
@@ -132,7 +131,6 @@ def crawl(delta):
                     article['titleCN'] + article['publishDate'])
                 logger.info(article)
                 # update_content(article)
-        except Exception as error:
+        except (ValueError, KeyError, TypeError) as error:
             i = -1
             logger.error(error)
-
source/eastmoney.py CHANGED
@@ -70,7 +70,7 @@ def _crawl(url, article):
     print(f'INFO - {article}')
     try:
         article['subtitle'] = summarize(article['content'])
-    except:
+    except (RuntimeError, ValueError):
         article['subtitle'] = translate(summary)
     article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
     article['publishDate'] = datemodifier(
@@ -127,7 +127,7 @@ def crawl(delta):
             try:
                 url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                 _crawl(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, json.JSONDecodeError, KeyError) as error:
                 logger.error(error)
         else:
             i = -1
source/gov.py CHANGED
@@ -52,7 +52,7 @@ def crawl(delta):
                 if "https://www.gov.cn" in url:
                     article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
     i = 0
     while i > -1:
@@ -85,5 +85,5 @@ def crawl(delta):
                 if "https://www.gov.cn" in url:
                     article['site'] = "State Council of China"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
source/mof.py CHANGED
@@ -55,7 +55,7 @@ def crawl(delta):
                     "./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                 article['category'] = "Financial News"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)

     i = 0
@@ -90,5 +90,5 @@ def crawl(delta):
                 url = url.replace("./", category_url)
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
source/mofcom.py CHANGED
@@ -59,8 +59,8 @@ def crawl(delta):
                 else:
                     article['category'] = "Policy Release"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
-        except Exception as error:
+        except (urllib.error.URLError, etree.XMLSyntaxError) as error:
             i = -1
             logger.error(error)
source/ndrc.py CHANGED
@@ -64,5 +64,5 @@ def crawl(delta):
                 url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
source/safe.py CHANGED
@@ -51,7 +51,7 @@ def crawl(delta):
                 url = "https://www.safe.gov.cn" + url
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)

     i = 1
@@ -84,5 +84,5 @@ def crawl(delta):
                 url = "https://www.safe.gov.cn" + url
                 article['category'] = "Data Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
source/stats.py CHANGED
@@ -54,5 +54,5 @@ def crawl(delta):
                 url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
                 article['category'] = "Data Interpretation"
                 crawl_by_url(url, article)
-            except Exception as error:
+            except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.info(error)