gavinzli committed
Commit d710384 · 1 Parent(s): dcdb6e8

Handle IncompleteRead exception in _crawl function to prevent crashes during URL fetching

Files changed (1)
source/eastmoney.py +6 -1
source/eastmoney.py CHANGED
@@ -4,6 +4,7 @@ import urllib.request
 import uuid
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
+from http.client import IncompleteRead
 
 from prefect import task, get_run_logger
 from lxml import etree
@@ -42,7 +43,11 @@ def _crawl(url, article):
     """
     domain = urlparse(url).netloc
     req = urllib.request.urlopen(url)
-    text = req.read()
+    try:
+        text = req.read()
+    except IncompleteRead as e:
+        print(e)
+        return
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
     contentcn, summary = encode_content(
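
For context, a standalone sketch of the same pattern: http.client.IncompleteRead keeps the bytes received before the connection dropped in its partial attribute, so a handler can salvage a truncated body rather than discard it. fetch_body is a hypothetical helper, and falling back to e.partial is an assumption for illustration; the committed code simply prints the error and returns None.

import logging
import urllib.request
from http.client import IncompleteRead

logger = logging.getLogger(__name__)

def fetch_body(url):
    """Read a response body, tolerating a connection that closes early."""
    req = urllib.request.urlopen(url)
    try:
        return req.read()
    except IncompleteRead as e:
        # IncompleteRead.partial holds the bytes received before the drop;
        # returning them (instead of printing and returning None, as the
        # commit does) is an assumption for illustration only.
        logger.warning("incomplete read from %s, salvaged %d bytes", url, len(e.partial))
        return e.partial

Since get_run_logger is already imported in this module, the warning could equally go through the Prefect run logger when _crawl executes inside a task.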