Handle IncompleteRead exception in _crawl function to prevent crashes during URL fetching
source/eastmoney.py CHANGED (+6 -1)
@@ -4,6 +4,7 @@ import urllib.request
 import uuid
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
+from http.client import IncompleteRead

 from prefect import task, get_run_logger
 from lxml import etree
@@ -42,7 +43,11 @@ def _crawl(url, article):
     """
     domain = urlparse(url).netloc
     req = urllib.request.urlopen(url)
-    text = req.read()
+    try:
+        text = req.read()
+    except IncompleteRead as e:
+        print(e)
+        return
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
     contentcn, summary = encode_content(
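For context, the guarded read can be seen in isolation below. This is a minimal standalone sketch of the pattern the patch applies, not code from the repository: fetch_html is a hypothetical helper name, and the remark about e.partial describes one possible extension rather than anything this change does.

    # Minimal sketch of the fetch-and-decode pattern the patch hardens.
    # fetch_html is a hypothetical helper, not part of source/eastmoney.py.
    import urllib.request
    from http.client import IncompleteRead


    def fetch_html(url):
        """Fetch a URL and decode it as UTF-8, returning None on a truncated read."""
        req = urllib.request.urlopen(url)
        try:
            text = req.read()
        except IncompleteRead as e:
            # IncompleteRead carries whatever bytes arrived before the
            # connection was cut short (e.partial), which a caller could
            # use as a best-effort fallback instead of giving up entirely.
            print(f"truncated response from {url}: {e}")
            return None
        return text.decode("utf-8")


    if __name__ == "__main__":
        html = fetch_html("https://example.com/")
        if html is not None:
            print(html[:80])

One design note: since _crawl already imports get_run_logger from Prefect, routing the error through get_run_logger().warning(...) instead of print would surface the failure in the flow's run logs; that is an observation about an alternative, not part of this change.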