Implement retry mechanism in _crawl function to handle IncompleteRead exceptions and improve URL fetching reliability
source/eastmoney.py  +13 -7
@@ -1,4 +1,5 @@
 """Module to crawl the website 'eastmoney.com' to fetch and process articles."""
+import time
 import json
 import urllib.request
 import uuid
@@ -25,7 +26,7 @@ with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
 
 
-def _crawl(url, article):
+def _crawl(url, article, retries=3):
     """
     Crawls the given URL and extracts information from the webpage.
 
@@ -42,12 +43,17 @@ def _crawl(url, article):
 
     """
     domain = urlparse(url).netloc
-
-
-
-
-
-
+    for attempt in range(retries):
+        try:
+            req = urllib.request.urlopen(url)
+            text = req.read()
+            break
+        except IncompleteRead as e:
+            if attempt < retries - 1:
+                time.sleep(1)  # Wait before retrying
+                continue
+            else:
+                raise e
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
     contentcn, summary = encode_content(
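
For reference, here is a minimal, self-contained sketch of the retry pattern this commit introduces. It assumes IncompleteRead comes from http.client (the hunks shown do not add that import, so it may already exist elsewhere in the file), that the six removed lines held the previous retry-free fetch code (their content is not captured in this view), and that a fixed one-second pause between attempts is intended. The helper name _fetch_with_retries and the backoff parameter are illustrative, not part of the module.

    import time
    import urllib.request
    from http.client import IncompleteRead  # Assumed source of the exception


    def _fetch_with_retries(url, retries=3, backoff=1.0):
        """Illustrative helper: fetch raw bytes, retrying on IncompleteRead."""
        for attempt in range(retries):
            try:
                # The context manager closes the connection even if read() fails
                with urllib.request.urlopen(url) as resp:
                    return resp.read()
            except IncompleteRead:
                if attempt == retries - 1:
                    raise  # Out of attempts: propagate the final failure
                time.sleep(backoff)  # Wait before retrying

The loop in _crawl plays the same role inline: break leaves the loop once text is read successfully, and the exception only escapes after the final attempt. Note that urllib.request.urlopen can also raise URLError or HTTPError; only truncated reads are caught here, so other failures still surface immediately.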
|