gavinzli committed on
Commit 0383f38 · 1 Parent(s): d710384

Implement retry mechanism in _crawl function to handle IncompleteRead exceptions and improve URL fetching reliability

Files changed (1)
  1. source/eastmoney.py +13 -7
source/eastmoney.py CHANGED
@@ -1,4 +1,5 @@
 """Module to crawl the website 'eastmoney.com' to fetch and process articles."""
+import time
 import json
 import urllib.request
 import uuid
@@ -25,7 +26,7 @@ with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
 
 
-def _crawl(url, article):
+def _crawl(url, article, retries=3):
     """
     Crawls the given URL and extracts information from the webpage.
 
@@ -42,12 +43,17 @@ def _crawl(url, article):
 
     """
     domain = urlparse(url).netloc
-    req = urllib.request.urlopen(url)
-    try:
-        text = req.read()
-    except IncompleteRead as e:
-        print(e)
-        return
+    for attempt in range(retries):
+        try:
+            req = urllib.request.urlopen(url)
+            text = req.read()
+            break
+        except IncompleteRead as e:
+            if attempt < retries - 1:
+                time.sleep(1)  # Wait before retrying
+                continue
+            else:
+                raise e
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
     contentcn, summary = encode_content(
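
For context, a minimal standalone sketch of the retry pattern this commit introduces. The wrapper name fetch_with_retry is hypothetical; it assumes IncompleteRead is imported from http.client, which is where urllib raises it from.

import time
import urllib.request
from http.client import IncompleteRead

def fetch_with_retry(url, retries=3):
    """Return the body of `url`, retrying when the read ends prematurely."""
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url) as resp:
                return resp.read()
        except IncompleteRead:
            if attempt == retries - 1:
                raise  # retries exhausted: surface the error to the caller
            time.sleep(1)  # brief pause before the next attempt

Sleeping only between attempts, rather than after the final failure, lets the last exception propagate immediately instead of delaying the caller for no benefit.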