Implement retry mechanism in _crawl function to handle IncompleteRead exceptions and improve URL fetching reliability
source/eastmoney.py  +13 -7
@@ -1,4 +1,5 @@
 """Module to crawl the website 'eastmoney.com' to fetch and process articles."""
+import time
 import json
 import urllib.request
 import uuid
@@ -25,7 +26,7 @@ with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
 
 
-def _crawl(url, article):
+def _crawl(url, article, retries=3):
     """
     Crawls the given URL and extracts information from the webpage.
 
@@ -42,12 +43,17 @@ def _crawl(url, article):
 
     """
     domain = urlparse(url).netloc
-
-
-
-
-
-
+    for attempt in range(retries):
+        try:
+            req = urllib.request.urlopen(url)
+            text = req.read()
+            break
+        except IncompleteRead as e:
+            if attempt < retries - 1:
+                time.sleep(1)  # Wait before retrying
+                continue
+            else:
+                raise e
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
     contentcn, summary = encode_content(
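
For reference, here is a minimal, self-contained sketch of the retry pattern this commit introduces. It assumes IncompleteRead comes from http.client (the hunks shown do not add that import, so it may already exist elsewhere in the file), that the six removed lines held the previous retry-free fetch code (their content is not captured in this view), and that a fixed one-second pause between attempts is intended. The helper name _fetch_with_retries and the backoff parameter are illustrative, not part of the module.

    import time
    import urllib.request
    from http.client import IncompleteRead  # Assumed source of the exception


    def _fetch_with_retries(url, retries=3, backoff=1.0):
        """Illustrative helper: fetch raw bytes, retrying on IncompleteRead."""
        for attempt in range(retries):
            try:
                # The context manager closes the connection even if read() fails
                with urllib.request.urlopen(url) as resp:
                    return resp.read()
            except IncompleteRead:
                if attempt == retries - 1:
                    raise  # Out of attempts: propagate the final failure
                time.sleep(backoff)  # Wait before retrying

The loop in _crawl plays the same role inline: break leaves the loop once text is read successfully, and the exception only escapes after the final attempt. Note that urllib.request.urlopen can also raise URLError or HTTPError; only truncated reads are caught here, so other failures still surface immediately.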
|