Implement retry mechanism for URL requests in crawl function to enhance reliability
source/stats.py CHANGED: +15 -9
@@ -33,15 +33,21 @@ def crawl(delta):
         else:
             category_url = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
         i = i + 1
-
-
-
-
-
-
-
-
-
+        retries = 3
+        while retries > 0:
+            try:
+                req = urllib.request.urlopen(category_url)
+                text = req.read()
+                html_text = text.decode("utf-8")
+                page = etree.HTML(html_text)
+                articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+            except (urllib.error.URLError, http.client.IncompleteRead) as error:
+                logger.info(error)
+                retries -= 1
+                if retries > 0:
+                    time.sleep(5)  # Wait for 5 seconds before retrying
+                else:
+                    continue  # Skip to the next URL after retries are exhausted
         for article in articlelist:
             if isinstance(article, etree._Element):
                 subelement = etree.tostring(article).decode()
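
Review note on the added retry loop: on a successful fetch nothing breaks out of the `while` and `retries` is never decremented, so the same page keeps being re-requested; and the `continue` in the exhausted branch re-tests the `while` condition rather than skipping to the next URL, so control still falls through to `for article in articlelist` with `articlelist` possibly unset. Below is a minimal sketch of one way to restructure this, assuming the module-level imports and logger that stats.py presumably already has; `fetch_articles` is a hypothetical helper, not a function from this commit:

import http.client
import logging
import time
import urllib.error
import urllib.request

from lxml import etree

logger = logging.getLogger(__name__)

def fetch_articles(category_url, retries=3, delay=5):
    """Fetch one category page with retries; return the article <li> nodes,
    or None once every attempt has failed. (Hypothetical helper, a sketch only.)"""
    while retries > 0:
        try:
            req = urllib.request.urlopen(category_url)
            html_text = req.read().decode("utf-8")
            page = etree.HTML(html_text)
            return page.xpath("//div[contains(@class, 'list-content')]/ul/li")
        except (urllib.error.URLError, http.client.IncompleteRead) as error:
            logger.info(error)
            retries -= 1
            if retries > 0:
                time.sleep(delay)  # Back off briefly before the next attempt
    return None

The call site inside crawl would then read `articlelist = fetch_articles(category_url)` followed by `if articlelist is None: continue`, so that the `continue` genuinely advances the outer loop over category URLs instead of the retry loop.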
|