Increase timeout for URL requests in crawl functions to enhance reliability
Files changed:
- controllers/utils.py +1 -1
- source/csrc.py +3 -3
- source/eastmoney.py +2 -2
- source/gov.py +2 -2
- source/mof.py +2 -2
- source/mofcom.py +1 -1
- source/ndrc.py +1 -1
- source/safe.py +2 -2
- source/stats.py +1 -1
controllers/utils.py
CHANGED
@@ -654,7 +654,7 @@ def crawl_by_url(url, article):
 
     """
     domain = '.'.join(urlparse(url).netloc.split('.')[1:])
-    req = urllib.request.urlopen(url, timeout=
+    req = urllib.request.urlopen(url, timeout=60)
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
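Every hunk in this commit applies the same one-line pattern: pass an explicit timeout to urllib.request.urlopen so a stalled host fails fast instead of hanging the crawl indefinitely. A minimal sketch of the pattern (the URL is a placeholder, not taken from the repo):

import urllib.request

# Without a timeout, urlopen can block indefinitely on an unresponsive
# server; with one, a stalled request raises instead and can be retried.
# The URL below is a placeholder for illustration only.
with urllib.request.urlopen("https://example.com", timeout=60) as resp:
    html_text = resp.read().decode("utf-8")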
source/csrc.py
CHANGED
@@ -47,7 +47,7 @@ def crawl(delta):
         category_url,
         headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
     )
-    response = urllib.request.urlopen(req, timeout=
+    response = urllib.request.urlopen(req, timeout=60)
     text = response.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
@@ -70,11 +70,11 @@ def crawl(delta):
                 article = {}
                 url = "http://www.csrc.gov.cn" + url
                 article['category'] = "Policy Interpretation"
-                logger.info(
+                logger.info("Processing article URL: %s", url)
                 crawl_by_url(url, article)
             except (urllib.error.URLError, etree.XMLSyntaxError) as error:
                 logger.error(error)
-        except (urllib.error.URLError, etree.XMLSyntaxError, ValueError) as error:
+        except (urllib.error.URLError, etree.XMLSyntaxError, ValueError, TimeoutError) as error:
             i = -1
             logger.error(error)
 
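The broadened outer except matters because of how urlopen surfaces timeouts. A sketch of the two failure paths, assuming Python 3.10+ (where socket.timeout is an alias of TimeoutError):

import urllib.error
import urllib.request

# A timeout while reading the body raises TimeoutError directly; a timeout
# during connection setup is wrapped in urllib.error.URLError, which the
# existing clause already caught. (On Python < 3.10, catch socket.timeout.)
try:
    with urllib.request.urlopen("http://www.csrc.gov.cn", timeout=60) as resp:
        body = resp.read()
except TimeoutError as err:
    print("read timeout:", err)
except urllib.error.URLError as err:
    print("connect error:", err.reason)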
source/eastmoney.py
CHANGED
@@ -47,10 +47,10 @@ def _crawl(url, article, retries=3):
     domain = urlparse(url).netloc
     for attempt in range(retries):
         try:
-            req = urllib.request.urlopen(url, timeout=
+            req = urllib.request.urlopen(url, timeout=60)
             text = req.read()
             break
-        except IncompleteRead as e:
+        except (IncompleteRead, TimeoutError) as e:
             if attempt == retries - 1:
                 time.sleep(1)  # Wait before retrying
                 continue
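source/eastmoney.py already wraps its fetch in a bounded retry loop, so the diff only adds TimeoutError to the retryable set. The same idea as a self-contained helper (the name fetch_with_retry and the back-off are illustrative, not part of the repo):

import time
import urllib.request
from http.client import IncompleteRead

def fetch_with_retry(url, timeout=60, retries=3):
    """Hypothetical helper: retry transient failures (truncated reads,
    timeouts) a fixed number of times, then re-raise."""
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as resp:
                return resp.read()
        except (IncompleteRead, TimeoutError):
            if attempt == retries - 1:
                raise
            time.sleep(1)  # brief pause before the next attempt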
source/gov.py
CHANGED
@@ -28,7 +28,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
@@ -61,7 +61,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
source/mof.py
CHANGED
@@ -28,7 +28,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
@@ -65,7 +65,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
source/mofcom.py
CHANGED
@@ -31,7 +31,7 @@ def crawl(delta):
         url = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
         i = i + 1
         try:
-            req = urllib.request.urlopen(url, timeout=
+            req = urllib.request.urlopen(url, timeout=60)
             text = req.read()
             html_text = text.decode("utf-8")
             page = etree.HTML(html_text)
source/ndrc.py
CHANGED
@@ -31,7 +31,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
source/safe.py
CHANGED
@@ -29,7 +29,7 @@ def crawl(delta):
         category_url = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
         i = i + 1
         try:
-            req = urllib.request.urlopen(category_url, timeout=
+            req = urllib.request.urlopen(category_url, timeout=60)
             text = req.read()
             html_text = text.decode("utf-8")
             page = etree.HTML(html_text)
@@ -65,7 +65,7 @@ def crawl(delta):
         else:
             category_url = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
         i = i + 1
-        req = urllib.request.urlopen(category_url, timeout=
+        req = urllib.request.urlopen(category_url, timeout=60)
         text = req.read()
         html_text = text.decode("utf-8")
         page = etree.HTML(html_text)
source/stats.py
CHANGED
@@ -36,7 +36,7 @@ def crawl(delta):
         retries = 3
         while retries > 0:
             try:
-                req = urllib.request.urlopen(category_url, timeout=
+                req = urllib.request.urlopen(category_url, timeout=60)
                 text = req.read()
                 html_text = text.decode("utf-8")
                 page = etree.HTML(html_text)
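source/stats.py drives its retries with a countdown (while retries > 0) rather than eastmoney.py's for loop; the decrement and the exception arm sit outside this hunk, so the sketch below fills them in with assumed details:

import urllib.request

category_url = "https://example.com"  # placeholder, not from the repo
retries = 3
while retries > 0:
    try:
        req = urllib.request.urlopen(category_url, timeout=60)
        text = req.read()
        break
    except TimeoutError:
        # Assumed handler: the real decrement/except logic is outside the hunk.
        retries -= 1
        if retries == 0:
            raise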