Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

App Files Community

OxbridgeEconomics committed on Jun 13, 2024

Commit

efcd6b8

1 Parent(s): 214d268

commit

Browse files

Files changed (1) hide show

daily.py +32 -28

daily.py CHANGED Viewed

@@ -62,34 +62,38 @@ while i > -1:
 print("csrc.gov.cn")
 i = 1
 while i > -1:
-    if i == 1:
-        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
-    else:
-        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = encode(subpage.xpath("//span[@class='date']"))
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = "http://www.csrc.gov.cn" + url
-                        article['category']= "Policy Interpretation"
-                        crawl(url, article)
-                    except Exception as error:
-                       print(error)
 i = 1
 while i > -1:

 print("csrc.gov.cn")
 i = 1
 while i > -1:
+    try:
+        if i == 1:
+            CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
+        else:
+            CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = encode(subpage.xpath("//span[@class='date']"))
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = "http://www.csrc.gov.cn" + url
+                            article['category']= "Policy Interpretation"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+    except Exception as error:
+        i = -1
+        print(error)
 i = 1
 while i > -1: