Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

App Files Community

OxbridgeEconomics committed on Apr 21, 2024

Commit

d05c91c

1 Parent(s): 061b1de

commit

Browse files

Files changed (1) hide show

daily.py +9 -14

daily.py CHANGED Viewed

@@ -22,7 +22,7 @@ with open('xpath.json', 'r', encoding='UTF-8') as f:
 DELTA = int(os.environ.get('DELTA', '1'))
 print(f"DELTA = {DELTA}")
-# cbirc.gov.cn
 i = 1
 while i > -1:
     CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
@@ -59,7 +59,7 @@ while i > -1:
         except Exception as error:
             print(error)
-# csrc.gov.cn
 i = 1
 while i > -1:
     if i == 1:
@@ -126,7 +126,7 @@ while i > -1:
         except Exception as error:
             print(error)
-# data.eastmoney.com
 def crawl_eastmoney(url, article):
     domain = urlparse(url).netloc
     req = urllib.request.urlopen(url)
@@ -173,7 +173,6 @@ while i > -1:
         "qType": "3",
     }
     URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
-    print(URL)
     content = fetch_url(URL)
     if content:
         start_index = content.find("(")
@@ -191,12 +190,11 @@ while i > -1:
                 except Exception as error:
                     print(error)
         else:
-            print(reportinfo)
             i = -1
     else:
         print("Failed to fetch URL:", url)
-# gov.cn
 i = 0
 while i > -1:
     if i == 0:
@@ -261,7 +259,7 @@ while i > -1:
                     except Exception as error:
                         print(error)
-# mof.gov.cn
 i = 0
 while i > -1:
     if i == 0:
@@ -269,7 +267,6 @@ while i > -1:
     else:
         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
     i = i + 1
-    print(CATEGORY_URL)
     req = urllib.request.urlopen(CATEGORY_URL)
     text = req.read()
     html_text = text.decode("utf-8")
@@ -290,7 +287,6 @@ while i > -1:
                         article = {}
                         url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                         url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                        print(url)
                         article['category']= "Financial News"
                         crawl(url, article)
                     except Exception as error:
@@ -303,7 +299,6 @@ while i > -1:
     else:
         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
     i = i + 1
-    print(CATEGORY_URL)
     req = urllib.request.urlopen(CATEGORY_URL)
     text = req.read()
     html_text = text.decode("utf-8")
@@ -329,7 +324,7 @@ while i > -1:
                     except Exception as error:
                         print(error)
-# mofcom.gov.cn
 categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
 for category in categories:
     i = 1
@@ -366,7 +361,7 @@ for category in categories:
                         except Exception as error:
                             print(error)
-# ndrc.gov.cn
 i = 0
 while i > -1:
     if i == 0:
@@ -405,7 +400,7 @@ while i > -1:
                     except Exception as error:
                         print(error)
-# safe.gov.cn
 i = 1
 while i > -1:
     if i == 1:
@@ -468,7 +463,7 @@ while i > -1:
                     except Exception as error:
                         print(error)
-# stats.gov.hk
 i = 0
 while i > -1:
     if i == 0:

 DELTA = int(os.environ.get('DELTA', '1'))
 print(f"DELTA = {DELTA}")
+print("cbirc.gov.cn")
 i = 1
 while i > -1:
     CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
         except Exception as error:
             print(error)
+print("csrc.gov.cn")
 i = 1
 while i > -1:
     if i == 1:
         except Exception as error:
             print(error)
+print("data.eastmoney.com")
 def crawl_eastmoney(url, article):
     domain = urlparse(url).netloc
     req = urllib.request.urlopen(url)
         "qType": "3",
     }
     URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
     content = fetch_url(URL)
     if content:
         start_index = content.find("(")
                 except Exception as error:
                     print(error)
         else:
             i = -1
     else:
         print("Failed to fetch URL:", url)
+print("gov.cn")
 i = 0
 while i > -1:
     if i == 0:
                     except Exception as error:
                         print(error)
+print("mof.gov.cn")
 i = 0
 while i > -1:
     if i == 0:
     else:
         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
     i = i + 1
     req = urllib.request.urlopen(CATEGORY_URL)
     text = req.read()
     html_text = text.decode("utf-8")
                         article = {}
                         url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                         url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                         article['category']= "Financial News"
                         crawl(url, article)
                     except Exception as error:
     else:
         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
     i = i + 1
     req = urllib.request.urlopen(CATEGORY_URL)
     text = req.read()
     html_text = text.decode("utf-8")
                     except Exception as error:
                         print(error)
+print("mofcom.gov.cn")
 categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
 for category in categories:
     i = 1
                         except Exception as error:
                             print(error)
+print("ndrc.gov.cn")
 i = 0
 while i > -1:
     if i == 0:
                     except Exception as error:
                         print(error)
+print("safe.gov.cn")
 i = 1
 while i > -1:
     if i == 1:
                     except Exception as error:
                         print(error)
+print("stats.gov.hk")
 i = 0
 while i > -1:
     if i == 0: