OxbridgeEconomics
committed on
Commit
·
d05c91c
1
Parent(s):
061b1de
commit
Browse files
daily.py
CHANGED
@@ -22,7 +22,7 @@ with open('xpath.json', 'r', encoding='UTF-8') as f:
|
|
22 |
DELTA = int(os.environ.get('DELTA', '1'))
|
23 |
print(f"DELTA = {DELTA}")
|
24 |
|
25 |
-
|
26 |
i = 1
|
27 |
while i > -1:
|
28 |
CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
|
@@ -59,7 +59,7 @@ while i > -1:
|
|
59 |
except Exception as error:
|
60 |
print(error)
|
61 |
|
62 |
-
|
63 |
i = 1
|
64 |
while i > -1:
|
65 |
if i == 1:
|
@@ -126,7 +126,7 @@ while i > -1:
|
|
126 |
except Exception as error:
|
127 |
print(error)
|
128 |
|
129 |
-
|
130 |
def crawl_eastmoney(url, article):
|
131 |
domain = urlparse(url).netloc
|
132 |
req = urllib.request.urlopen(url)
|
@@ -173,7 +173,6 @@ while i > -1:
|
|
173 |
"qType": "3",
|
174 |
}
|
175 |
URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
|
176 |
-
print(URL)
|
177 |
content = fetch_url(URL)
|
178 |
if content:
|
179 |
start_index = content.find("(")
|
@@ -191,12 +190,11 @@ while i > -1:
|
|
191 |
except Exception as error:
|
192 |
print(error)
|
193 |
else:
|
194 |
-
print(reportinfo)
|
195 |
i = -1
|
196 |
else:
|
197 |
print("Failed to fetch URL:", url)
|
198 |
|
199 |
-
|
200 |
i = 0
|
201 |
while i > -1:
|
202 |
if i == 0:
|
@@ -261,7 +259,7 @@ while i > -1:
|
|
261 |
except Exception as error:
|
262 |
print(error)
|
263 |
|
264 |
-
|
265 |
i = 0
|
266 |
while i > -1:
|
267 |
if i == 0:
|
@@ -269,7 +267,6 @@ while i > -1:
|
|
269 |
else:
|
270 |
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
|
271 |
i = i + 1
|
272 |
-
print(CATEGORY_URL)
|
273 |
req = urllib.request.urlopen(CATEGORY_URL)
|
274 |
text = req.read()
|
275 |
html_text = text.decode("utf-8")
|
@@ -290,7 +287,6 @@ while i > -1:
|
|
290 |
article = {}
|
291 |
url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
|
292 |
url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
|
293 |
-
print(url)
|
294 |
article['category']= "Financial News"
|
295 |
crawl(url, article)
|
296 |
except Exception as error:
|
@@ -303,7 +299,6 @@ while i > -1:
|
|
303 |
else:
|
304 |
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
|
305 |
i = i + 1
|
306 |
-
print(CATEGORY_URL)
|
307 |
req = urllib.request.urlopen(CATEGORY_URL)
|
308 |
text = req.read()
|
309 |
html_text = text.decode("utf-8")
|
@@ -329,7 +324,7 @@ while i > -1:
|
|
329 |
except Exception as error:
|
330 |
print(error)
|
331 |
|
332 |
-
|
333 |
categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
|
334 |
for category in categories:
|
335 |
i = 1
|
@@ -366,7 +361,7 @@ for category in categories:
|
|
366 |
except Exception as error:
|
367 |
print(error)
|
368 |
|
369 |
-
|
370 |
i = 0
|
371 |
while i > -1:
|
372 |
if i == 0:
|
@@ -405,7 +400,7 @@ while i > -1:
|
|
405 |
except Exception as error:
|
406 |
print(error)
|
407 |
|
408 |
-
|
409 |
i = 1
|
410 |
while i > -1:
|
411 |
if i == 1:
|
@@ -468,7 +463,7 @@ while i > -1:
|
|
468 |
except Exception as error:
|
469 |
print(error)
|
470 |
|
471 |
-
|
472 |
i = 0
|
473 |
while i > -1:
|
474 |
if i == 0:
|
|
|
22 |
DELTA = int(os.environ.get('DELTA', '1'))
|
23 |
print(f"DELTA = {DELTA}")
|
24 |
|
25 |
+
print("cbirc.gov.cn")
|
26 |
i = 1
|
27 |
while i > -1:
|
28 |
CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
|
|
|
59 |
except Exception as error:
|
60 |
print(error)
|
61 |
|
62 |
+
print("csrc.gov.cn")
|
63 |
i = 1
|
64 |
while i > -1:
|
65 |
if i == 1:
|
|
|
126 |
except Exception as error:
|
127 |
print(error)
|
128 |
|
129 |
+
print("data.eastmoney.com")
|
130 |
def crawl_eastmoney(url, article):
|
131 |
domain = urlparse(url).netloc
|
132 |
req = urllib.request.urlopen(url)
|
|
|
173 |
"qType": "3",
|
174 |
}
|
175 |
URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
|
|
|
176 |
content = fetch_url(URL)
|
177 |
if content:
|
178 |
start_index = content.find("(")
|
|
|
190 |
except Exception as error:
|
191 |
print(error)
|
192 |
else:
|
|
|
193 |
i = -1
|
194 |
else:
|
195 |
print("Failed to fetch URL:", url)
|
196 |
|
197 |
+
print("gov.cn")
|
198 |
i = 0
|
199 |
while i > -1:
|
200 |
if i == 0:
|
|
|
259 |
except Exception as error:
|
260 |
print(error)
|
261 |
|
262 |
+
print("mof.gov.cn")
|
263 |
i = 0
|
264 |
while i > -1:
|
265 |
if i == 0:
|
|
|
267 |
else:
|
268 |
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
|
269 |
i = i + 1
|
|
|
270 |
req = urllib.request.urlopen(CATEGORY_URL)
|
271 |
text = req.read()
|
272 |
html_text = text.decode("utf-8")
|
|
|
287 |
article = {}
|
288 |
url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
|
289 |
url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
|
|
|
290 |
article['category']= "Financial News"
|
291 |
crawl(url, article)
|
292 |
except Exception as error:
|
|
|
299 |
else:
|
300 |
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
|
301 |
i = i + 1
|
|
|
302 |
req = urllib.request.urlopen(CATEGORY_URL)
|
303 |
text = req.read()
|
304 |
html_text = text.decode("utf-8")
|
|
|
324 |
except Exception as error:
|
325 |
print(error)
|
326 |
|
327 |
+
print("mofcom.gov.cn")
|
328 |
categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
|
329 |
for category in categories:
|
330 |
i = 1
|
|
|
361 |
except Exception as error:
|
362 |
print(error)
|
363 |
|
364 |
+
print("ndrc.gov.cn")
|
365 |
i = 0
|
366 |
while i > -1:
|
367 |
if i == 0:
|
|
|
400 |
except Exception as error:
|
401 |
print(error)
|
402 |
|
403 |
+
print("safe.gov.cn")
|
404 |
i = 1
|
405 |
while i > -1:
|
406 |
if i == 1:
|
|
|
463 |
except Exception as error:
|
464 |
print(error)
|
465 |
|
466 |
+
print("stats.gov.hk")
|
467 |
i = 0
|
468 |
while i > -1:
|
469 |
if i == 0:
|