Muhammad Abdur Rahman Saad committed
Commit 5068e1f · 1 Parent(s): 9a22ff3

Update daily.py

Files changed (1)
  1. daily.py +411 -418
daily.py CHANGED
@@ -64,468 +64,461 @@ def crawl_eastmoney(url, article):
      extract_reference(article)
      update_content(article)

- def daily():
-     with open('xpath.json', 'r', encoding='UTF-8') as f:
-         xpath_dict = json.load(f)

-     DELTA = int(os.environ.get('DELTA') or '1')
-     print(f"DELTA = {DELTA}")

-     print("cbirc.gov.cn")
-     i = 1
-     while i > -1:
-         CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
-         i = i + 1
-         content = fetch_url(CATEGORY_URL)
-         reportinfo = json.loads(content)
-         for article in reportinfo['data']['rows']:
-             try:
-                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"))
-                 parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
-                     article['contentCN'] = repr(contentCN)[1:-1].strip()
-                     if len(contentCN) < 10:
-                         continue
-                     CONTENT_ENG = ''
-                     for element in article['contentCN'].split("\n"):
-                         CONTENT_ENG += translate(element) + '\n'
-                     article['content'] = repr(CONTENT_ENG)[1:-1].strip()
-                     article['site'] = "National Financial Regulatory Administration of China"
-                     article['originSite'] = "国家金融监督管理总局"
-                     article['titleCN'] = article['docSubtitle']
-                     article['title'] = translate(article['docSubtitle'])
-                     article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
-                     article['category'] = "Policy Interpretation"
-                     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
-                     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                     article['attachment'] = ''
-                     article['author'] = ''
-                     article['subtitle'] = translate(summary)
-                     update_content(article)
-             except Exception as error:
-                 print(error)
-
-     print("csrc.gov.cn")
-     i = 1
-     while i > -1:
          try:
-             if i == 1:
-                 CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
              else:
-                 CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
-             i = i + 1
-             req = urllib.request.urlopen(CATEGORY_URL)
-             text = req.read()
-             html_text = text.decode("utf-8")
-             page = etree.HTML(html_text)
-             articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
-             for article in articlelist:
-                 if isinstance(article, etree._Element):
-                     subelement = etree.tostring(article).decode()
-                     subpage = etree.HTML(subelement)
-                     date = encode(subpage.xpath("//span[@class='date']"))
-                     parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                     if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                         i = -1
-                     else:
-                         urls = subpage.xpath("//a/@href")
-                         for url in urls:
-                             try:
-                                 article = {}
-                                 url = "http://www.csrc.gov.cn" + url
-                                 article['category'] = "Policy Interpretation"
-                                 crawl(url, article)
-                             except Exception as error:
-                                 print(error)
-         except Exception as error:
-             i = -1
-             print(error)
-
-     i = 1
-     while i > -1:
-         CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
-         i = i + 1
-         try:
-             content = fetch_url(CATEGORY_URL)
-             reportinfo = json.loads(content)
-             for article in reportinfo['data']['results']:
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     article['category'] = "Financial News"
-                     article['site'] = "Securities Regulatory Commission of China"
-                     article['originSite'] = "证监会"
-                     article['titleCN'] = article['title']
-                     article['title'] = translate(article['titleCN'])
-                     article['author'] = ''
-                     article['contentCN'] = repr(article['content'])[1:-1].strip()
-                     if len(article['contentCN']) < 10:
-                         continue
-                     CONTENT_ENG = ''
-                     for element in article['contentCN'].split("。"):
-                         CONTENT_ENG += translate(element) + ' '
-                     article['content'] = repr(CONTENT_ENG)[1:-1].strip()
-                     article['subtitle'] = article['memo']
-                     article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
-                     article['link'] = article['url']
-                     article['attachment'] = ""
-                     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
-                     update_content(article)
          except Exception as error:
              print(error)

-     print("data.eastmoney.com")
-
-     today = datetime.today().strftime('%Y-%m-%d')
-     beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
-     i = 0
-     while i > -1:
-         URL = "https://reportapi.eastmoney.com/report/jg"
-         params = {
-             "cb": "datatable8544623",
-             "pageSize": "100",
-             "beginTime": beginDate,
-             "endTime": today,
-             "pageNo": i,
-             "qType": "3",
-         }
-         URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
-         content = fetch_url(URL)
-         if content:
-             start_index = content.find("(")
-             if start_index != -1:
-                 result = content[start_index + 1: -1]
-             else:
-                 result = content
-             reportinfo = json.loads(result)
-             if reportinfo["size"] > 0:
-                 i = i + 1
-                 for article in reportinfo['data']:
-                     try:
-                         url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
-                         crawl_eastmoney(url, article)
-                     except Exception as error:
-                         print(error)
-             else:
-                 i = -1
-         else:
-             print("Failed to fetch URL:", URL)
-
-     print("gov.cn")
-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
          else:
-             CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
          i = i + 1
          req = urllib.request.urlopen(CATEGORY_URL)
          text = req.read()
          html_text = text.decode("utf-8")
          page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
          for article in articlelist:
              if isinstance(article, etree._Element):
                  subelement = etree.tostring(article).decode()
                  subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
                  parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
                  if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                      i = -1
                  else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                      for url in urls:
                          try:
                              article = {}
-                             url = url.replace('../', 'https://www.gov.cn/zhengce/')
-                             if "https://www.gov.cn" in url:
-                                 article['category'] = "Policy Interpretation"
-                                 crawl(url, article)
                          except Exception as error:
                              print(error)

-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
-         else:
-             CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = url.replace('../', 'https://www.gov.cn/zhengce/')
-                             if "https://www.gov.cn" in url:
-                                 article['site'] = "State Council of China"
-                                 crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     print("mof.gov.cn")
-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
-         else:
-             CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
-                             url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                             article['category'] = "Financial News"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
          else:
-             CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = url.replace("./", CATEGORY_URL)
                              article['category'] = "Policy Interpretation"
-                             print(url)
                              crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     print("mofcom.gov.cn")
-     categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
-     for category in categories:
-         i = 1
-         while i > -1:
-             if i == 1:
-                 URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
-             else:
-                 URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
-             i = i + 1
-             try:
-                 req = urllib.request.urlopen(URL)
-                 text = req.read()
-                 html_text = text.decode("utf-8")
-                 page = etree.HTML(html_text)
-                 articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
-                 for article in articlelist:
-                     if isinstance(article, etree._Element):
-                         subelement = etree.tostring(article).decode()
-                         subpage = etree.HTML(subelement)
-                         date = subpage.xpath("//span/text()")[0]
-                         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-                         if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                             i = -1
-                         else:
-                             urls = subpage.xpath("//a/@href")
-                             for url in urls:
-                                 try:
-                                     article = {}
-                                     if '/article/zcjd' in url:
-                                         url = "http://www.mofcom.gov.cn" + url
-                                         article['category'] = "Policy Interpretation"
-                                     else:
-                                         article['category'] = "Policy Release"
-                                     crawl(url, article)
-                                 except Exception as error:
-                                     print(error)
-             except Exception as error:
                  i = -1
-                 print(error)

-     print("ndrc.gov.cn")
-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
-         else:
-             CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y/%m/%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             if "www.gov.cn" in url:
-                                 article['category'] = "Policy Release"
-                             elif "../../zcfb/" in url:
-                                 url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
-                                 article['category'] = "Policy Release"
-                             else:
-                                 url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                                 url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                                 article['category'] = "Policy Interpretation"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     print("safe.gov.cn")
      i = 1
      while i > -1:
          if i == 1:
-             CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
          else:
-             CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
          i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//dd/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = "https://www.safe.gov.cn" + url
-                             article['category'] = "Policy Interpretation"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     i = 1
-     while i > -1:
-         if i == 1:
-             CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
-         else:
-             CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//dd/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = "https://www.safe.gov.cn" + url
-                             article['category'] = "Data Interpretation"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     print("stats.gov.cn")
-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
-         else:
-             CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = encode(subpage.xpath("//span"))
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
-                             article['category'] = "Data Interpretation"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

- def data_collection():
-     daily()
-     glue_job_run()

- if __name__ == '__main__':
-     data_collection()
      extract_reference(article)
      update_content(article)

+ with open('xpath.json', 'r', encoding='UTF-8') as f:
+     xpath_dict = json.load(f)

+ DELTA = int(os.environ.get('DELTA') or '1')
+ print(f"DELTA = {DELTA}")

+ print("cbirc.gov.cn")
+ i = 1
+ while i > -1:
+     CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
+     i = i + 1
+     content = fetch_url(CATEGORY_URL)
+     reportinfo = json.loads(content)
+     for article in reportinfo['data']['rows']:
          try:
+             article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"))
+             parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
              else:
+                 contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+                 article['contentCN'] = repr(contentCN)[1:-1].strip()
+                 if len(contentCN) < 10:
+                     continue
+                 CONTENT_ENG = ''
+                 for element in article['contentCN'].split("\n"):
+                     CONTENT_ENG += translate(element) + '\n'
+                 article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                 article['site'] = "National Financial Regulatory Administration of China"
+                 article['originSite'] = "国家金融监督管理总局"
+                 article['titleCN'] = article['docSubtitle']
+                 article['title'] = translate(article['docSubtitle'])
+                 article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
+                 article['category'] = "Policy Interpretation"
+                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
+                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                 article['attachment'] = ''
+                 article['author'] = ''
+                 article['subtitle'] = translate(summary)
+                 update_content(article)
          except Exception as error:
              print(error)

+ print("csrc.gov.cn")
+ i = 1
+ while i > -1:
+     try:
+         if i == 1:
+             CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
          else:
+             CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
          i = i + 1
          req = urllib.request.urlopen(CATEGORY_URL)
          text = req.read()
          html_text = text.decode("utf-8")
          page = etree.HTML(html_text)
+         articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
          for article in articlelist:
              if isinstance(article, etree._Element):
                  subelement = etree.tostring(article).decode()
                  subpage = etree.HTML(subelement)
+                 date = encode(subpage.xpath("//span[@class='date']"))
                  parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
                  if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                      i = -1
                  else:
+                     urls = subpage.xpath("//a/@href")
                      for url in urls:
                          try:
                              article = {}
+                             url = "http://www.csrc.gov.cn" + url
+                             article['category'] = "Policy Interpretation"
+                             crawl(url, article)
                          except Exception as error:
                              print(error)
+     except Exception as error:
+         i = -1
+         print(error)

+ i = 1
+ while i > -1:
+     CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
+     i = i + 1
+     try:
+         content = fetch_url(CATEGORY_URL)
+         reportinfo = json.loads(content)
+         for article in reportinfo['data']['results']:
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 article['category'] = "Financial News"
+                 article['site'] = "Securities Regulatory Commission of China"
+                 article['originSite'] = "证监会"
+                 article['titleCN'] = article['title']
+                 article['title'] = translate(article['titleCN'])
+                 article['author'] = ''
+                 article['contentCN'] = repr(article['content'])[1:-1].strip()
+                 if len(article['contentCN']) < 10:
+                     continue
+                 CONTENT_ENG = ''
+                 for element in article['contentCN'].split("。"):
+                     CONTENT_ENG += translate(element) + ' '
+                 article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                 article['subtitle'] = article['memo']
+                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
+                 article['link'] = article['url']
+                 article['attachment'] = ""
+                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
+                 update_content(article)
+     except Exception as error:
+         print(error)

+ print("data.eastmoney.com")

+ today = datetime.today().strftime('%Y-%m-%d')
+ beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
+ i = 0
+ while i > -1:
+     URL = "https://reportapi.eastmoney.com/report/jg"
+     params = {
+         "cb": "datatable8544623",
+         "pageSize": "100",
+         "beginTime": beginDate,
+         "endTime": today,
+         "pageNo": i,
+         "qType": "3",
+     }
+     URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
+     content = fetch_url(URL)
+     if content:
+         start_index = content.find("(")
+         if start_index != -1:
+             result = content[start_index + 1: -1]
          else:
+             result = content
+         reportinfo = json.loads(result)
+         if reportinfo["size"] > 0:
+             i = i + 1
+             for article in reportinfo['data']:
+                 try:
+                     url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
+                     crawl_eastmoney(url, article)
+                 except Exception as error:
+                     print(error)
+         else:
+             i = -1
+     else:
+         print("Failed to fetch URL:", URL)
+
+ print("gov.cn")
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
+     else:
+         CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                         if "https://www.gov.cn" in url:
                              article['category'] = "Policy Interpretation"
                              crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
+     else:
+         CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                  i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                         if "https://www.gov.cn" in url:
+                             article['site'] = "State Council of China"
+                             crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ print("mof.gov.cn")
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+     else:
+         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+                         url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+                         article['category'] = "Financial News"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+     else:
+         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace("./", CATEGORY_URL)
+                         article['category'] = "Policy Interpretation"
+                         print(url)
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ print("mofcom.gov.cn")
+ categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
+ for category in categories:
      i = 1
      while i > -1:
          if i == 1:
+             URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
          else:
+             URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
          i = i + 1
+         try:
+             req = urllib.request.urlopen(URL)
+             text = req.read()
+             html_text = text.decode("utf-8")
+             page = etree.HTML(html_text)
+             articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
+             for article in articlelist:
+                 if isinstance(article, etree._Element):
+                     subelement = etree.tostring(article).decode()
+                     subpage = etree.HTML(subelement)
+                     date = subpage.xpath("//span/text()")[0]
+                     parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+                     if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                         i = -1
+                     else:
+                         urls = subpage.xpath("//a/@href")
+                         for url in urls:
+                             try:
+                                 article = {}
+                                 if '/article/zcjd' in url:
+                                     url = "http://www.mofcom.gov.cn" + url
+                                     article['category'] = "Policy Interpretation"
+                                 else:
+                                     article['category'] = "Policy Release"
+                                 crawl(url, article)
+                             except Exception as error:
+                                 print(error)
+         except Exception as error:
+             i = -1
+             print(error)

+ print("ndrc.gov.cn")
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
+     else:
+         CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y/%m/%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         if "www.gov.cn" in url:
+                             article['category'] = "Policy Release"
+                         elif "../../zcfb/" in url:
+                             url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
+                             article['category'] = "Policy Release"
+                         else:
+                             url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                             url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                             article['category'] = "Policy Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)
+
+ print("safe.gov.cn")
+ i = 1
+ while i > -1:
+     if i == 1:
+         CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
+     else:
+         CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//dd/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = "https://www.safe.gov.cn" + url
+                         article['category'] = "Policy Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ i = 1
+ while i > -1:
+     if i == 1:
+         CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
+     else:
+         CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//dd/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = "https://www.safe.gov.cn" + url
+                         article['category'] = "Data Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ print("stats.gov.cn")
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
+     else:
+         CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = encode(subpage.xpath("//span"))
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
+                         article['category'] = "Data Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ glue_job_run()
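
Every crawler in the new module-level script follows the same shape: walk a site's listing pages newest-first, stop as soon as an item falls outside the DELTA-day window, and route each fresh link through crawl()/update_content() with per-article error handling. A minimal standalone sketch of that pattern, under the assumption that build_page_url, fetch_listing, and handle_article are illustrative stand-ins for the per-site logic (they are not functions from daily.py):

from datetime import datetime, timedelta

def crawl_until_cutoff(build_page_url, fetch_listing, handle_article, delta_days=1):
    # Walk listing pages newest-first; stop once a page yields a stale item.
    cutoff = datetime.today() - timedelta(days=delta_days)
    i = 1
    while i > -1:
        page_url = build_page_url(i)  # page 1 often has a different URL shape
        i = i + 1
        for published, article_url in fetch_listing(page_url):
            if published < cutoff:
                i = -1  # the same sentinel daily.py uses to end the outer loop
            else:
                try:
                    handle_article(article_url)
                except Exception as error:
                    print(error)  # log and keep going, as daily.py does

Like daily.py, this finishes the current page even after the cutoff triggers; the i = -1 sentinel only prevents the outer while from fetching another page.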
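The eastmoney request sends a cb=datatable8544623 parameter, so the endpoint answers with JSONP (datatable8544623({...})) rather than bare JSON; the find("(") slicing in the loop above exists only to peel off that wrapper before json.loads. A slightly more defensive sketch of the same unwrapping, with a hypothetical helper name that does not exist in daily.py:

import json

def strip_jsonp(payload):
    # Peel 'callback({...})' down to '{...}'; fall back to the raw payload
    # when no wrapper is present.
    start = payload.find("(")
    end = payload.rfind(")")
    if start != -1 and end > start:
        payload = payload[start + 1:end]
    return json.loads(payload)

# strip_jsonp('datatable8544623({"size": 1, "data": []})') -> {'size': 1, 'data': []}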
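Both halves of the diff store text through repr(...)[1:-1].strip(). repr() escapes control characters (a real newline becomes the two characters backslash and n) and wraps the result in quotes; the [1:-1] slice then drops those quotes, leaving a single-line, escape-encoded string. A quick illustration of that behavior (note the slice assumes the text contains no quote characters that would change repr's delimiters):

text = "第一行\n第二行"       # two lines of Chinese text
escaped = repr(text)[1:-1]    # newline is now a literal backslash-n
print(escaped)                # prints on one line: 第一行\n第二行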