Muhammad Abdur Rahman Saad committed
Commit cc37e8c · 1 Parent(s): b69e69a

add prefect flow
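
This commit moves the previously module-level crawling script into a single Prefect flow: the imports gain `from prefect import flow, task`, the per-site crawl loops are re-homed inside a `main()` function decorated with `@flow(...)`, and a `__main__` guard is added so the script can still be run directly. Condensed to its skeleton (function bodies elided; the names and decorator arguments are taken from the diff below), the pattern is:

    from prefect import flow, task

    @flow(name = "Data Collection China - Daily", log_prints = True)
    def main():
        # DELTA controls the crawl window in days; unchanged from the old script.
        DELTA = int(os.environ.get('DELTA') or '1')
        # ... per-site crawl loops follow (cbirc, csrc, eastmoney, gov.cn,
        # mof, mofcom, ndrc, safe, stats), exactly as in the diff below.

    if __name__ == '__main__':
        main()

Note that `task` is imported but not yet applied to any function in this commit; only the `@flow` decorator is used.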

Files changed (2):
  1. daily.py +426 -418
  2. requirements.txt +0 -0
daily.py CHANGED
@@ -10,6 +10,7 @@ import urllib.request
 import uuid
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
+from prefect import flow, task
 
 from lxml import etree
 
@@ -17,121 +18,7 @@ from utils import (crawl, datemodifier, encode, encode_content,
                    extract_from_pdf, extract_reference, fetch_url,
                    sentiment_computation, translate, update_content)
 
-with open('xpath.json', 'r', encoding='UTF-8') as f:
-    xpath_dict = json.load(f)
 
-DELTA = int(os.environ.get('DELTA') or '1')
-print(f"DELTA = {DELTA}")
-
-print("cbirc.gov.cn")
-i = 1
-while i > -1:
-    CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
-    i = i + 1
-    content = fetch_url(CATEGORY_URL)
-    reportinfo = json.loads(content)
-    for article in reportinfo['data']['rows']:
-        try:
-            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
-            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
-                article['contentCN'] = repr(contentCN)[1:-1].strip()
-                if len(contentCN) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['contentCN'].split("\n"):
-                    CONTENT_ENG += translate(element) + '\n'
-                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
-                article['site'] = "National Financial Regulatory Administration of China"
-                article['originSite'] = "国家金融监督管理总局"
-                article['titleCN'] = article['docSubtitle']
-                article['title'] = translate(article['docSubtitle'])
-                article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
-                article['category']= "Policy Interpretation"
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                article['attachment'] = ''
-                article['author'] = ''
-                article['subtitle'] = translate(summary)
-                update_content(article)
-        except Exception as error:
-            print(error)
-
-print("csrc.gov.cn")
-i = 1
-while i > -1:
-    try:
-        if i == 1:
-            CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
-        else:
-            CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
-        i = i + 1
-        req = urllib.request.urlopen(CATEGORY_URL)
-        text = req.read()
-        html_text = text.decode("utf-8")
-        page = etree.HTML(html_text)
-        articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
-        for article in articlelist:
-            if isinstance(article, etree._Element):
-                subelement = etree.tostring(article).decode()
-                subpage = etree.HTML(subelement)
-                date = encode(subpage.xpath("//span[@class='date']"))
-                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                    i = -1
-                else:
-                    urls = subpage.xpath("//a/@href")
-                    for url in urls:
-                        try:
-                            article = {}
-                            url = "http://www.csrc.gov.cn" + url
-                            article['category']= "Policy Interpretation"
-                            crawl(url, article)
-                        except Exception as error:
-                            print(error)
-    except Exception as error:
-        i = -1
-        print(error)
-
-i = 1
-while i > -1:
-    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
-    i = i + 1
-    try:
-        content = fetch_url(CATEGORY_URL)
-        reportinfo = json.loads(content)
-        for article in reportinfo['data']['results']:
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                article['category']= "Financial News"
-                article['site'] = "Securities Regulatory Commission of China"
-                article['originSite'] = "证监会"
-                article['titleCN'] = article['title']
-                article['title'] = translate(article['titleCN'])
-                article['author'] = ''
-                article['contentCN'] = repr(article['content'])[1:-1].strip()
-                if len(article['contentCN']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['contentCN'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
-                article['subtitle'] = article['memo']
-                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
-                article['link'] = article['url']
-                article['attachment'] = ""
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
-                update_content(article)
-    except Exception as error:
-        print(error)
-
-print("data.eastmoney.com")
 def crawl_eastmoney(url, article):
     """
     Crawls the given URL and extracts information from the webpage.
@@ -179,193 +66,71 @@ def crawl_eastmoney(url, article):
     extract_reference(article)
     update_content(article)
 
-today = datetime.today().strftime('%Y-%m-%d')
-beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
-i = 0
-while i > -1:
-    URL = "https://reportapi.eastmoney.com/report/jg"
-    params = {
-        "cb": "datatable8544623",
-        "pageSize": "100",
-        "beginTime": beginDate,
-        "endTime": today,
-        "pageNo": i,
-        "qType": "3",
-    }
-    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
-    content = fetch_url(URL)
-    if content:
-        start_index = content.find("(")
-        if start_index != -1:
-            result = content[start_index + 1: -1]
-        else:
-            result = content
-        reportinfo = json.loads(result)
-        if reportinfo["size"] > 0:
-            i = i + 1
-            for article in reportinfo['data']:
-                try:
-                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
-                    crawl_eastmoney(url,article)
-                except Exception as error:
-                    print(error)
-        else:
-            i = -1
-    else:
-        print("Failed to fetch URL:", url)
-
-print("gov.cn")
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
-    else:
-        CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
-                        if "https://www.gov.cn" in url:
-                            article['category']= "Policy Interpretation"
-                            crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
-    else:
-        CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
-                        if "https://www.gov.cn" in url:
-                            article['site'] = "State Council of China"
-                            crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-print("mof.gov.cn")
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
-    else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
-                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                        article['category']= "Financial News"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
-    else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace("./", CATEGORY_URL)
-                        article['category']= "Policy Interpretation"
-                        print(url)
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-print("mofcom.gov.cn")
-categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
-for category in categories:
-    i = 1
-    while i > -1:
-        if i == 1:
-            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
-        else:
-            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
-        i = i + 1
-        try:
-            req = urllib.request.urlopen(URL)
-            text = req.read()
-            html_text = text.decode("utf-8")
-            page = etree.HTML(html_text)
-            articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
-            for article in articlelist:
-                if isinstance(article, etree._Element):
-                    subelement = etree.tostring(article).decode()
-                    subpage = etree.HTML(subelement)
-                    date = subpage.xpath("//span/text()")[0]
-                    parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-                    if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                        i = -1
-                    else:
+@flow(name = "Data Collection China - Daily", log_prints = True)
+def main():
+    with open('xpath.json', 'r', encoding='UTF-8') as f:
+        xpath_dict = json.load(f)
+
+    DELTA = int(os.environ.get('DELTA') or '1')
+    print(f"DELTA = {DELTA}")
+
+    print("cbirc.gov.cn")
+    i = 1
+    while i > -1:
+        CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
+        i = i + 1
+        content = fetch_url(CATEGORY_URL)
+        reportinfo = json.loads(content)
+        for article in reportinfo['data']['rows']:
+            try:
+                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
+                parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+                    article['contentCN'] = repr(contentCN)[1:-1].strip()
+                    if len(contentCN) < 10:
+                        continue
+                    CONTENT_ENG = ''
+                    for element in article['contentCN'].split("\n"):
+                        CONTENT_ENG += translate(element) + '\n'
+                    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                    article['site'] = "National Financial Regulatory Administration of China"
+                    article['originSite'] = "国家金融监督管理总局"
+                    article['titleCN'] = article['docSubtitle']
+                    article['title'] = translate(article['docSubtitle'])
+                    article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
+                    article['category']= "Policy Interpretation"
+                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
+                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                    article['attachment'] = ''
+                    article['author'] = ''
+                    article['subtitle'] = translate(summary)
+                    update_content(article)
+            except Exception as error:
+                print(error)
+
+    print("csrc.gov.cn")
+    i = 1
+    while i > -1:
+        try:
+            if i == 1:
+                CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
+            else:
+                CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
+            i = i + 1
+            req = urllib.request.urlopen(CATEGORY_URL)
+            text = req.read()
+            html_text = text.decode("utf-8")
+            page = etree.HTML(html_text)
+            articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
+            for article in articlelist:
+                if isinstance(article, etree._Element):
+                    subelement = etree.tostring(article).decode()
+                    subpage = etree.HTML(subelement)
+                    date = encode(subpage.xpath("//span[@class='date']"))
+                    parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                    if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                        i = -1
+                    else:
@@ -373,11 +138,8 @@ for category in categories:
                         for url in urls:
                             try:
                                 article = {}
-                                if '/article/zcjd' in url:
-                                    url = "http://www.mofcom.gov.cn" + url
-                                    article['category']= "Policy Interpretation"
-                                else:
-                                    article['category']= "Policy Release"
+                                url = "http://www.csrc.gov.cn" + url
+                                article['category']= "Policy Interpretation"
                                 crawl(url, article)
                             except Exception as error:
                                 print(error)
@@ -385,137 +147,383 @@ for category in categories:
             i = -1
             print(error)
 
-
-print("ndrc.gov.cn")
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
-    else:
-        CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        if "www.gov.cn" in url:
-                            article['category']= "Policy Release"
-                        elif "../../zcfb/" in url:
-                            url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
-                            article['category']= "Policy Release"
-                        else:
-                            url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                            url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                            article['category']= "Policy Interpretation"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-print("safe.gov.cn")
-i = 1
-while i > -1:
-    if i == 1:
-        CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
-    else:
-        CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//dd/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = "https://www.safe.gov.cn" + url
-                        article['category']= "Policy Interpretation"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-i = 1
-while i > -1:
-    if i == 1:
-        CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
-    else:
-        CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//dd/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = "https://www.safe.gov.cn" + url
-                        article['category']= "Data Interpretation"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-print("stats.gov.hk")
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
-    else:
-        CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = encode(subpage.xpath("//span"))
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
-                        article['category']= "Data Interpretation"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
+    i = 1
+    while i > -1:
+        CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
+        i = i + 1
+        try:
+            content = fetch_url(CATEGORY_URL)
+            reportinfo = json.loads(content)
+            for article in reportinfo['data']['results']:
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    article['category']= "Financial News"
+                    article['site'] = "Securities Regulatory Commission of China"
+                    article['originSite'] = "证监会"
+                    article['titleCN'] = article['title']
+                    article['title'] = translate(article['titleCN'])
+                    article['author'] = ''
+                    article['contentCN'] = repr(article['content'])[1:-1].strip()
+                    if len(article['contentCN']) < 10:
+                        continue
+                    CONTENT_ENG = ''
+                    for element in article['contentCN'].split("。"):
+                        CONTENT_ENG += translate(element) + ' '
+                    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                    article['subtitle'] = article['memo']
+                    article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
+                    article['link'] = article['url']
+                    article['attachment'] = ""
+                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
+                    update_content(article)
+        except Exception as error:
+            print(error)
+
+    print("data.eastmoney.com")
+
+    today = datetime.today().strftime('%Y-%m-%d')
+    beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
+    i = 0
+    while i > -1:
+        URL = "https://reportapi.eastmoney.com/report/jg"
+        params = {
+            "cb": "datatable8544623",
+            "pageSize": "100",
+            "beginTime": beginDate,
+            "endTime": today,
+            "pageNo": i,
+            "qType": "3",
+        }
+        URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
+        content = fetch_url(URL)
+        if content:
+            start_index = content.find("(")
+            if start_index != -1:
+                result = content[start_index + 1: -1]
+            else:
+                result = content
+            reportinfo = json.loads(result)
+            if reportinfo["size"] > 0:
+                i = i + 1
+                for article in reportinfo['data']:
+                    try:
+                        url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
+                        crawl_eastmoney(url,article)
+                    except Exception as error:
+                        print(error)
+            else:
+                i = -1
+        else:
+            print("Failed to fetch URL:", url)
+
+    print("gov.cn")
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
+        else:
+            CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                            if "https://www.gov.cn" in url:
+                                article['category']= "Policy Interpretation"
+                                crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
+        else:
+            CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                            if "https://www.gov.cn" in url:
+                                article['site'] = "State Council of China"
+                                crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    print("mof.gov.cn")
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+        else:
+            CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+                            url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+                            article['category']= "Financial News"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+        else:
+            CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace("./", CATEGORY_URL)
+                            article['category']= "Policy Interpretation"
+                            print(url)
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    print("mofcom.gov.cn")
+    categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
+    for category in categories:
+        i = 1
+        while i > -1:
+            if i == 1:
+                URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
+            else:
+                URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
+            i = i + 1
+            try:
+                req = urllib.request.urlopen(URL)
+                text = req.read()
+                html_text = text.decode("utf-8")
+                page = etree.HTML(html_text)
+                articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
+                for article in articlelist:
+                    if isinstance(article, etree._Element):
+                        subelement = etree.tostring(article).decode()
+                        subpage = etree.HTML(subelement)
+                        date = subpage.xpath("//span/text()")[0]
+                        parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+                        if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                            i = -1
+                        else:
+                            urls = subpage.xpath("//a/@href")
+                            for url in urls:
+                                try:
+                                    article = {}
+                                    if '/article/zcjd' in url:
+                                        url = "http://www.mofcom.gov.cn" + url
+                                        article['category']= "Policy Interpretation"
+                                    else:
+                                        article['category']= "Policy Release"
+                                    crawl(url, article)
+                                except Exception as error:
+                                    print(error)
+            except Exception as error:
+                i = -1
+                print(error)
+
+
+    print("ndrc.gov.cn")
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
+        else:
+            CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            if "www.gov.cn" in url:
+                                article['category']= "Policy Release"
+                            elif "../../zcfb/" in url:
+                                url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
+                                article['category']= "Policy Release"
+                            else:
+                                url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                                url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                                article['category']= "Policy Interpretation"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    print("safe.gov.cn")
+    i = 1
+    while i > -1:
+        if i == 1:
+            CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
+        else:
+            CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//dd/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = "https://www.safe.gov.cn" + url
+                            article['category']= "Policy Interpretation"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    i = 1
+    while i > -1:
+        if i == 1:
+            CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
+        else:
+            CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//dd/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = "https://www.safe.gov.cn" + url
+                            article['category']= "Data Interpretation"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    print("stats.gov.hk")
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
+        else:
+            CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = encode(subpage.xpath("//span"))
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
+                            article['category']= "Data Interpretation"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+if __name__ == '__main__':
+    main()
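
With the flow wrapper in place, `python daily.py` still performs a single ad-hoc run via the `__main__` guard, and `log_prints = True` routes the script's print output into Prefect's logs. The commit itself does not add a schedule; to run the flow daily, one option is to serve it with a deployment. A minimal sketch, assuming Prefect 2.x (the deployment name and cron string below are illustrative, not part of this commit):

    from daily import main

    if __name__ == '__main__':
        # Serve the flow with a daily schedule. The flow itself is the
        # @flow-decorated main() from daily.py; name and cron are assumptions.
        main.serve(name="data-collection-china-daily", cron="0 6 * * *")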
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
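
(The requirements.txt change is shown as binary in this view; presumably it adds the prefect dependency that daily.py now imports.)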