OxbridgeEconomics committed on
Commit eaaafcb · Parent: 043eca4
Files changed (2)
  1. .github/workflows/daily.yml +42 -0
  2. daily.py +501 -0
.github/workflows/daily.yml ADDED
@@ -0,0 +1,42 @@
+ # This workflow installs Python dependencies and runs the daily data-collection job with a single version of Python
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+ name: Data Collection - Daily
+
+ on:
+   schedule:
+     - cron: '0 16 * * *'
+   workflow_dispatch:
+     inputs:
+       delta:
+         description: 'delta'
+         required: true
+         default: '1'
+
+ permissions:
+   contents: read
+
+ jobs:
+   build:
+     runs-on: ubuntu-latest
+     timeout-minutes: 14400
+     steps:
+       - uses: actions/checkout@v3
+       - name: Set up Python 3.10
+         uses: actions/setup-python@v3
+         with:
+           python-version: "3.10"
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+           pip install transformers
+           pip install tensorflow
+           pip install tf-keras
+       - name: Data Collection
+         env:
+           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+           DELTA: ${{ github.event.inputs.delta }}
+         run: |
+           python cbirc.py
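Note: on the scheduled (cron) trigger, github.event.inputs.delta is empty, so the DELTA variable reaches the job as an empty string rather than being unset. A minimal sketch of an environment parse that tolerates both cases (hypothetical helper, not part of this commit):

import os

def read_delta(default: int = 1) -> int:
    # Fall back to `default` when DELTA is unset or set to an empty string.
    raw = os.environ.get("DELTA", "").strip()
    return int(raw) if raw else default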
daily.py ADDED
@@ -0,0 +1,501 @@
+ import os
+ import json
+ import uuid
+ import time
+ import urllib.request
+ from lxml import etree
+ from datetime import datetime, timedelta
+ from urllib.parse import urlparse
+ from utils import (encode,
+                    translate,
+                    sentiment_computation,
+                    upsert_content,
+                    fetch_url,
+                    extract_from_pdf,
+                    crawl,
+                    datemodifier,
+                    encode_content)
+
+ with open('xpath.json', 'r', encoding='UTF-8') as f:
+     xpath_dict = json.load(f)
+
+ DELTA = int(os.environ.get('DELTA', '1'))
23
+ print(f"DELTA = {DELTA}")
24
+
25
+ # cbirc.gov.cn
26
+ i = 1
27
+ while i > -1:
28
+ CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
29
+ i = i + 1
30
+ content = fetch_url(CATEGORY_URL)
31
+ reportinfo = json.loads(content)
32
+ for article in reportinfo['data']['rows']:
33
+ try:
34
+ article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
35
+ parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
36
+ if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
37
+ i = -1
38
+ else:
39
+ contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
40
+ article['contentCN'] = repr(contentCN)[1:-1].strip()
41
+ if len(contentCN) < 10:
42
+ continue
43
+ CONTENT_ENG = ''
44
+ for element in article['contentCN'].split("\n"):
45
+ CONTENT_ENG += translate(element) + '\n'
46
+ article['content'] = repr(CONTENT_ENG)[1:-1].strip()
47
+ article['site'] = "National Financial Regulatory Administration of China"
48
+ article['originSite'] = "国家金融监督管理总局"
49
+ article['titleCN'] = article['docSubtitle']
50
+ article['title'] = translate(article['docSubtitle'])
51
+ article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
52
+ article['category']= "Policy Interpretation"
53
+ article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
54
+ article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
55
+ article['attachment'] = ''
56
+ article['author'] = ''
57
+ article['subtitle'] = translate(summary)
58
+ upsert_content(article)
59
+ except Exception as error:
60
+ print(error)
61
+
+ # csrc.gov.cn
+ i = 1
+ while i > -1:
+     if i == 1:
+         CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
+     else:
+         CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = encode(subpage.xpath("//span[@class='date']"))
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = "http://www.csrc.gov.cn" + url
+                         article['category'] = "Policy Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)
+
+ i = 1
+ while i > -1:
+     CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
+     i = i + 1
+     content = fetch_url(CATEGORY_URL)
+     reportinfo = json.loads(content)
+     for article in reportinfo['data']['results']:
+         try:
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 article['category'] = "Financial News"
+                 article['site'] = "Securities Regulatory Commission of China"
+                 article['originSite'] = "证监会"
+                 article['titleCN'] = article['title']
+                 article['title'] = translate(article['titleCN'])
+                 article['author'] = ''
+                 article['contentCN'] = repr(article['content'])[1:-1].strip()
+                 if len(article['contentCN']) < 10:
+                     continue
+                 CONTENT_ENG = ''
+                 for element in article['contentCN'].split("。"):
+                     CONTENT_ENG += translate(element) + ' '
+                 article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                 article['subtitle'] = article['memo']
+                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
+                 article['link'] = article['url']
+                 article['attachment'] = ""
+                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
+                 upsert_content(article)
+         except Exception as error:
+             print(error)
+
+ # data.eastmoney.com
+ def crawl_eastmoney(url, article):
+     domain = urlparse(url).netloc
+     req = urllib.request.urlopen(url)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
+     article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
+     article['link'] = url
+     # use the full organisation name when the short name is the "''" placeholder
+     if article['orgSName'] == "''":
+         article['site'] = translate(article['orgName'])
+     else:
+         article['site'] = translate(article['orgSName'])
+     article['titleCN'] = article['title']
+     article['title'] = translate(article['title'])
+     article['author'] = translate(article['researcher'])
+     article['originAuthor'] = article['researcher']
+     article['contentCN'] = repr(contentCN)[1:-1].strip()
+     article['subtitle'] = translate(summary)
+     article['category'] = "Macroeconomic Research"
+     if len(article['contentCN']) < 10:
+         return None
+     CONTENT_ENG = ''
+     for element in contentCN.split("\n"):
+         CONTENT_ENG += translate(element) + '\n'
+     article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+     article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
+     article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
+     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
+     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n", ""))
+     upsert_content(article)
+
+ today = datetime.today().strftime('%Y-%m-%d')
+ beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
+ i = 0
+ while i > -1:
+     URL = "https://reportapi.eastmoney.com/report/jg"
+     params = {
+         "cb": "datatable8544623",
+         "pageSize": "100",
+         "beginTime": beginDate,
+         "endTime": today,
+         "pageNo": i,
+         "qType": "3",
+     }
+     URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
+     print(URL)
+     content = fetch_url(URL)
+     if content:
+         start_index = content.find("(")
+         if start_index != -1:
+             result = content[start_index + 1: -1]
+         else:
+             result = content
+         reportinfo = json.loads(result)
+         if reportinfo["size"] > 0:
+             i = i + 1
+             for article in reportinfo['data']:
+                 try:
+                     url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
+                     crawl_eastmoney(url, article)
+                 except Exception as error:
+                     print(error)
+         else:
+             print(reportinfo)
+             i = -1
+     else:
+         print("Failed to fetch URL:", URL)
+
+ # gov.cn
200
+ i = 0
201
+ while i > -1:
202
+ if i == 0:
203
+ CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
204
+ else:
205
+ CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
206
+ i = i + 1
207
+ req = urllib.request.urlopen(CATEGORY_URL)
208
+ text = req.read()
209
+ html_text = text.decode("utf-8")
210
+ page = etree.HTML(html_text)
211
+ articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
212
+ for article in articlelist:
213
+ if isinstance(article, etree._Element):
214
+ subelement = etree.tostring(article).decode()
215
+ subpage = etree.HTML(subelement)
216
+ date = subpage.xpath("//span/text()")[0]
217
+ parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
218
+ if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
219
+ i = -1
220
+ else:
221
+ urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
222
+ for url in urls:
223
+ try:
224
+ article = {}
225
+ url = url.replace('../', 'https://www.gov.cn/zhengce/')
226
+ if "https://www.gov.cn" in url:
227
+ article['category']= "Policy Interpretation"
228
+ crawl(url, article)
229
+ except Exception as error:
230
+ print(error)
231
+
232
+ i = 0
233
+ while i > -1:
234
+ if i == 0:
235
+ CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
236
+ else:
237
+ CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
238
+ i = i + 1
239
+ req = urllib.request.urlopen(CATEGORY_URL)
240
+ text = req.read()
241
+ html_text = text.decode("utf-8")
242
+ page = etree.HTML(html_text)
243
+ articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
244
+ for article in articlelist:
245
+ if isinstance(article, etree._Element):
246
+ subelement = etree.tostring(article).decode()
247
+ subpage = etree.HTML(subelement)
248
+ date = subpage.xpath("//span/text()")[0]
249
+ parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
250
+ if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
251
+ i = -1
252
+ else:
253
+ urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
254
+ for url in urls:
255
+ try:
256
+ article = {}
257
+ url = url.replace('../', 'https://www.gov.cn/zhengce/')
258
+ if "https://www.gov.cn" in url:
259
+ article['site'] = "State Council of China"
260
+ crawl(url, article)
261
+ except Exception as error:
262
+ print(error)
263
+
+ # mof.gov.cn
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+     else:
+         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+     i = i + 1
+     print(CATEGORY_URL)
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+                         url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+                         print(url)
+                         article['category'] = "Financial News"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)
+
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+     else:
+         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+     i = i + 1
+     print(CATEGORY_URL)
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace("./", CATEGORY_URL)
+                         article['category'] = "Policy Interpretation"
+                         print(url)
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)
+
+ # mofcom.gov.cn
+ categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
+ for category in categories:
+     i = 1
+     while i > -1:
+         if i == 1:
+             URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
+         else:
+             URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
+         i = i + 1
+         req = urllib.request.urlopen(URL)
+         text = req.read()
+         html_text = text.decode("utf-8")
+         page = etree.HTML(html_text)
+         articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
+         for article in articlelist:
+             if isinstance(article, etree._Element):
+                 subelement = etree.tostring(article).decode()
+                 subpage = etree.HTML(subelement)
+                 date = subpage.xpath("//span/text()")[0]
+                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                     i = -1
+                 else:
+                     urls = subpage.xpath("//a/@href")
+                     for url in urls:
+                         try:
+                             article = {}
+                             if '/article/zcjd' in url:
+                                 url = "http://www.mofcom.gov.cn" + url
+                                 article['category'] = "Policy Interpretation"
+                             else:
+                                 article['category'] = "Policy Release"
+                             crawl(url, article)
+                         except Exception as error:
+                             print(error)
+
+ # ndrc.gov.cn
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
+     else:
+         CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y/%m/%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         if "www.gov.cn" in url:
+                             article['category'] = "Policy Release"
+                         elif "../../zcfb/" in url:
+                             url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
+                             article['category'] = "Policy Release"
+                         else:
+                             url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                             url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                             article['category'] = "Policy Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)
+
+ # safe.gov.cn
+ i = 1
+ while i > -1:
+     if i == 1:
+         CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
+     else:
+         CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//dd/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = "https://www.safe.gov.cn" + url
+                         article['category'] = "Policy Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)
+
+ i = 1
+ while i > -1:
+     if i == 1:
+         CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
+     else:
+         CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//dd/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = "https://www.safe.gov.cn" + url
+                         article['category'] = "Data Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)
+
+ # stats.gov.cn
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
+     else:
+         CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = encode(subpage.xpath("//span"))
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
+                         article['category'] = "Data Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)
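Every source section above repeats the same freshness test before crawling: the article date is parsed and compared against a DELTA-day window, and pagination stops once an older item is seen. A minimal sketch of that recurring check as a standalone helper (hypothetical, not part of this commit):

from datetime import datetime, timedelta

def within_window(date_str, delta_days, fmt="%Y-%m-%d"):
    # True when date_str falls inside the last delta_days days; mirrors the cutoff used above.
    parsed = datetime.strptime(date_str, fmt)
    return parsed >= (datetime.today() - timedelta(days=delta_days))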