OxbridgeEconomics committed
Commit 42ba1cc · 1 Parent(s): 046bb22
Files changed (11)
  1. cbirc.py +19 -11
  2. csrc.py +30 -32
  3. gov.py +0 -2
  4. mof.py +3 -106
  5. mofcom.py +2 -22
  6. ndrc.py +3 -59
  7. pbc.py +38 -30
  8. safe.py +3 -42
  9. stats.py +2 -21
  10. utils.py +28 -18
  11. xpath.json +68 -11
cbirc.py CHANGED
@@ -11,30 +11,35 @@ i = 1
 while i > -1:
     CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
     i = i + 1
+    print(CATEGORY_URL)
     content = fetch_url(CATEGORY_URL)
     reportinfo = json.loads(content)
     for article in reportinfo['data']['rows']:
         try:
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
+            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
             if parsed_datetime < (datetime.today() - timedelta(days=183)):
                 i = -1
             else:
-                article['originContent'] = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
-                if len(article['originContent']) < 10:
+                contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+                article['contentCN'] = repr(contentCN)
+                if len(contentCN) < 10:
                     continue
                 CONTENT_ENG = ''
-                for element in article['originContent'].split("\n"):
+                for element in article['contentCN'].split("\n"):
                     CONTENT_ENG += translate(element) + '\n'
-                article['content'] = CONTENT_ENG
+                article['content'] = repr(CONTENT_ENG)
                 article['site'] = "National Financial Regulatory Administration of China"
                 article['originSite'] = "国家金融监督管理总局"
-                article['originTitle'] = article['docSubtitle']
-                article['title'] = translate(article['originTitle'])
-                article['url'] = "https://www.cbirc.gov.cn" + article['pdfFileUrl']
-                article['category']= "Policy Interpretation"
-                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
+                article['titleCN'] = article['docSubtitle']
+                article['title'] = translate(article['docSubtitle'])
+                article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
+                article['category']= "Policy Interpretation"
                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                article['attachment'] = ''
+                article['author'] = ''
+                article['subtitle'] = translate(summary)
                 upsert_content(article)
         except Exception as error:
             print(error)
@@ -46,7 +51,10 @@ while i > -1:
     CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
     i = i + 1
     urllib3.disable_warnings()
-    req = urllib.request.urlopen(CATEGORY_URL)
+    try:
+        req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
+    except:
+        break
     content = req.read().decode("utf-8")
     reportinfo = json.loads(content)
     for article in reportinfo['searchResultAll']['searchTotal']:
csrc.py CHANGED
@@ -6,38 +6,36 @@ from datetime import datetime, timedelta
 from lxml import etree
 from utils import encode, translate, sentiment_computation, upsert_content, fetch_url, crawl
 
-# i = 1
-# while i > -1:
-#     if i == 1:
-#         CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
-#     else:
-#         CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
-#     i = i + 1
-#     req = urllib.request.urlopen(CATEGORY_URL)
-#     text = req.read()
-#     html_text = text.decode("utf-8")
-#     page = etree.HTML(html_text)
-#     articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
-#     for article in articlelist:
-#         if isinstance(article, etree._Element):
-#             subelement = etree.tostring(article).decode()
-#             subpage = etree.HTML(subelement)
-#             date = encode(subpage.xpath("//span[@class='date']"))
-#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
-#                 i = -1
-#             else:
-#                 urls = subpage.xpath("//a/@href")
-#                 for url in urls:
-#                     try:
-#                         article = {}
-#                         url = "http://www.csrc.gov.cn" + url
-#                         article['site'] = "Securities Regulatory Commission of China"
-#                         article['originSite'] = "证监会"
-#                         article['category']= "Policy Interpretation"
-#                         crawl(url, article)
-#                     except Exception as error:
-#                         print(error)
+i = 1
+while i > -1:
+    if i == 1:
+        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
+    else:
+        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = encode(subpage.xpath("//span[@class='date']"))
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = "http://www.csrc.gov.cn" + url
+                        article['category']= "Policy Interpretation"
+                        crawl(url, article)
+                    except Exception as error:
+                        print(error)
 
 i = 1
 while i > -1:
gov.py CHANGED
@@ -60,8 +60,6 @@ while i > -1:
                 url = url.replace('../', 'https://www.gov.cn/zhengce/')
                 if "https://www.gov.cn" in url:
                     article['category']= "Policy Interpretation"
-                    article['originSite'] = "国务院"
-                    article['site'] = "State Council of China"
                     crawl(url, article)
             except Exception as error:
                 print(error)
mof.py CHANGED
@@ -1,9 +1,8 @@
-import uuid
 import time
 import urllib.request
 from lxml import etree
 from datetime import datetime, timedelta
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, extract_from_pdf
+from utils import crawl
 
 i = 0
 while i > -1:
@@ -32,27 +31,8 @@ while i > -1:
                     article = {}
                     url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                     url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "Ministry of Finance of China"
-                    article['originalSite'] = "财政部"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Financial News"
-                    article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
 
@@ -82,90 +62,7 @@ while i > -1:
                 try:
                     article = {}
                     url = url.replace("./", CATEGORY_URL)
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "Ministry of Finance of China"
-                    article['originalSite'] = "财政部"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Policy Interpretation"
-                    article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
-
-# i = 0
-# while i > -1:
-#     if i == 0:
-#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"
-#     else:
-#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/index_{i}.htm"
-#     i = i + 1
-#     req = urllib.request.urlopen(CATEGORY_URL)
-#     text = req.read()
-#     html_text = text.decode("utf-8")
-#     page = etree.HTML(html_text)
-#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-#     for article in articlelist:
-#         if isinstance(article, etree._Element):
-#             subelement = etree.tostring(article).decode()
-#             subpage = etree.HTML(subelement)
-#             date = subpage.xpath("//span/text()")[0]
-#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
-#                 i = -1
-#             else:
-#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-#                 for url in urls:
-#                     try:
-#                         article = {}
-#                         url = url.replace("./", CATEGORY_URL)
-#                         print(url)
-#                         req = urllib.request.urlopen(url)
-#                         text = req.read()
-#                         html_text = text.decode("utf-8")
-#                         page = etree.HTML(html_text)
-#                         attachments = page.xpath("//span[contains(@id, 'appendix1')]/a/@href")
-#                         print(attachments)
-#                         if len(attachments) > 0:
-#                             for attachment_url in attachments:
-#                                 if '.pdf' in attachment_url:
-#                                     attachment_url = attachment_url.replace("./", "https://zyhj.mof.gov.cn/zcfb/202403/")
-#                                     article['originalContent'] = extract_from_pdf(attachment_url)
-#                                 if '.doc' in attachment_url:
-#                                     continue
-#                                 if '.docx' in attachment_url:
-#                                     continue
-#                         else:
-#                             article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]/p[@align='justify']"))
-#                         print(article['originalContent'])
-#                         if len(article['originalContent']) < 10:
-#                             continue
-#                         CONTENT_ENG = ''
-#                         for element in article['originalContent'].split("。"):
-#                             CONTENT_ENG += translate(element) + ' '
-#                         article['content'] = CONTENT_ENG
-#                         article['site'] = "Ministry of Finance"
-#                         article['originalSite'] = "财政部"
-#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-#                         article['title'] = translate(article['originalTitle'])
-#                         article['url'] = url
-#                         article['category']= "Policy Release"
-#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
-#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-#                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-#                         # upsert_content(article)
-#                     except Exception as error:
-#                         print(error)
mofcom.py CHANGED
@@ -1,9 +1,8 @@
-import uuid
 import time
 import urllib.request
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import encode, translate, sentiment_computation, upsert_content
+from utils import crawl
 
 categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
 for category in categories:
@@ -37,25 +36,6 @@ for category in categories:
                     article['category']= "Policy Interpretation"
                 else:
                     article['category']= "Policy Release"
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'art-con art-con-bottonmLine')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "Ministry of Commerce of China"
-                article['originalSite'] = "商务部"
-                article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
-                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S"))
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
             except Exception as error:
                 print(error)
ndrc.py CHANGED
@@ -3,7 +3,7 @@ import uuid
 import time
 import urllib.request
 from lxml import etree
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
+from utils import crawl
 
 i = 0
 while i > -1:
@@ -30,71 +30,15 @@ while i > -1:
             for url in urls:
                 try:
                     article = {}
-                    print(url)
-                    if "https://www.gov.cn" in url:
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "State Council of China"
-                        article['originalSite'] = "国务院"
-                        article['originalTitle'] = page.xpath("//title/text()")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
+                    if "www.gov.cn" in url:
                         article['category']= "Policy Release"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0], "%Y-%m-%d-%H:%M:%S")
                     elif "../../zcfb/" in url:
                         url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
-                        print(url)
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor') or contains(@class, 'article_l')]"))
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "National Development and Reform Commission of China"
-                        article['originalSite'] = "国家发展和改革委员会"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
                         article['category']= "Policy Release"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
                     else:
                         url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                         url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                        print(url)
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "National Development and Reform Commission of China"
-                        article['originalSite'] = "国家发展和改革委员会"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
                         article['category']= "Policy Interpretation"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
pbc.py CHANGED
@@ -13,37 +13,45 @@ while i > -1:
         j = i + 1
         CATEGORY_URL = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{j}.html"
     i = i + 1
-    response = requests.get(CATEGORY_URL, timeout=20)
+    response = requests.get(CATEGORY_URL, timeout=30)
     page = etree.HTML(response.text)
-    urls = page.xpath("//td[contains(@height,'22')]//a[contains(@target, '_blank')]/@href")
-    urls = [item for item in urls if item.startswith("/rmyh/")]
-    for url in urls:
-        try:
-            url = "http://www.pbc.gov.cn" + url
-            article = {}
-            response = requests.get(url, timeout=20)
-            response.encoding = 'utf-8'
-            page = etree.HTML(response.text)
-            article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
-            if len(article['originalContent']) < 10:
+    articlelist = page.xpath("//td[contains(@height, '22')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")
+            try:
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            except:
                 continue
-            CONTENT_ENG = ''
-            for element in article['originalContent'].split("。"):
-                CONTENT_ENG += translate(element) + ' '
-            article['content'] = CONTENT_ENG
-            article['site'] = "The People's Bank of China"
-            article['originalSite'] = "中国人民银行"
-            article['originalTitle'] = page.xpath("//title/text()")[0]
-            article['title'] = translate(article['originalTitle'])
-            article['url'] = url
-            article['category']= "Policy Interpretation"
-            article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                 i = -1
             else:
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
-        except Exception as error:
-            print(error)
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = "http://www.pbc.gov.cn" + url
+                        response = requests.get(url, timeout=20)
+                        response.encoding = 'utf-8'
+                        page = etree.HTML(response.text)
+                        article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
+                        if len(article['originalContent']) < 10:
+                            continue
+                        CONTENT_ENG = ''
+                        for element in article['originalContent'].split("。"):
+                            CONTENT_ENG += translate(element) + ' '
+                        article['content'] = CONTENT_ENG
+                        article['site'] = "The People's Bank of China"
+                        article['originalSite'] = "中国人民银行"
+                        article['originalTitle'] = page.xpath("//title/text()")[0]
+                        article['title'] = translate(article['originalTitle'])
+                        article['url'] = url
+                        article['category']= "Policy Interpretation"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                        upsert_content(article)
+                    except Exception as error:
+                        print(error)
safe.py CHANGED
@@ -1,9 +1,8 @@
-import uuid
 import time
 import urllib.request
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
+from utils import crawl
 
 i = 1
 while i > -1:
@@ -31,27 +30,8 @@ while i > -1:
             try:
                 article = {}
                 url = "https://www.safe.gov.cn" + url
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'detail_content')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "State Administration of Foregin Exchange of China"
-                article['originalSite'] = "外汇管理局"
-                article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
                 article['category']= "Policy Interpretation"
-                article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d")
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
             except Exception as error:
                 print(error)
 
@@ -81,26 +61,7 @@ while i > -1:
             try:
                 article = {}
                 url = "https://www.safe.gov.cn" + url
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'detail_content')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "State Administration of Foregin Exchange of China"
-                article['originalSite'] = "外汇管理局"
-                article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
                 article['category']= "Data Interpretation"
-                article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d")
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
             except Exception as error:
                 print(error)
stats.py CHANGED
@@ -3,7 +3,7 @@ import time
 import urllib.request
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import encode, translate, sentiment_computation, upsert_content
+from utils import encode, crawl
 
 i = 0
 while i > -1:
@@ -31,26 +31,7 @@ while i > -1:
             try:
                 article = {}
                 url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "National Bureau of Statistics of China"
-                article['originalSite'] = "国家统计局"
-                article['originalTitle'] = page.xpath("//title/text()")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
                 article['category']= "Data Interpretation"
-                article['publishDate'] = date
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
             except Exception as error:
                 print(error)
utils.py CHANGED
@@ -119,16 +119,18 @@ def extract_from_pdf(url):
         pdf_reader = PdfReader(f)
         num_pages = len(pdf_reader.pages)
         extracted_text = ""
-        extracted_text_eng = ""
        for page in range(num_pages):
             text = pdf_reader.pages[page].extract_text()
             if text and text[0].isdigit():
                 text = text[1:]
             first_newline_index = text.find('\n')
-            text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:].replace('\n', '')
-            extracted_text_eng += translator.translate(text, dest='en').text
+            text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:]
             extracted_text += text
-    return extracted_text, extracted_text_eng
+    try:
+        summary = '\n'.join(extracted_text.split('\n')[:2])
+    except:
+        summary = text
+    return extracted_text, summary
 
 def get_db_connection():
     """Get dynamoDB connection"""
@@ -164,27 +166,35 @@ def sentiment_computation(content):
     return sentiment_score, label_dict[sentiment_label]
 
 def crawl(url, article):
-    domain = urlparse(url).netloc
+    domain = '.'.join(urlparse(url).netloc.split('.')[1:])
     req = urllib.request.urlopen(url)
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
-    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
-    article['originTitle'] = encode(page.xpath(xpath_dict[domain]['title']))
-    article['title'] = translate(article['originTitle'])
-    article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
-    article['originContent'] = repr(originContent)
-    if len(article['originContent']) < 10:
+    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
+    article['originSite'] = xpath_dict[domain]['siteCN']
+    article['site'] = xpath_dict[domain]['site']
+    article['titleCN'] = encode(page.xpath(xpath_dict[domain]['title']))
+    article['title'] = translate(article['titleCN'])
+    if 'author' in xpath_dict[domain]:
+        article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
+    else:
+        article['author'] = ""
+    article['contentCN'] = repr(contentCN)
+    if len(article['contentCN']) < 10:
         return None
     CONTENT_ENG = ''
-    for element in originContent.split("\n"):
+    for element in contentCN.split("\n"):
         CONTENT_ENG += translate(element) + '\n'
     article['content'] = repr(CONTENT_ENG)
-    article['subtitle'] = translate(summary)
-    article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime']['format_string'])
+    if 'subtitle' in xpath_dict[domain]:
+        article['subtitle'] = translate(encode(page.xpath(xpath_dict[domain]['subtitle'])))
+    else:
+        article['subtitle'] = translate(summary)
+    article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime_format'])
     article['link'] = url
     article['attachment'] = ""
-    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n",""))
+    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n",""))
     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
     upsert_content(article)
 
@@ -197,9 +207,9 @@ def upsert_content(report):
         'id': str(report['id']),
         'site': report['site'],
         'title': report['title'],
-        # 'originSite': report['originSite'],
-        'originTitle': report['originTitle'],
-        'originContent': report['originContent'],
+        'titleCN': report['titleCN'],
+        'site': report['site'],
+        'contentCN': report['contentCN'],
         'category': report['category'],
         'author': report['author'],
         'content': report['content'],
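Note: the reworked crawl() resolves per-domain selectors from a module-level xpath_dict, but the wiring that builds that dictionary is not part of this diff. A minimal sketch of the assumed setup (file name and placement are assumptions), using the same domain-key scheme crawl() applies:

import json
from urllib.parse import urlparse

# Assumed, not shown in this commit: xpath.json is parsed once at import time.
with open("xpath.json", encoding="utf-8") as f:
    xpath_dict = json.load(f)

def lookup_config(url):
    # Drop the leading host label so "www.stats.gov.cn" matches the "stats.gov.cn" key,
    # mirroring crawl()'s '.'.join(urlparse(url).netloc.split('.')[1:]).
    domain = '.'.join(urlparse(url).netloc.split('.')[1:])
    return xpath_dict[domain]

Under that assumption, lookup_config("https://www.stats.gov.cn/sj/sjjd/") would return the "stats.gov.cn" entry below, with its content XPath and datetime_format.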
xpath.json CHANGED
@@ -2,28 +2,85 @@
     "data.eastmoney.com": {
         "attachment": "//a[contains(@class, 'pdf-link')]/@href",
         "content": "//div[contains(@class, 'ctx-content')]//p",
-        "datetime": {
-            "format_string": "%Y-%m-%d %H:%M:%S.%f"
-        }
+        "datetime": "%Y-%m-%d %H:%M:%S.%f"
     },
-    "www.gov.cn": {
+    "gov.cn": {
         "title": "//title/text()",
         "subtitle": "//meta[@name = 'description']/@content",
         "author": "//meta[@name = 'author']/@content",
         "publishdate": "//meta[@name = 'firstpublishedtime']/@content",
         "content": "//div[contains(@id, 'UCAP-CONTENT')]//p",
-        "datetime": {
-            "format_string": "%Y-%m-%d-%H:%M:%S"
-        }
+        "datetime_format": "%Y-%m-%d-%H:%M:%S",
+        "siteCN": "中国国务院",
+        "site": "State Council of China"
     },
-    "www.csrc.gov.cn": {
+    "csrc.gov.cn": {
         "title": "//meta[@name = 'ArticleTitle']/@content",
         "subtitle": "//meta[@name = 'description']/@content",
         "author": "//meta[@name = 'author']/@content",
         "publishdate": "//meta[@name = 'PubDate']/@content",
         "content": "//div[contains(@class, 'detail-news')]//p",
-        "datetime": {
-            "format_string": "%Y-%m-%d %H:%M:%S"
-        }
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国证监会",
+        "site": "Securities Regulatory Commission of China"
+    },
+    "mof.gov.cn": {
+        "title": "//meta[@name = 'ArticleTitle']/@content",
+        "publishdate": "//meta[@name = 'PubDate']/@content",
+        "content": "//div[contains(@class, 'TRS_Editor')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国财政部",
+        "site": "Ministry of Finance of China"
+    },
+    "mofcom.gov.cn": {
+        "title": "//meta[@name = 'ArticleTitle']/@content",
+        "subtitle": "//meta[@name = 'Description']/@content",
+        "publishdate": "//meta[@name = 'PubDate']/@content",
+        "content": "//div[contains(@class, 'art-con art-con-bottonmLine')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国商务部",
+        "site": "Ministry of Commerce of China"
+    },
+    "ndrc.gov.cn": {
+        "title": "//meta[@name = 'ArticleTitle']/@content",
+        "publishdate": "//meta[@name = 'PubDate']/@content",
+        "content": "//div[contains(@class, 'TRS_Editor') or contains(@class, 'article_l')]",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国国家发展和改革委员会",
+        "site": "National Development and Reform Commission of China"
+    },
+    "pbc.gov.cn": {
+        "title": "//title/text()",
+        "subtitle": "//meta[@name = 'description']/@content",
+        "publishdate": "//meta[@name = '页面生成时间']/@content",
+        "content": "//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国人民银行",
+        "site": "The People's Bank of China"
+    },
+    "safe.gov.cn": {
+        "title": "//meta[@name = 'ArticleTitle']/@content",
+        "subtitle": "//meta[@name = 'Description']/@content",
+        "publishdate": "//meta[@name = 'PubDate']/@content",
+        "content": "//div[contains(@class, 'detail_content')]//p",
+        "datetime_format": "%Y-%m-%d",
+        "siteCN": "中国外汇管理局",
+        "site": "State Administration of Foreign Exchange of China"
+    },
+    "stats.gov.cn": {
+        "title": "//title/text()",
+        "publishdate": "//div[contains(@class, 'detail-title-des')]//p[1]",
+        "content": "//div[contains(@class, 'TRS_Editor')]//p",
+        "datetime_format": "%Y/%m/%d %H:%M",
+        "siteCN": "中国国家统计局",
+        "site": "National Bureau of Statistics of China"
+    },
+    "chinatax.gov.cn": {
+        "title": "//title/text()",
+        "publishdate": "//div[contains(@class, 'detail-title-des')]//p[1]",
+        "content": "//div[contains(@class, 'article')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国国家税务总局",
+        "site": "State Taxation Administration of China"
     }
 }
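datemodifier() is called throughout with the datetime_format strings above but its definition is not part of this diff. A plausible minimal sketch, assuming it only normalizes the raw publish-date metadata to the YYYY-MM-DD form the site scripts compare against:

from datetime import datetime

def datemodifier(date_string, format_string):
    # Assumed behaviour: parse the raw value with the per-site format from xpath.json
    # and return it normalized as "%Y-%m-%d".
    return datetime.strptime(date_string, format_string).strftime("%Y-%m-%d")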