OxbridgeEconomics committed
Commit 42ba1cc · 1 parent: 046bb22 · commit message: "commit"
cbirc.py
CHANGED
@@ -11,30 +11,35 @@ i = 1
 while i > -1:
     CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
     i = i + 1
+    print(CATEGORY_URL)
     content = fetch_url(CATEGORY_URL)
     reportinfo = json.loads(content)
     for article in reportinfo['data']['rows']:
         try:
+            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
+            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
             if parsed_datetime < (datetime.today() - timedelta(days=183)):
                 i = -1
             else:
+                contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+                article['contentCN'] = repr(contentCN)
+                if len(contentCN) < 10:
                     continue
                 CONTENT_ENG = ''
-                for element in article['
+                for element in article['contentCN'].split("\n"):
                     CONTENT_ENG += translate(element) + '\n'
-                article['content'] = CONTENT_ENG
+                article['content'] = repr(CONTENT_ENG)
                 article['site'] = "National Financial Regulatory Administration of China"
                 article['originSite'] = "国家金融监督管理总局"
-                article['title'] = translate(article['
-                article['category']= "Policy Interpretation"
-                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
+                article['titleCN'] = article['docSubtitle']
+                article['title'] = translate(article['docSubtitle'])
+                article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
+                article['category']= "Policy Interpretation"
                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                article['attachment'] = ''
+                article['author'] = ''
+                article['subtitle'] = translate(summary)
                 upsert_content(article)
         except Exception as error:
             print(error)

@@ -46,7 +51,10 @@ while i > -1:
     CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
     i = i + 1
     urllib3.disable_warnings()
+    try:
+        req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
+    except:
+        break
     content = req.read().decode("utf-8")
     reportinfo = json.loads(content)
     for article in reportinfo['searchResultAll']['searchTotal']:
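Every crawler in this commit shares the same stop condition: walk category pages in order and flip the loop index to -1 once an article falls outside the 183-day window. A minimal sketch of that pattern, with hypothetical fetch_page and handle stand-ins for the per-site logic:

import json
from datetime import datetime, timedelta

CUTOFF = datetime.today() - timedelta(days=183)

def crawl_paginated(fetch_page, handle):
    # fetch_page(i) -> JSON string for page i; handle(article) -> None (both hypothetical)
    i = 1
    while i > -1:
        rows = json.loads(fetch_page(i))['data']['rows']
        i = i + 1
        for article in rows:
            # publishDate arrives as "%Y-%m-%d %H:%M:%S"; the date part is enough here
            published = datetime.strptime(article['publishDate'][:10], "%Y-%m-%d")
            if published < CUTOFF:
                i = -1  # stale article reached: the while condition ends the walk after this page
            else:
                handle(article)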
csrc.py
CHANGED
@@ -6,38 +6,36 @@ from datetime import datetime, timedelta
 from lxml import etree
 from utils import encode, translate, sentiment_computation, upsert_content, fetch_url, crawl
 
-# except Exception as error:
-#     print(error)
+i = 1
+while i > -1:
+    if i == 1:
+        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
+    else:
+        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = encode(subpage.xpath("//span[@class='date']"))
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = "http://www.csrc.gov.cn" + url
+                        article['category']= "Policy Interpretation"
+                        crawl(url, article)
+                    except Exception as error:
+                        print(error)
 
 i = 1
 while i > -1:
gov.py
CHANGED
@@ -60,8 +60,6 @@ while i > -1:
                     url = url.replace('../', 'https://www.gov.cn/zhengce/')
                     if "https://www.gov.cn" in url:
                         article['category']= "Policy Interpretation"
-                        article['originSite'] = "国务院"
-                        article['site'] = "State Council of China"
                         crawl(url, article)
                 except Exception as error:
                     print(error)
mof.py
CHANGED
@@ -1,9 +1,8 @@
-import uuid
 import time
 import urllib.request
 from lxml import etree
 from datetime import datetime, timedelta
-from utils import
+from utils import crawl
 
 i = 0
 while i > -1:

@@ -32,27 +31,8 @@ while i > -1:
                     article = {}
                     url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                     url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "Ministry of Finance of China"
-                    article['originalSite'] = "财政部"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Financial News"
-
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)

@@ -82,90 +62,7 @@ while i > -1:
                 try:
                     article = {}
                     url = url.replace("./", CATEGORY_URL)
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "Ministry of Finance of China"
-                    article['originalSite'] = "财政部"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Policy Interpretation"
-
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
-
-# i = 0
-# while i > -1:
-#     if i == 0:
-#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"
-#     else:
-#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/index_{i}.htm"
-#     i = i + 1
-#     req = urllib.request.urlopen(CATEGORY_URL)
-#     text = req.read()
-#     html_text = text.decode("utf-8")
-#     page = etree.HTML(html_text)
-#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-#     for article in articlelist:
-#         if isinstance(article, etree._Element):
-#             subelement = etree.tostring(article).decode()
-#             subpage = etree.HTML(subelement)
-#             date = subpage.xpath("//span/text()")[0]
-#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
-#                 i = -1
-#             else:
-#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-#                 for url in urls:
-#                     try:
-#                         article = {}
-#                         url = url.replace("./", CATEGORY_URL)
-#                         print(url)
-#                         req = urllib.request.urlopen(url)
-#                         text = req.read()
-#                         html_text = text.decode("utf-8")
-#                         page = etree.HTML(html_text)
-#                         attachments = page.xpath("//span[contains(@id, 'appendix1')]/a/@href")
-#                         print(attachments)
-#                         if len(attachments) > 0:
-#                             for attachment_url in attachments:
-#                                 if '.pdf' in attachment_url:
-#                                     attachment_url = attachment_url.replace("./", "https://zyhj.mof.gov.cn/zcfb/202403/")
-#                                     article['originalContent'] = extract_from_pdf(attachment_url)
-#                                 if '.doc' in attachment_url:
-#                                     continue
-#                                 if '.docx' in attachment_url:
-#                                     continue
-#                         else:
-#                             article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]/p[@align='justify']"))
-#                         print(article['originalContent'])
-#                         if len(article['originalContent']) < 10:
-#                             continue
-#                         CONTENT_ENG = ''
-#                         for element in article['originalContent'].split("。"):
-#                             CONTENT_ENG += translate(element) + ' '
-#                         article['content'] = CONTENT_ENG
-#                         article['site'] = "Ministry of Finance"
-#                         article['originalSite'] = "财政部"
-#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-#                         article['title'] = translate(article['originalTitle'])
-#                         article['url'] = url
-#                         article['category']= "Policy Release"
-#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
-#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-#                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-#                         # upsert_content(article)
-#                     except Exception as error:
-#                         print(error)
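With this refactor the per-site scripts (the same change lands in mofcom.py, ndrc.py, safe.py, stats.py and gov.py) keep only their listing logic: the caller resolves the absolute URL, sets article['category'], and hands off to utils.crawl, which scrapes, translates, scores and upserts. A sketch of the caller-side contract, assuming the utils.py from this commit is importable (process_listing is a hypothetical helper; the scripts inline this loop):

from utils import crawl

def process_listing(urls):
    for url in urls:
        try:
            article = {}  # crawl() fills site, title, content, dates, sentiment
            url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
            article['category'] = "Financial News"  # the only field the caller owns
            crawl(url, article)
        except Exception as error:
            print(error)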
mofcom.py
CHANGED
@@ -1,9 +1,8 @@
-import uuid
 import time
 import urllib.request
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import
+from utils import crawl
 
 categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
 for category in categories:

@@ -37,25 +36,6 @@ for category in categories:
                         article['category']= "Policy Interpretation"
                     else:
                         article['category']= "Policy Release"
-
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'art-con art-con-bottonmLine')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "Ministry of Commerce of China"
-                    article['originalSite'] = "商务部"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
-                    article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S"))
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
ndrc.py
CHANGED
@@ -3,7 +3,7 @@ import uuid
 import time
 import urllib.request
 from lxml import etree
-from utils import
+from utils import crawl
 
 i = 0
 while i > -1:

@@ -30,71 +30,15 @@ while i > -1:
         for url in urls:
             try:
                 article = {}
-                if "https://www.gov.cn" in url:
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "State Council of China"
-                    article['originalSite'] = "国务院"
-                    article['originalTitle'] = page.xpath("//title/text()")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
+                if "www.gov.cn" in url:
                     article['category']= "Policy Release"
-                    article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0], "%Y-%m-%d-%H:%M:%S")
                 elif "../../zcfb/" in url:
                     url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
-                    print(url)
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor') or contains(@class, 'article_l')]"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "National Development and Reform Commission of China"
-                    article['originalSite'] = "国家发展和改革委员会"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Policy Release"
-                    article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
                 else:
                     url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                     url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                    print(url)
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "National Development and Reform Commission of China"
-                    article['originalSite'] = "国家发展和改革委员会"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Policy Interpretation"
-
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
             except Exception as error:
                 print(error)
pbc.py
CHANGED
@@ -13,37 +13,45 @@ while i > -1:
     j = i + 1
     CATEGORY_URL = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{j}.html"
     i = i + 1
-    response = requests.get(CATEGORY_URL, timeout=
+    response = requests.get(CATEGORY_URL, timeout=30)
     page = etree.HTML(response.text)
-    article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
-    if len(article['originalContent']) < 10:
-        continue
-    for element in article['originalContent'].split("。"):
-        CONTENT_ENG += translate(element) + ' '
-    article['content'] = CONTENT_ENG
-    article['site'] = "The People's Bank of China"
-    article['originalSite'] = "中国人民银行"
-    article['originalTitle'] = page.xpath("//title/text()")[0]
-    article['title'] = translate(article['originalTitle'])
-    article['url'] = url
-    article['category']= "Policy Interpretation"
-    article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
-    parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d")), "%Y-%m-%d")
-    if parsed_datetime < (datetime.today() - timedelta(days=183)):
-        i = -1
-    else:
+    articlelist = page.xpath("//td[contains(@height, '22')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")
+            try:
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            except:
+                continue
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = "http://www.pbc.gov.cn" + url
+                        response = requests.get(url, timeout=20)
+                        response.encoding = 'utf-8'
+                        page = etree.HTML(response.text)
+                        article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
+                        if len(article['originalContent']) < 10:
+                            continue
+                        CONTENT_ENG = ''
+                        for element in article['originalContent'].split("。"):
+                            CONTENT_ENG += translate(element) + ' '
+                        article['content'] = CONTENT_ENG
+                        article['site'] = "The People's Bank of China"
+                        article['originalSite'] = "中国人民银行"
+                        article['originalTitle'] = page.xpath("//title/text()")[0]
+                        article['title'] = translate(article['originalTitle'])
+                        article['url'] = url
+                        article['category']= "Policy Interpretation"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                        upsert_content(article)
+                    except Exception as error:
+                        print(error)
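pbc.py keeps its scraping inline rather than delegating to utils.crawl, and its new try/except around the date parse is load-bearing: subpage.xpath("//span/text()") returns a list, so time.strptime raises unless the row actually carries a date string. A small illustration of the failure the guard absorbs:

import time
from datetime import datetime

date = ["2024-03-01"]  # illustrative value; xpath text() queries yield lists, not strings
try:
    parsed = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
except Exception:
    parsed = None  # time.strptime(list, ...) raises TypeError, so the row is skipped
print(parsed)  # None; only date[0] would have parsed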
safe.py
CHANGED
@@ -1,9 +1,8 @@
-import uuid
 import time
 import urllib.request
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import
+from utils import crawl
 
 i = 1
 while i > -1:

@@ -31,27 +30,8 @@ while i > -1:
                 try:
                     article = {}
                     url = "https://www.safe.gov.cn" + url
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'detail_content')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "State Administration of Foregin Exchange of China"
-                    article['originalSite'] = "外汇管理局"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Policy Interpretation"
-
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)

@@ -81,26 +61,7 @@ while i > -1:
                 try:
                     article = {}
                     url = "https://www.safe.gov.cn" + url
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'detail_content')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "State Administration of Foregin Exchange of China"
-                    article['originalSite'] = "外汇管理局"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Data Interpretation"
-
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
stats.py
CHANGED
@@ -3,7 +3,7 @@ import time
 import urllib.request
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import encode,
+from utils import encode, crawl
 
 i = 0
 while i > -1:

@@ -31,26 +31,7 @@ while i > -1:
                 try:
                     article = {}
                     url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "National Bureau of Statistics of China"
-                    article['originalSite'] = "国家统计局"
-                    article['originalTitle'] = page.xpath("//title/text()")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Data Interpretation"
-                    article
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
utils.py
CHANGED
@@ -119,16 +119,18 @@ def extract_from_pdf(url):
         pdf_reader = PdfReader(f)
         num_pages = len(pdf_reader.pages)
         extracted_text = ""
-        extracted_text_eng = ""
         for page in range(num_pages):
            text = pdf_reader.pages[page].extract_text()
            if text and text[0].isdigit():
                text = text[1:]
            first_newline_index = text.find('\n')
            text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:]
-           extracted_text_eng += translator.translate(text, dest='en').text
            extracted_text += text
+    try:
+        summary = '\n'.join(extracted_text.split('\n')[:2])
+    except:
+        summary = text
+    return extracted_text, summary
 
 def get_db_connection():
     """Get dynamoDB connection"""

@@ -164,27 +166,35 @@ def sentiment_computation(content):
     return sentiment_score, label_dict[sentiment_label]
 
 def crawl(url, article):
-    domain = urlparse(url).netloc
+    domain = '.'.join(urlparse(url).netloc.split('.')[1:])
     req = urllib.request.urlopen(url)
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
+    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
+    article['originSite'] = xpath_dict[domain]['siteCN']
+    article['site'] = xpath_dict[domain]['site']
+    article['titleCN'] = encode(page.xpath(xpath_dict[domain]['title']))
+    article['title'] = translate(article['titleCN'])
+    if 'author' in xpath_dict[domain]:
+        article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
+    else:
+        article['author'] = ""
+    article['contentCN'] = repr(contentCN)
+    if len(article['contentCN']) < 10:
         return None
     CONTENT_ENG = ''
+    for element in contentCN.split("\n"):
         CONTENT_ENG += translate(element) + '\n'
     article['content'] = repr(CONTENT_ENG)
+    if 'subtitle' in xpath_dict[domain]:
+        article['subtitle'] = translate(encode(page.xpath(xpath_dict[domain]['subtitle'])))
+    else:
+        article['subtitle'] = translate(summary)
+    article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime_format'])
     article['link'] = url
     article['attachment'] = ""
-    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(
+    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n",""))
     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
     upsert_content(article)

@@ -197,9 +207,9 @@ def upsert_content(report):
         'id': str(report['id']),
         'site': report['site'],
         'title': report['title'],
-        '
-        '
+        'titleCN': report['titleCN'],
+        'site': report['site'],
+        'contentCN': report['contentCN'],
         'category': report['category'],
         'author': report['author'],
         'content': report['content'],
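The rewritten crawl() dispatches on xpath_dict[domain], whose definition is not shown in this diff (nor is encode_content, which it also calls); given xpath.json below, the table is presumably loaded once at module import. A sketch under that assumption (the file name comes from this repo, the loading code itself is guessed):

import json
from urllib.parse import urlparse

with open('xpath.json', encoding='utf-8') as f:  # assumption: xpath.json sits next to utils.py
    xpath_dict = json.load(f)

def domain_key(url):
    # drop the host's first label, as crawl() does: 'www.gov.cn' -> 'gov.cn'
    return '.'.join(urlparse(url).netloc.split('.')[1:])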
xpath.json
CHANGED
@@ -2,28 +2,85 @@
 "data.eastmoney.com": {
     "attachment": "//a[contains(@class, 'pdf-link')]/@href",
     "content": "//div[contains(@class, 'ctx-content')]//p",
-    "datetime":
-    "format_string": "%Y-%m-%d %H:%M:%S.%f"
-}
+    "datetime": "%Y-%m-%d %H:%M:%S.%f"
 },
-"
+"gov.cn": {
     "title": "//title/text()",
     "subtitle": "//meta[@name = 'description']/@content",
     "author": "//meta[@name = 'author']/@content",
     "publishdate": "//meta[@name = 'firstpublishedtime']/@content",
     "content": "//div[contains(@id, 'UCAP-CONTENT')]//p",
-    "
+    "datetime_format": "%Y-%m-%d-%H:%M:%S",
+    "siteCN": "中国国务院",
+    "site": "State Council of China"
 },
-"
+"csrc.gov.cn": {
     "title": "//meta[@name = 'ArticleTitle']/@content",
     "subtitle": "//meta[@name = 'description']/@content",
     "author": "//meta[@name = 'author']/@content",
     "publishdate": "//meta[@name = 'PubDate']/@content",
     "content": "//div[contains(@class, 'detail-news')]//p",
-    "
+    "datetime_format": "%Y-%m-%d %H:%M:%S",
+    "siteCN": "中国证监会",
+    "site": "Securities Regulatory Commission of China"
+},
+"mof.gov.cn": {
+    "title": "//meta[@name = 'ArticleTitle']/@content",
+    "publishdate": "//meta[@name = 'PubDate']/@content",
+    "content": "//div[contains(@class, 'TRS_Editor')]//p",
+    "datetime_format": "%Y-%m-%d %H:%M:%S",
+    "siteCN": "中国财政部",
+    "site": "Ministry of Finance of China"
+},
+"mofcom.gov.cn": {
+    "title": "//meta[@name = 'ArticleTitle']/@content",
+    "subtitle": "//meta[@name = 'Description']/@content",
+    "publishdate": "//meta[@name = 'PubDate']/@content",
+    "content": "//div[contains(@class, 'art-con art-con-bottonmLine')]//p",
+    "datetime_format": "%Y-%m-%d %H:%M:%S",
+    "siteCN": "中国商务部",
+    "site": "Ministry of Commerce of China"
+},
+"ndrc.gov.cn": {
+    "title": "//meta[@name = 'ArticleTitle']/@content",
+    "publishdate": "//meta[@name = 'PubDate']/@content",
+    "content": "//div[contains(@class, 'TRS_Editor') or contains(@class, 'article_l')]",
+    "datetime_format": "%Y-%m-%d %H:%M:%S",
+    "siteCN": "中国国家发展和改革委员会",
+    "site": "National Development and Reform Commission of China"
+},
+"pbc.gov.cn": {
+    "title": "//title/text()",
+    "subtitle": "//meta[@name = 'description']/@content",
+    "publishdate": "//meta[@name = '页面生成时间']/@content",
+    "content": "//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p",
+    "datetime_format": "%Y-%m-%d %H:%M:%S",
+    "siteCN": "中国人民银行",
+    "site": "The People's Bank of China"
+},
+"safe.gov.cn": {
+    "title": "//meta[@name = 'ArticleTitle']/@content",
+    "subtitle": "//meta[@name = 'Description']/@content",
+    "publishdate": "//meta[@name = 'PubDate']/@content",
+    "content": "//div[contains(@class, 'detail_content')]//p",
+    "datetime_format": "%Y-%m-%d",
+    "siteCN": "中国外汇管理局",
+    "site": "State Administration of Foreign Exchange of China"
+},
+"stats.gov.cn": {
+    "title": "//title/text()",
+    "publishdate": "//div[contains(@class, 'detail-title-des')]//p[1]",
+    "content": "//div[contains(@class, 'TRS_Editor')]//p",
+    "datetime_format": "%Y/%m/%d %H:%M",
+    "siteCN": "中国国家统计局",
+    "site": "National Bureau of Statistics of China"
+},
+"chinatax.gov.cn": {
+    "title": "//title/text()",
+    "publishdate": "//div[contains(@class, 'detail-title-des')]//p[1]",
+    "content": "//div[contains(@class, 'article')]//p",
+    "datetime_format": "%Y-%m-%d %H:%M:%S",
+    "siteCN": "中国国家税务总局",
+    "site": "State Taxation Administration of China"
 }
 }
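Each top-level key in this table is the hostname minus its first label, which is exactly the key the rewritten crawl() computes before indexing it. For example (hypothetical article URL, lookups taken from the entries above):

from urllib.parse import urlparse

url = "https://www.safe.gov.cn/safe/2024/0101/example.html"  # hypothetical article URL
domain = '.'.join(urlparse(url).netloc.split('.')[1:])       # -> "safe.gov.cn"
# crawl() then reads, e.g.:
#   xpath_dict[domain]['content']         -> "//div[contains(@class, 'detail_content')]//p"
#   xpath_dict[domain]['datetime_format'] -> "%Y-%m-%d"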