gavinzli
committed on
Commit
·
ec13f7a
1
Parent(s):
b348cfd
commit
Browse files
- .github/workflows/eastmoney.yml +1 -1
- .github/workflows/gov.yml +1 -1
- .github/workflows/mof.yml +1 -1
- .github/workflows/ndrc.yml +1 -1
- .github/workflows/pbc.yml +1 -1
- gov.py +5 -2
- mof.py +8 -2
- ndrc.py +4 -1
- pbc.py +27 -26
.github/workflows/eastmoney.yml
CHANGED
@@ -14,7 +14,7 @@ permissions:
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
-
runs-on:
|
18 |
timeout-minutes: 7200
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
+
runs-on: self-hosted
|
18 |
timeout-minutes: 7200
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
.github/workflows/gov.yml
CHANGED
@@ -14,7 +14,7 @@ permissions:
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
-
runs-on:
|
18 |
timeout-minutes: 14400
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
+
runs-on: self-hosted
|
18 |
timeout-minutes: 14400
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
.github/workflows/mof.yml
CHANGED
@@ -14,7 +14,7 @@ permissions:
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
-
runs-on:
|
18 |
timeout-minutes: 14400
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
+
runs-on: self-hosted
|
18 |
timeout-minutes: 14400
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
.github/workflows/ndrc.yml
CHANGED
@@ -14,7 +14,7 @@ permissions:
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
-
runs-on:
|
18 |
timeout-minutes: 14400
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
+
runs-on: self-hosted
|
18 |
timeout-minutes: 14400
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
.github/workflows/pbc.yml
CHANGED
@@ -14,7 +14,7 @@ permissions:
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
-
runs-on:
|
18 |
timeout-minutes: 14400
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
|
|
14 |
jobs:
|
15 |
build:
|
16 |
|
17 |
+
runs-on: self-hosted
|
18 |
timeout-minutes: 14400
|
19 |
steps:
|
20 |
- uses: actions/checkout@v3
|
gov.py
CHANGED
@@ -125,6 +125,7 @@ while i > -1:
|
|
125 |
categoryu_url = "https://www.gov.cn/zhengce/jiedu/home.htm"
|
126 |
else:
|
127 |
categoryu_url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
|
|
|
128 |
req = urllib.request.urlopen(categoryu_url)
|
129 |
text = req.read()
|
130 |
html_text = text.decode("utf-8")
|
@@ -137,7 +138,6 @@ while i > -1:
|
|
137 |
date = subpage.xpath("//span/text()")[0]
|
138 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
|
139 |
if parsed_datetime < (datetime.today() - timedelta(days=183)):
|
140 |
-
print(categoryu_url)
|
141 |
i = -1
|
142 |
else:
|
143 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
@@ -195,6 +195,7 @@ while i > -1:
|
|
195 |
categoryu_url = "https://www.gov.cn/zhengce/zuixin/home.htm"
|
196 |
else:
|
197 |
categoryu_url = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
|
|
|
198 |
req = urllib.request.urlopen(categoryu_url)
|
199 |
text = req.read()
|
200 |
html_text = text.decode("utf-8")
|
@@ -206,7 +207,9 @@ while i > -1:
|
|
206 |
subpage = etree.HTML(subelement)
|
207 |
date = subpage.xpath("//span/text()")[0]
|
208 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
|
209 |
-
if parsed_datetime
|
|
|
|
|
210 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
211 |
for url in urls:
|
212 |
try:
|
|
|
125 |
categoryu_url = "https://www.gov.cn/zhengce/jiedu/home.htm"
|
126 |
else:
|
127 |
categoryu_url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
|
128 |
+
i = i + 1
|
129 |
req = urllib.request.urlopen(categoryu_url)
|
130 |
text = req.read()
|
131 |
html_text = text.decode("utf-8")
|
|
|
138 |
date = subpage.xpath("//span/text()")[0]
|
139 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
|
140 |
if parsed_datetime < (datetime.today() - timedelta(days=183)):
|
|
|
141 |
i = -1
|
142 |
else:
|
143 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
|
|
195 |
categoryu_url = "https://www.gov.cn/zhengce/zuixin/home.htm"
|
196 |
else:
|
197 |
categoryu_url = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
|
198 |
+
i = i + 1
|
199 |
req = urllib.request.urlopen(categoryu_url)
|
200 |
text = req.read()
|
201 |
html_text = text.decode("utf-8")
|
|
|
207 |
subpage = etree.HTML(subelement)
|
208 |
date = subpage.xpath("//span/text()")[0]
|
209 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
|
210 |
+
if parsed_datetime < (datetime.today() - timedelta(days=183)):
|
211 |
+
i = -1
|
212 |
+
else:
|
213 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
214 |
for url in urls:
|
215 |
try:
|
mof.py
CHANGED
@@ -121,6 +121,7 @@ while i > -1:
|
|
121 |
categoryu_url = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
|
122 |
else:
|
123 |
categoryu_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
|
|
|
124 |
req = urllib.request.urlopen(categoryu_url)
|
125 |
text = req.read()
|
126 |
html_text = text.decode("utf-8")
|
@@ -132,7 +133,9 @@ while i > -1:
|
|
132 |
subpage = etree.HTML(subelement)
|
133 |
date = subpage.xpath("//span/text()")[0]
|
134 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
|
135 |
-
if parsed_datetime
|
|
|
|
|
136 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
137 |
for url in urls:
|
138 |
try:
|
@@ -250,6 +253,7 @@ while i > -1:
|
|
250 |
categoryu_url = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
|
251 |
else:
|
252 |
categoryu_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
|
|
|
253 |
req = urllib.request.urlopen(categoryu_url)
|
254 |
text = req.read()
|
255 |
html_text = text.decode("utf-8")
|
@@ -261,7 +265,9 @@ while i > -1:
|
|
261 |
subpage = etree.HTML(subelement)
|
262 |
date = subpage.xpath("//span/text()")[0]
|
263 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
|
264 |
-
if parsed_datetime
|
|
|
|
|
265 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
266 |
for url in urls:
|
267 |
try:
|
|
|
121 |
categoryu_url = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
|
122 |
else:
|
123 |
categoryu_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
|
124 |
+
i = i + 1
|
125 |
req = urllib.request.urlopen(categoryu_url)
|
126 |
text = req.read()
|
127 |
html_text = text.decode("utf-8")
|
|
|
133 |
subpage = etree.HTML(subelement)
|
134 |
date = subpage.xpath("//span/text()")[0]
|
135 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
|
136 |
+
if parsed_datetime < (datetime.today() - timedelta(days=183)):
|
137 |
+
i = -1
|
138 |
+
else:
|
139 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
140 |
for url in urls:
|
141 |
try:
|
|
|
253 |
categoryu_url = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
|
254 |
else:
|
255 |
categoryu_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
|
256 |
+
i = i + 1
|
257 |
req = urllib.request.urlopen(categoryu_url)
|
258 |
text = req.read()
|
259 |
html_text = text.decode("utf-8")
|
|
|
265 |
subpage = etree.HTML(subelement)
|
266 |
date = subpage.xpath("//span/text()")[0]
|
267 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
|
268 |
+
if parsed_datetime < (datetime.today() - timedelta(days=183)):
|
269 |
+
i = -1
|
270 |
+
else:
|
271 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
272 |
for url in urls:
|
273 |
try:
|
ndrc.py
CHANGED
@@ -196,6 +196,7 @@ while i > -1:
|
|
196 |
categoryu_url = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
|
197 |
else:
|
198 |
categoryu_url = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
|
|
|
199 |
req = urllib.request.urlopen(categoryu_url)
|
200 |
text = req.read()
|
201 |
html_text = text.decode("utf-8")
|
@@ -207,7 +208,9 @@ while i > -1:
|
|
207 |
subpage = etree.HTML(subelement)
|
208 |
date = subpage.xpath("//span/text()")[0]
|
209 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
|
210 |
-
if parsed_datetime
|
|
|
|
|
211 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
212 |
for url in urls:
|
213 |
try:
|
|
|
196 |
categoryu_url = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
|
197 |
else:
|
198 |
categoryu_url = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
|
199 |
+
i = i + 1
|
200 |
req = urllib.request.urlopen(categoryu_url)
|
201 |
text = req.read()
|
202 |
html_text = text.decode("utf-8")
|
|
|
208 |
subpage = etree.HTML(subelement)
|
209 |
date = subpage.xpath("//span/text()")[0]
|
210 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
|
211 |
+
if parsed_datetime < (datetime.today() - timedelta(days=183)):
|
212 |
+
i = -1
|
213 |
+
else:
|
214 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
215 |
for url in urls:
|
216 |
try:
|
pbc.py
CHANGED
@@ -127,7 +127,7 @@ while i > -1:
|
|
127 |
else:
|
128 |
j = i + 1
|
129 |
categoryu_url = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{j}.html"
|
130 |
-
|
131 |
response = requests.get(categoryu_url)
|
132 |
page = etree.HTML(response.text)
|
133 |
urls = page.xpath("//td[contains(@height,'22')]//a[contains(@target, '_blank')]/@href")
|
@@ -153,30 +153,31 @@ while i > -1:
|
|
153 |
article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0])
|
154 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d")), "%Y-%m-%d")
|
155 |
if parsed_datetime < (datetime.today() - timedelta(days=183)):
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
|
|
181 |
except Exception as error:
|
182 |
print(error)
|
|
|
127 |
else:
|
128 |
j = i + 1
|
129 |
categoryu_url = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{j}.html"
|
130 |
+
i = i + 1
|
131 |
response = requests.get(categoryu_url)
|
132 |
page = etree.HTML(response.text)
|
133 |
urls = page.xpath("//td[contains(@height,'22')]//a[contains(@target, '_blank')]/@href")
|
|
|
153 |
article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0])
|
154 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d")), "%Y-%m-%d")
|
155 |
if parsed_datetime < (datetime.today() - timedelta(days=183)):
|
156 |
+
i = -1
|
157 |
+
else:
|
158 |
+
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
|
159 |
+
label_dict = {
|
160 |
+
"positive": "+",
|
161 |
+
"negative": "-",
|
162 |
+
"neutral": "0",
|
163 |
+
}
|
164 |
+
sentiment_score = 0
|
165 |
+
maximum_value = 0
|
166 |
+
raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
|
167 |
+
sentiment_label = None
|
168 |
+
for sentiment_dict in raw_sentiment[0]:
|
169 |
+
value = sentiment_dict["score"]
|
170 |
+
if value > maximum_value:
|
171 |
+
sentiment_label = sentiment_dict["label"]
|
172 |
+
maximum_value = value
|
173 |
+
if sentiment_dict["label"] == "positive":
|
174 |
+
sentiment_score = sentiment_score + value
|
175 |
+
if sentiment_dict["label"] == "negative":
|
176 |
+
sentiment_score = sentiment_score - value
|
177 |
+
else:
|
178 |
+
sentiment_score = sentiment_score + 0
|
179 |
+
article['sentimentScore'] = sentiment_score
|
180 |
+
article['sentimentLabel'] = label_dict[sentiment_label]
|
181 |
+
upsert_content(article)
|
182 |
except Exception as error:
|
183 |
print(error)
|