gavinzli committed
Commit ec13f7a · 1 Parent(s): b348cfd
.github/workflows/eastmoney.yml CHANGED
@@ -14,7 +14,7 @@ permissions:
 jobs:
   build:
 
-    runs-on: data-collection
+    runs-on: self-hosted
     timeout-minutes: 7200
     steps:
       - uses: actions/checkout@v3
.github/workflows/gov.yml CHANGED
@@ -14,7 +14,7 @@ permissions:
 jobs:
   build:
 
-    runs-on: data-collection
+    runs-on: self-hosted
     timeout-minutes: 14400
     steps:
       - uses: actions/checkout@v3
.github/workflows/mof.yml CHANGED
@@ -14,7 +14,7 @@ permissions:
 jobs:
   build:
 
-    runs-on: data-collection
+    runs-on: self-hosted
     timeout-minutes: 14400
     steps:
       - uses: actions/checkout@v3
.github/workflows/ndrc.yml CHANGED
@@ -14,7 +14,7 @@ permissions:
 jobs:
   build:
 
-    runs-on: data-collection
+    runs-on: self-hosted
     timeout-minutes: 14400
     steps:
       - uses: actions/checkout@v3
.github/workflows/pbc.yml CHANGED
@@ -14,7 +14,7 @@ permissions:
 jobs:
   build:
 
-    runs-on: data-collection
+    runs-on: self-hosted
     timeout-minutes: 14400
     steps:
       - uses: actions/checkout@v3
gov.py CHANGED
@@ -125,6 +125,7 @@ while i > -1:
         categoryu_url = "https://www.gov.cn/zhengce/jiedu/home.htm"
     else:
         categoryu_url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
+    i = i + 1
     req = urllib.request.urlopen(categoryu_url)
     text = req.read()
     html_text = text.decode("utf-8")
@@ -137,7 +138,6 @@ while i > -1:
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
         if parsed_datetime < (datetime.today() - timedelta(days=183)):
-            print(categoryu_url)
             i = -1
         else:
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
@@ -195,6 +195,7 @@ while i > -1:
         categoryu_url = "https://www.gov.cn/zhengce/zuixin/home.htm"
     else:
         categoryu_url = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
+    i = i + 1
     req = urllib.request.urlopen(categoryu_url)
     text = req.read()
     html_text = text.decode("utf-8")
@@ -206,7 +207,9 @@ while i > -1:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=183)):
+        if parsed_datetime < (datetime.today() - timedelta(days=183)):
+            i = -1
+        else:
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
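
Taken together, the gov.py hunks fix two related bugs: the page counter i was never advanced inside the `while i > -1:` loop, so the scraper re-requested the first index page forever, and the zuixin section gated its work on a bare `if parsed_datetime > ...` that never set i = -1, leaving the loop no exit. A minimal sketch of the corrected pagination pattern; fetch_page and process are hypothetical stand-ins for the urllib/lxml scraping and the per-article handling, and the stop condition assumes newest-first listings:

from datetime import datetime, timedelta

def fetch_page(url):
    """Hypothetical stand-in for the urllib/lxml scraping in gov.py:
    returns (publish_datetime, article_url) pairs for one index page.
    Returns one stale item here so the demo terminates."""
    return [(datetime(2000, 1, 1), url + "#article")]

def process(article_url):
    """Hypothetical stand-in for the per-article download and upsert."""
    print(article_url)

cutoff = datetime.today() - timedelta(days=183)  # the ~six-month window used throughout

i = 0
while i > -1:
    # The first index page has a different filename from the numbered ones.
    if i == 0:
        url = "https://www.gov.cn/zhengce/jiedu/home.htm"
    else:
        url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
    i = i + 1  # the fix: without this the loop re-requests the same page forever
    for published, article_url in fetch_page(url):
        if published < cutoff:
            i = -1  # newest-first listing: one stale item means all later pages are stale
        else:
            process(article_url)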
mof.py CHANGED
@@ -121,6 +121,7 @@ while i > -1:
         categoryu_url = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
     else:
         categoryu_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+    i = i + 1
     req = urllib.request.urlopen(categoryu_url)
     text = req.read()
     html_text = text.decode("utf-8")
@@ -132,7 +133,9 @@ while i > -1:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=183)):
+        if parsed_datetime < (datetime.today() - timedelta(days=183)):
+            i = -1
+        else:
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
@@ -250,6 +253,7 @@ while i > -1:
         categoryu_url = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
     else:
         categoryu_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+    i = i + 1
     req = urllib.request.urlopen(categoryu_url)
     text = req.read()
     html_text = text.decode("utf-8")
@@ -261,7 +265,9 @@ while i > -1:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=183)):
+        if parsed_datetime < (datetime.today() - timedelta(days=183)):
+            i = -1
+        else:
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
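
Unrelated to the fix but visible in every hunk: the parsed_datetime expression round-trips the date string through time.strptime and time.strftime before handing it to datetime.strptime. For a string already in %Y-%m-%d form the round trip is a no-op, so the whole expression collapses to a single direct parse. A sketch of the equivalence (a simplification, not what the commit changes):

import time
from datetime import datetime

date = "2024-03-15"
# What the scripts do: str -> struct_time -> str -> datetime.
roundtrip = datetime.strptime(
    time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
# Equivalent direct parse for a string already in %Y-%m-%d form.
direct = datetime.strptime(date, "%Y-%m-%d")
assert roundtrip == direct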
ndrc.py CHANGED
@@ -196,6 +196,7 @@ while i > -1:
         categoryu_url = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
     else:
         categoryu_url = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
+    i = i + 1
     req = urllib.request.urlopen(categoryu_url)
     text = req.read()
     html_text = text.decode("utf-8")
@@ -207,7 +208,9 @@ while i > -1:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=183)):
+        if parsed_datetime < (datetime.today() - timedelta(days=183)):
+            i = -1
+        else:
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
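
ndrc.py is the one script whose listings print dates with slashes, hence the %Y/%m/%d input format in its strptime call while the comparison still runs on a datetime. A sketch of folding both site formats into one helper (hypothetical; the scripts hard-code the format per site):

from datetime import datetime

def parse_listing_date(raw):
    """Hypothetical helper: accept either 2024-03-15 or 2024/03/15."""
    for fmt in ("%Y-%m-%d", "%Y/%m/%d"):
        try:
            return datetime.strptime(raw, fmt)
        except ValueError:
            continue
    raise ValueError(f"unrecognised date: {raw!r}")

assert parse_listing_date("2024/03/15") == parse_listing_date("2024-03-15")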
pbc.py CHANGED
@@ -127,7 +127,7 @@ while i > -1:
     else:
         j = i + 1
         categoryu_url = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{j}.html"
-
+    i = i + 1
     response = requests.get(categoryu_url)
     page = etree.HTML(response.text)
     urls = page.xpath("//td[contains(@height,'22')]//a[contains(@target, '_blank')]/@href")
@@ -153,30 +153,31 @@ while i > -1:
             article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0])
             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d")), "%Y-%m-%d")
             if parsed_datetime < (datetime.today() - timedelta(days=183)):
-                continue
-            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-            label_dict = {
-                "positive": "+",
-                "negative": "-",
-                "neutral": "0",
-            }
-            sentiment_score = 0
-            maximum_value = 0
-            raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
-            sentiment_label = None
-            for sentiment_dict in raw_sentiment[0]:
-                value = sentiment_dict["score"]
-                if value > maximum_value:
-                    sentiment_label = sentiment_dict["label"]
-                    maximum_value = value
-                if sentiment_dict["label"] == "positive":
-                    sentiment_score = sentiment_score + value
-                if sentiment_dict["label"] == "negative":
-                    sentiment_score = sentiment_score - value
-                else:
-                    sentiment_score = sentiment_score + 0
-            article['sentimentScore'] = sentiment_score
-            article['sentimentLabel'] = label_dict[sentiment_label]
-            upsert_content(article)
+                i = -1
+            else:
+                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                label_dict = {
+                    "positive": "+",
+                    "negative": "-",
+                    "neutral": "0",
+                }
+                sentiment_score = 0
+                maximum_value = 0
+                raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+                sentiment_label = None
+                for sentiment_dict in raw_sentiment[0]:
+                    value = sentiment_dict["score"]
+                    if value > maximum_value:
+                        sentiment_label = sentiment_dict["label"]
+                        maximum_value = value
+                    if sentiment_dict["label"] == "positive":
+                        sentiment_score = sentiment_score + value
+                    if sentiment_dict["label"] == "negative":
+                        sentiment_score = sentiment_score - value
+                    else:
+                        sentiment_score = sentiment_score + 0
+                article['sentimentScore'] = sentiment_score
+                article['sentimentLabel'] = label_dict[sentiment_label]
+                upsert_content(article)
         except Exception as error:
             print(error)
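
pbc.py gets the same treatment with a larger re-indent: the old code hit an out-of-window article with continue (skipping one item while i was never incremented, so the crawl never ended), whereas the new code sets i = -1 and runs the whole scoring block only under else. The scoring loop itself picks the highest-probability label and accumulates P(positive) minus P(negative); a condensed sketch of the same logic, assuming the [[{'label': ..., 'score': ...}, ...]] shape that a transformers text-classification pipeline returns when all scores are kept:

def score_sentiment(raw_sentiment):
    """Condensed sketch of pbc.py's scoring loop; raw_sentiment mimics the
    nested list-of-dicts output of the sentiment pipeline."""
    label_dict = {"positive": "+", "negative": "-", "neutral": "0"}
    best = max(raw_sentiment[0], key=lambda d: d["score"])  # argmax label
    score = sum(d["score"] if d["label"] == "positive"
                else -d["score"] if d["label"] == "negative"
                else 0.0
                for d in raw_sentiment[0])  # P(positive) - P(negative)
    return score, label_dict[best["label"]]

# Example with made-up probabilities:
print(score_sentiment([[{"label": "positive", "score": 0.7},
                        {"label": "negative", "score": 0.1},
                        {"label": "neutral", "score": 0.2}]]))
# -> roughly (0.6, '+')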