OxbridgeEconomics committed
Commit b2dbbaf · 1 Parent(s): 24d2e54
Files changed (3)
  1. eastmoney.py +8 -10
  2. mof.py +29 -31
  3. ndrc.py +8 -13
eastmoney.py CHANGED
@@ -10,6 +10,10 @@ import os
 from datetime import datetime, timedelta
 from decimal import Decimal
 from transformers import pipeline
+
+AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+
 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 
 translator = Translator()
@@ -50,9 +54,6 @@ def encode(content):
         text += line
     return text
 
-AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
-AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
-
 def get_db_connection():
     """Get dynamoDB connection"""
     dynamodb = boto3.resource(
@@ -90,10 +91,6 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-
-reportList = []
-
-
 today = datetime.today().strftime('%Y-%m-%d')
 beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
 i = 0
@@ -141,7 +138,10 @@ while i > -1:
             report['author'] = translator.translate(report['researcher'], dest='en').text
             report['originalAuthor'] = report['researcher']
             report['originalContent'] = content
-            report['content'] = translator.translate(content, dest='en').text
+            content_eng = ''
+            for element in report['originalContent'].split("。"):
+                content_eng += translator.translate(element, dest='en').text + ' '
+            report['content'] = content_eng
             report['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, report['author'])
             report['publishDate'] = datemodifier(report['publishDate'])
             report['id'] = uuid.uuid5(uuid.NAMESPACE_OID, report['title']+report['publishDate'])
@@ -167,9 +167,7 @@ while i > -1:
                 sentiment_score = sentiment_score + 0
             report['sentimentScore'] = sentiment_score
             report['sentimentLabel'] = label_dict[sentiment_label]
-            print(report)
             upsert_content(report)
-            reportList.append(report)
         except Exception as error:
             print(error)
         else:
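Note: the change above swaps one translate() call over the whole article body for a sentence-by-sentence loop split on the full-width period '。', which keeps each googletrans request short. A minimal sketch of that pattern as a reusable helper, using the same googletrans Translator the scripts already construct; the helper name translate_to_english is hypothetical:

from googletrans import Translator

translator = Translator()

def translate_to_english(text):
    """Hypothetical helper mirroring the commit's inline loop: translate
    Chinese text one sentence at a time to keep each request small."""
    content_eng = ''
    for sentence in text.split("。"):
        if not sentence.strip():
            continue  # the split leaves an empty tail after the final '。'
        content_eng += translator.translate(sentence, dest='en').text + ' '
    return content_eng.strip()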
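The scripts now read AWS credentials from environment variables at import time, so os.environ[...] raises KeyError immediately if either variable is unset, before any crawling starts. A sketch of the connection pattern the diff's get_db_connection implies; the region and table name are not visible in the diff, so both are placeholders:

import os

import boto3

AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']          # KeyError if unset
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']

def get_db_connection():
    """Build a DynamoDB resource from the env-provided credentials."""
    return boto3.resource(
        'dynamodb',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name='us-east-1',  # placeholder: region not shown in the diff
    )

table = get_db_connection().Table('reports')  # placeholder table name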
mof.py CHANGED
@@ -6,6 +6,14 @@ from lxml import etree
 from googletrans import Translator
 from transformers import pipeline
 from PyPDF2 import PdfReader
+from datetime import datetime, timedelta
+from decimal import Decimal
+import boto3
+import os
+
+AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+
 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 
 translator = Translator()
@@ -71,16 +79,6 @@ def extract_from_pdf(url):
         extracted_text += text
     return extracted_text, extracted_text_eng
 
-"""Upload file to dynamoDB"""
-# import datetime
-from datetime import datetime, timedelta
-from decimal import Decimal
-import boto3
-import os
-
-AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
-AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
-
 def get_db_connection():
     """Get dynamoDB connection"""
     dynamodb = boto3.resource(
@@ -100,9 +98,9 @@ def upsert_content(report):
         'id': str(report['id']),
         'site': report['site'],
         'title': report['title'],
-        'originalSite': report['originalSite'],
-        'originalTitle': report['originalTitle'],
-        'originalContent': report['originalContent'],
+        # 'originalSite': report['originalSite'],
+        # 'originalTitle': report['originalTitle'],
+        # 'originalContent': report['originalContent'],
         'category': report['category'],
         # 'author': report['author'],
         'content': report['content'],
@@ -117,7 +115,6 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-reportList = []
 categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"]
 for categoryu_url in categoryu_urls:
     req = urllib.request.urlopen(categoryu_url)
@@ -131,11 +128,10 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=180)):
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
-                    print(url)
                     article = {}
                     url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                     req = urllib.request.urlopen(url)
@@ -143,7 +139,10 @@ for categoryu_url in categoryu_urls:
                     html_text = text.decode("utf-8")
                     page = etree.HTML(html_text)
                     article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    article['content'] = translator.translate(article['originalContent'], dest='en').text
+                    content_eng = ''
+                    for element in article['originalContent'].split("。"):
+                        content_eng += translator.translate(element, dest='en').text + ' '
+                    article['content'] = content_eng
                     article['site'] = "Ministry of Finance"
                     article['originalSite'] = "财政部"
                     article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -174,12 +173,10 @@ for categoryu_url in categoryu_urls:
                         sentiment_score = sentiment_score + 0
                     article['sentimentScore'] = sentiment_score
                     article['sentimentLabel'] = label_dict[sentiment_label]
-                    print(article)
-                    # upsert_content(article)
+                    upsert_content(article)
                 except Exception as error:
                     print(error)
 
-reportList = []
 categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"]
 for categoryu_url in categoryu_urls:
     req = urllib.request.urlopen(categoryu_url)
@@ -193,11 +190,10 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=180)):
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
-                    print(url)
                     article = {}
                     url = url.replace("./", categoryu_url)
                     req = urllib.request.urlopen(url)
@@ -205,7 +201,10 @@ for categoryu_url in categoryu_urls:
                     html_text = text.decode("utf-8")
                     page = etree.HTML(html_text)
                     article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    article['content'] = translator.translate(article['originalContent'], dest='en').text
+                    content_eng = ''
+                    for element in article['originalContent'].split("。"):
+                        content_eng += translator.translate(element, dest='en').text + ' '
+                    article['content'] = content_eng
                     article['site'] = "Ministry of Finance"
                     article['originalSite'] = "财政部"
                     article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -236,12 +235,10 @@ for categoryu_url in categoryu_urls:
                         sentiment_score = sentiment_score + 0
                     article['sentimentScore'] = sentiment_score
                     article['sentimentLabel'] = label_dict[sentiment_label]
-                    print(article)
-                    # upsert_content(article)
+                    upsert_content(article)
                 except Exception as error:
                     print(error)
 
-reportList = []
 categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"]
 for categoryu_url in categoryu_urls:
     req = urllib.request.urlopen(categoryu_url)
@@ -255,11 +252,10 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=180)):
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
-                    print(url)
                    article = {}
                    url = url.replace("./", categoryu_url)
                    req = urllib.request.urlopen(url)
@@ -267,7 +263,10 @@ for categoryu_url in categoryu_urls:
                    html_text = text.decode("utf-8")
                    page = etree.HTML(html_text)
                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                   article['content'] = translator.translate(article['originalContent'], dest='en').text
+                   content_eng = ''
+                   for element in article['originalContent'].split("。"):
+                       content_eng += translator.translate(element, dest='en').text + ' '
+                   article['content'] = content_eng
                    article['site'] = "Ministry of Finance"
                    article['originalSite'] = "财政部"
                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -298,7 +297,6 @@ for categoryu_url in categoryu_urls:
                        sentiment_score = sentiment_score + 0
                    article['sentimentScore'] = sentiment_score
                    article['sentimentLabel'] = label_dict[sentiment_label]
-                   print(article)
-                   # upsert_content(article)
+                   upsert_content(article)
                except Exception as error:
                    print(error)
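Each of the three category loops above widens the recency window from 180 to 183 days (half a year, matching the beginDate window in eastmoney.py). The surrounding date check parses with time.strptime, reformats with time.strftime, then parses again with datetime.strptime; a single strptime call is equivalent. A sketch, with is_recent as a hypothetical name:

from datetime import datetime, timedelta

def is_recent(date_str, fmt="%Y-%m-%d", days=183):
    """Equivalent to the strptime/strftime/strptime round-trip in the diff."""
    return datetime.strptime(date_str, fmt) > datetime.today() - timedelta(days=days)

# mof.py lists dates like '2024-01-31'; ndrc.py uses '2024/01/31', i.e. fmt="%Y/%m/%d".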
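With upsert_content re-enabled for all three Ministry of Finance categories, every crawled article is now written to DynamoDB via table.put_item. One detail worth noting: DynamoDB does not accept Python floats, which is presumably why these scripts import Decimal; numeric attributes such as the sentiment score must be converted before the write. A short illustration with placeholder values:

from decimal import Decimal

item = {
    'id': 'example-id',                      # placeholder values
    'sentimentScore': Decimal(str(0.8731)),  # str() first avoids float artifacts
}
# table.put_item(Item=item) then succeeds; passing the raw float would make
# boto3 raise a TypeError asking for Decimal instead.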
ndrc.py CHANGED
@@ -75,16 +75,14 @@ def extract_from_pdf(url):
     pdf_reader = PdfReader(f)
     num_pages = len(pdf_reader.pages)
     extracted_text = ""
-    extracted_text_eng = ""
     for page in range(num_pages):
         text = pdf_reader.pages[page].extract_text()
         if text and text[0].isdigit():
             text = text[1:]
         first_newline_index = text.find('\n')
         text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:].replace('\n', '')
-        extracted_text_eng += translator.translate(text, dest='en').text
         extracted_text += text
-    return extracted_text, extracted_text_eng
+    return extracted_text
 
 def get_db_connection():
     """Get dynamoDB connection"""
@@ -135,7 +133,7 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=180)):
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
@@ -144,7 +142,6 @@ for categoryu_url in categoryu_urls:
                         url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/")
                     else:
                         url = url.replace("./", categoryu_url)
-                    print(url)
                     req = urllib.request.urlopen(url)
                     text = req.read()
                     html_text = text.decode("utf-8")
@@ -153,9 +150,12 @@ for categoryu_url in categoryu_urls:
                     for attachment_url in attachment_urls:
                         if ".pdf" in attachment_url:
                             pdf_url = url.rsplit('/', 1)[0] + attachment_url.replace('./','/')
-                            pdf_content, extracted_text_eng = extract_from_pdf(pdf_url)
-                            article['content'] = extracted_text_eng
+                            pdf_content = extract_from_pdf(pdf_url)
                             article['originalContent'] = pdf_content
+                            content_eng = ''
+                            for element in article['originalContent'].split("。"):
+                                content_eng += translator.translate(element, dest='en').text + ' '
+                            article['content'] = content_eng
                             article['site'] = "National Development and Reform Commission"
                             article['originalSite'] = "国家发展和改革委员会"
                             article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -186,7 +186,6 @@ for categoryu_url in categoryu_urls:
                         sentiment_score = sentiment_score + 0
                     article['sentimentScore'] = sentiment_score
                     article['sentimentLabel'] = label_dict[sentiment_label]
-                    print(article)
                     upsert_content(article)
                 except Exception as error:
                     print(error)
@@ -204,13 +203,12 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=180)):
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
                     article = {}
                     if "https://www.gov.cn" in url:
-                        print(url)
                         req = urllib.request.urlopen(url)
                         text = req.read()
                         html_text = text.decode("utf-8")
@@ -220,7 +218,6 @@ for categoryu_url in categoryu_urls:
                         for element in article['originalContent'].split("。"):
                             content_eng += translator.translate(element, dest='en').text + ' '
                         article['content'] = content_eng
-                        print(article['content'])
                        article['site'] = "State Council"
                        article['originalSite'] = "国务院"
                        article['originalTitle'] = page.xpath("//title/text()")[0]
@@ -240,7 +237,6 @@ for categoryu_url in categoryu_urls:
                        for element in article['originalContent'].split("。"):
                            content_eng += translator.translate(element, dest='en').text + ' '
                        article['content'] = content_eng
-                       print(article['content'])
                        article['site'] = "National Development and Reform Commission"
                        article['originalSite'] = "国家发展和改革委员会"
                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -271,7 +267,6 @@ for categoryu_url in categoryu_urls:
                        sentiment_score = sentiment_score + 0
                    article['sentimentScore'] = sentiment_score
                    article['sentimentLabel'] = label_dict[sentiment_label]
-                   print(article)
                    upsert_content(article)
                except Exception as error:
                    print(error)
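After this commit, extract_from_pdf returns only the original text, and translation happens at the call site through the same per-sentence loop used for HTML articles, so PDF and HTML content follow one code path. A condensed sketch of the resulting shape (the digit- and newline-cleanup from the real function is elided, and the file path is a placeholder):

from PyPDF2 import PdfReader
from googletrans import Translator

translator = Translator()

def extract_from_pdf(path):
    """Condensed mirror of the post-commit function: extraction only."""
    reader = PdfReader(path)
    return "".join(page.extract_text() or "" for page in reader.pages)

pdf_content = extract_from_pdf("attachment.pdf")  # placeholder path
content_eng = ''
for element in pdf_content.split("。"):
    content_eng += translator.translate(element, dest='en').text + ' '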