OxbridgeEconomics committed on
Commit
71294ad
·
unverified ·
1 Parent(s): a6d7194

Update ndrc.py

Browse files
Files changed (1) hide show
  1. ndrc.py +48 -24
ndrc.py CHANGED
@@ -18,6 +18,14 @@ def datemodifier(date_string):
18
  except:
19
  return False
20
 
 
 
 
 
 
 
 
 
21
  def fetch_url(url):
22
  response = requests.get(url)
23
  if response.status_code == 200:
@@ -40,7 +48,7 @@ def encode(content):
40
  subpage = etree.HTML(subelement)
41
  tree = subpage.xpath('//text()')
42
  line = ''.join(translist(tree)).\
43
- replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
44
  else:
45
  line = element
46
  text += line
@@ -206,29 +214,46 @@ for categoryu_url in categoryu_urls:
206
  urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
207
  for url in urls:
208
  try:
209
- print(url)
210
  article = {}
211
- url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
212
- url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
213
- print(url)
214
- req = urllib.request.urlopen(url)
215
- text = req.read()
216
- html_text = text.decode("utf-8")
217
- page = etree.HTML(html_text)
218
- article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
219
- split_text = article['originalContent'].split("。")
220
- half_length = len(split_text) // 2
221
- part1 = "。".join(split_text[:half_length])
222
- part2 = "。".join(split_text[half_length:])
223
- article['content'] = translator.translate(part1, dest='en').text + translator.translate(part2, dest='en').text
224
- print(len(article['originalContent']),article['content'])
225
- article['site'] = "National Development and Reform Commission"
226
- article['originalSite'] = "国家发展和改革委员会"
227
- article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
228
- article['title'] = translator.translate(article['originalTitle'], dest='en').text
229
- article['url'] = url
230
- article['category']= "Policy Interpretation"
231
- article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
233
  label_dict = {
234
  "positive": "+",
@@ -256,4 +281,3 @@ for categoryu_url in categoryu_urls:
256
  upsert_content(article)
257
  except Exception as error:
258
  print(error)
259
-
 
18
  except:
19
  return False
20
 
21
def datemodifier_gov(date_string):
    """Reduce a ``YYYY-MM-DD-HH:MM:SS`` timestamp to its ``YYYY-MM-DD`` date.

    Args:
        date_string: Timestamp text laid out as ``%Y-%m-%d-%H:%M:%S``.

    Returns:
        The date formatted as ``%Y-%m-%d``, or ``False`` when ``date_string``
        is not a string or does not match the expected layout.
    """
    try:
        to_date = time.strptime(date_string, "%Y-%m-%d-%H:%M:%S")
        return time.strftime("%Y-%m-%d", to_date)
    except (ValueError, TypeError):
        # Keep the "False on failure" contract, but only for parse errors:
        # a bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return False
28
+
29
  def fetch_url(url):
30
  response = requests.get(url)
31
  if response.status_code == 200:
 
48
  subpage = etree.HTML(subelement)
49
  tree = subpage.xpath('//text()')
50
  line = ''.join(translist(tree)).\
51
+ replace('\n','').replace('\t','').replace('\r','').replace(' ','').replace('\u3000',' ').replace('\xa0','').strip()
52
  else:
53
  line = element
54
  text += line
 
214
  urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
215
  for url in urls:
216
  try:
 
217
  article = {}
218
+ if "https://www.gov.cn" in url:
219
+ print(url)
220
+ req = urllib.request.urlopen(url)
221
+ text = req.read()
222
+ html_text = text.decode("utf-8")
223
+ page = etree.HTML(html_text)
224
+ article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
225
+ content_eng = ''
226
+ for element in article['originalContent'].split("。"):
227
+ content_eng += translator.translate(element, dest='en').text + ' '
228
+ article['content'] = content_eng
229
+ print(article['content'])
230
+ article['site'] = "State Council"
231
+ article['originalSite'] = "国务院"
232
+ article['originalTitle'] = page.xpath("//title/text()")[0]
233
+ article['title'] = translator.translate(article['originalTitle'], dest='en').text
234
+ article['url'] = url
235
+ article['category']= "Policy Release"
236
+ article['publishDate'] = datemodifier_gov(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0])
237
+ else:
238
+ url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
239
+ url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
240
+ req = urllib.request.urlopen(url)
241
+ text = req.read()
242
+ html_text = text.decode("utf-8")
243
+ page = etree.HTML(html_text)
244
+ article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
245
+ content_eng = ''
246
+ for element in article['originalContent'].split("。"):
247
+ content_eng += translator.translate(element, dest='en').text + ' '
248
+ article['content'] = content_eng
249
+ print(article['content'])
250
+ article['site'] = "National Development and Reform Commission"
251
+ article['originalSite'] = "国家发展和改革委员会"
252
+ article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
253
+ article['title'] = translator.translate(article['originalTitle'], dest='en').text
254
+ article['url'] = url
255
+ article['category']= "Policy Interpretation"
256
+ article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
257
  article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
258
  label_dict = {
259
  "positive": "+",
 
281
  upsert_content(article)
282
  except Exception as error:
283
  print(error)