OxbridgeEconomics
committed on
Update ndrc.py
Browse files
ndrc.py
CHANGED
@@ -18,6 +18,14 @@ def datemodifier(date_string):
|
|
18 |
except:
|
19 |
return False
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
def fetch_url(url):
|
22 |
response = requests.get(url)
|
23 |
if response.status_code == 200:
|
@@ -40,7 +48,7 @@ def encode(content):
|
|
40 |
subpage = etree.HTML(subelement)
|
41 |
tree = subpage.xpath('//text()')
|
42 |
line = ''.join(translist(tree)).\
|
43 |
-
replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
|
44 |
else:
|
45 |
line = element
|
46 |
text += line
|
@@ -206,29 +214,46 @@ for categoryu_url in categoryu_urls:
|
|
206 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
207 |
for url in urls:
|
208 |
try:
|
209 |
-
print(url)
|
210 |
article = {}
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
|
233 |
label_dict = {
|
234 |
"positive": "+",
|
@@ -256,4 +281,3 @@ for categoryu_url in categoryu_urls:
|
|
256 |
upsert_content(article)
|
257 |
except Exception as error:
|
258 |
print(error)
|
259 |
-
|
|
|
18 |
except:
|
19 |
return False
|
20 |
|
21 |
+
def datemodifier_gov(date_string):
    """Reformat a ``%Y-%m-%d-%H:%M:%S`` timestamp as a ``YYYY-MM-DD`` date.

    Args:
        date_string: Timestamp text such as ``"2023-06-15-08:30:00"``.

    Returns:
        The date portion formatted as ``YYYY-MM-DD``, or ``False`` when
        ``date_string`` is not a string in the expected format.
    """
    try:
        to_date = time.strptime(date_string, "%Y-%m-%d-%H:%M:%S")
        return time.strftime("%Y-%m-%d", to_date)
    except (TypeError, ValueError):
        # ValueError: text does not match the format; TypeError: not a string.
        # Only parse failures map to False, so interrupts and programming
        # errors are no longer silently swallowed by a bare ``except``.
        return False
|
28 |
+
|
29 |
def fetch_url(url):
|
30 |
response = requests.get(url)
|
31 |
if response.status_code == 200:
|
|
|
48 |
subpage = etree.HTML(subelement)
|
49 |
tree = subpage.xpath('//text()')
|
50 |
line = ''.join(translist(tree)).\
|
51 |
+
replace('\n','').replace('\t','').replace('\r','').replace(' ','').replace('\u3000',' ').replace('\xa0','').strip()
|
52 |
else:
|
53 |
line = element
|
54 |
text += line
|
|
|
214 |
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
|
215 |
for url in urls:
|
216 |
try:
|
|
|
217 |
article = {}
|
218 |
+
if "https://www.gov.cn" in url:
|
219 |
+
print(url)
|
220 |
+
req = urllib.request.urlopen(url)
|
221 |
+
text = req.read()
|
222 |
+
html_text = text.decode("utf-8")
|
223 |
+
page = etree.HTML(html_text)
|
224 |
+
article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
|
225 |
+
content_eng = ''
|
226 |
+
for element in article['originalContent'].split("。"):
|
227 |
+
content_eng += translator.translate(element, dest='en').text + ' '
|
228 |
+
article['content'] = content_eng
|
229 |
+
print(article['content'])
|
230 |
+
article['site'] = "State Council"
|
231 |
+
article['originalSite'] = "国务院"
|
232 |
+
article['originalTitle'] = page.xpath("//title/text()")[0]
|
233 |
+
article['title'] = translator.translate(article['originalTitle'], dest='en').text
|
234 |
+
article['url'] = url
|
235 |
+
article['category']= "Policy Release"
|
236 |
+
article['publishDate'] = datemodifier_gov(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0])
|
237 |
+
else:
|
238 |
+
url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
|
239 |
+
url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
|
240 |
+
req = urllib.request.urlopen(url)
|
241 |
+
text = req.read()
|
242 |
+
html_text = text.decode("utf-8")
|
243 |
+
page = etree.HTML(html_text)
|
244 |
+
article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
|
245 |
+
content_eng = ''
|
246 |
+
for element in article['originalContent'].split("。"):
|
247 |
+
content_eng += translator.translate(element, dest='en').text + ' '
|
248 |
+
article['content'] = content_eng
|
249 |
+
print(article['content'])
|
250 |
+
article['site'] = "National Development and Reform Commission"
|
251 |
+
article['originalSite'] = "国家发展和改革委员会"
|
252 |
+
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
253 |
+
article['title'] = translator.translate(article['originalTitle'], dest='en').text
|
254 |
+
article['url'] = url
|
255 |
+
article['category']= "Policy Interpretation"
|
256 |
+
article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
|
257 |
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
|
258 |
label_dict = {
|
259 |
"positive": "+",
|
|
|
281 |
upsert_content(article)
|
282 |
except Exception as error:
|
283 |
print(error)
|
|