OxbridgeEconomics
Update ndrc.py
95b49f6 unverified
raw
history blame
6.19 kB
from datetime import datetime, timedelta
import uuid
import time
import urllib.request
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
# Crawl the NDRC listing pages under /xxgk/jd/jd/, newest page first, and
# upsert every linked article whose listing date falls inside the look-back
# window.  Paging stops after the first listing page that contains an entry
# older than the window.

# Listing entries dated more than this many days before "now" end the crawl.
LOOKBACK_DAYS = 183

# Directory of the listing pages; relative NDRC links are rewritten against it.
LISTING_BASE = "https://www.ndrc.gov.cn/xxgk/jd/jd/"

# Per-source extraction profiles: the XPath expressions, labels and date
# format that differ between the three kinds of article pages linked from the
# listing.  Everything else about processing an article is shared.
_STATE_COUNCIL = {
    "content_xpath": "//div[contains(@id, 'UCAP-CONTENT')]//p",
    "site": "State Council",
    "originalSite": "国务院",
    "title_xpath": "//title/text()",
    "category": "Policy Release",
    "date_xpath": "//meta[@name = 'firstpublishedtime']/@content",
    "date_format": "%Y-%m-%d-%H:%M:%S",
}
_NDRC_RELEASE = {
    "content_xpath": "//div[contains(@class, 'TRS_Editor') or contains(@class, 'article_l')]",
    "site": "National Development and Reform Commission",
    "originalSite": "国家发展和改革委员会",
    "title_xpath": "//meta[@name = 'ArticleTitle']/@content",
    "category": "Policy Release",
    "date_xpath": "//meta[@name = 'PubDate']/@content",
    "date_format": "%Y-%m-%d %H:%M:%S",
}
_NDRC_INTERPRETATION = {
    "content_xpath": "//div[contains(@class, 'TRS_Editor')]//p",
    "site": "National Development and Reform Commission",
    "originalSite": "国家发展和改革委员会",
    "title_xpath": "//meta[@name = 'ArticleTitle']/@content",
    "category": "Policy Interpretation",
    "date_xpath": "//meta[@name = 'PubDate']/@content",
    "date_format": "%Y-%m-%d %H:%M:%S",
}


def _fetch_page(url):
    """Download *url*, decode it as UTF-8 and return the parsed lxml HTML tree.

    The HTTP response is closed as soon as the body has been read.
    """
    with urllib.request.urlopen(url) as response:
        return etree.HTML(response.read().decode("utf-8"))


def _resolve_source(url):
    """Return ``(absolute_url, profile)`` for a link found on a listing page."""
    if "https://www.gov.cn" in url:
        return url, _STATE_COUNCIL
    if "../../zcfb/" in url:
        return (
            url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/"),
            _NDRC_RELEASE,
        )
    # NOTE(review): "../../" is mapped onto the listing directory itself rather
    # than two levels above it; kept as in the original -- confirm this is the
    # intended target for links outside zcfb/.
    url = url.replace("../../", LISTING_BASE)
    url = url.replace("./", LISTING_BASE)
    return url, _NDRC_INTERPRETATION


def _translate_sentences(original_content):
    """Translate *original_content* piece by piece, split on the "。" full stop.

    Each translated piece is followed by a single space.
    """
    return "".join(
        translate(sentence) + ' ' for sentence in original_content.split("。")
    )


def _build_article(url, profile):
    """Fetch the article at *url* and return its record, or ``None`` to skip.

    The article is skipped when the encoded content is shorter than 10
    characters.  A missing title or publish-date node raises ``IndexError``,
    which the caller handles together with any network error.
    """
    page = _fetch_page(url)
    record = {}
    record['originalContent'] = encode(page.xpath(profile["content_xpath"]))
    if len(record['originalContent']) < 10:
        return None
    record['content'] = _translate_sentences(record['originalContent'])
    record['site'] = profile["site"]
    record['originalSite'] = profile["originalSite"]
    record['originalTitle'] = page.xpath(profile["title_xpath"])[0]
    record['title'] = translate(record['originalTitle'])
    record['url'] = url
    record['category'] = profile["category"]
    record['publishDate'] = datemodifier(
        page.xpath(profile["date_xpath"])[0], profile["date_format"]
    )
    # Deterministic id: the same title + publish date always yields the same UUID.
    record['id'] = uuid.uuid5(
        uuid.NAMESPACE_OID, record['title'] + record['publishDate']
    )
    record['sentimentScore'], record['sentimentLabel'] = sentiment_computation(
        record['content']
    )
    return record


# Computed once so every listing entry is compared against the same cutoff.
_cutoff = datetime.today() - timedelta(days=LOOKBACK_DAYS)

i = 0
while i > -1:
    # The first listing page is index.html; later pages are index_<n>.html.
    if i == 0:
        CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
    else:
        CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
    i = i + 1
    listing = _fetch_page(CATEGORY_URL)
    entries = listing.xpath(
        "//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]"
    )
    for entry in entries:
        if not isinstance(entry, etree._Element):
            continue
        # Relative XPath keeps the query scoped to this <li> without having to
        # serialise and re-parse the element.
        listed_on = datetime.strptime(entry.xpath(".//span/text()")[0], "%Y/%m/%d")
        if listed_on < _cutoff:
            # Too old: finish the current page, then stop paging.
            i = -1
            continue
        for url in entry.xpath(".//a[contains(@target, '_blank')]/@href"):
            # Best effort per article: a failure is reported and the crawl
            # moves on to the next link.
            try:
                print(url)
                resolved_url, profile = _resolve_source(url)
                if profile is not _STATE_COUNCIL:
                    print(resolved_url)
                record = _build_article(resolved_url, profile)
                if record is None:
                    continue
                upsert_content(record)
            except Exception as error:
                print(error)