from datetime import datetime, timedelta
import urllib.request
from lxml import etree
from utils import crawl
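
# Crawl policy articles from gov.cn category listing pages. Each category is
# paginated (home.htm, home_1.htm, ...) and is walked until an article older
# than 183 days (~six months) is found; every qualifying link is handed to
# the crawl helper from utils, which fetches, translates, and upserts it.

# The commented block below appears to be an earlier in-file version of
# crawl, kept for reference; the live implementation is imported from utils.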
# with open('xpath.json', 'r', encoding='UTF-8') as f:
#     xpath_dict = json.load(f)

# def crawl(url, article):
#     domain = urlparse(url).netloc
#     req = urllib.request.urlopen(url)
#     text = req.read()
#     html_text = text.decode("utf-8")
#     page = etree.HTML(html_text)
#     originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
#     article['originTitle'] = encode(page.xpath(xpath_dict[domain]['title']))
#     article['title'] = translate(article['originTitle'])
#     article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
#     article['originContent'] = repr(originContent)
#     if len(article['originContent']) < 10:
#         return None
#     CONTENT_ENG = ''
#     for element in originContent.split("\n"):
#         CONTENT_ENG += translate(element) + '\n'
#     article['content'] = repr(CONTENT_ENG)
#     article['subtitle'] = translate(summary)
#     article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime']['format_string'])
#     article['link'] = url
#     article['attachment'] = ""
#     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n", ""))
#     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
#     upsert_content(article)
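
# Category 1: Policy Interpretation (/zhengce/jiedu/).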
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    for item in articlelist:
        if isinstance(item, etree._Element):
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Stop paginating once the listing reaches articles older
                # than the 183-day window.
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        # Relative links start with '../'; rewrite them to
                        # absolute URLs under /zhengce/.
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            article['category'] = "Policy Interpretation"
                            crawl(url, article)
                    except Exception as error:
                        print(error)
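
# Category 2: Policy Release (/zhengce/zuixin/), stamped with the source site.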
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    for item in articlelist:
        if isinstance(item, etree._Element):
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Same cutoff as above: stop once articles fall outside 183 days.
                i = -1
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
                        if "https://www.gov.cn" in url:
                            article['category'] = "Policy Release"
                            article['originSite'] = "国务院"
                            article['site'] = "State Council of China"
                            crawl(url, article)
                    except Exception as error:
                        print(error)
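
# The two loops above differ only in the category path and the metadata
# stamped onto each article. A possible consolidation is sketched below;
# it is not called anywhere, it assumes crawl keeps the signature imported
# from utils, and the helper name crawl_category is hypothetical.
def crawl_category(category_path, category_label, extra_fields=None):
    """Paginate one gov.cn category until articles fall outside 183 days."""
    page_index = 0
    while page_index > -1:
        if page_index == 0:
            category_url = f"https://www.gov.cn/zhengce/{category_path}/home.htm"
        else:
            category_url = f"https://www.gov.cn/zhengce/{category_path}/home_{page_index}.htm"
        page_index += 1
        listing = etree.HTML(urllib.request.urlopen(category_url).read().decode("utf-8"))
        for element in listing.xpath("//div[contains(@class, 'news_box')]//h4"):
            if not isinstance(element, etree._Element):
                continue
            subpage = etree.HTML(etree.tostring(element).decode())
            date = subpage.xpath("//span/text()")[0]
            if datetime.strptime(date, "%Y-%m-%d") < (datetime.today() - timedelta(days=183)):
                page_index = -1  # stop paginating past the 183-day window
                continue
            for url in subpage.xpath("//a[contains(@target, '_blank')]/@href"):
                try:
                    entry = dict(extra_fields or {}, category=category_label)
                    url = url.replace('../', 'https://www.gov.cn/zhengce/')
                    if "https://www.gov.cn" in url:
                        crawl(url, entry)
                except Exception as error:
                    print(error)

# Usage would then reduce to:
# crawl_category("jiedu", "Policy Interpretation")
# crawl_category("zuixin", "Policy Release",
#                {'originSite': "国务院", 'site': "State Council of China"})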