import uuid
import urllib.request
from datetime import datetime, timedelta

from lxml import etree

from utils import encode, translate, datemodifier, sentiment_computation, upsert_content

# Crawl the "Financial News" category. Page 0 lives at the category root;
# page N is index_N.htm. Setting i = -1 inside the loop stops pagination
# once listings fall outside the 183-day window.
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
    else:
        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # Listing is older than the window; stop paginating.
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        # Rewrite relative links to absolute URLs; "../" must be
                        # handled before "./" since it contains that substring.
                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                        req = urllib.request.urlopen(url)
                        text = req.read()
                        html_text = text.decode("utf-8")
                        page = etree.HTML(html_text)
                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
                        if len(article['originalContent']) < 10:
                            continue  # Skip near-empty pages.
                        # Translate sentence by sentence, splitting on the Chinese full stop.
                        CONTENT_ENG = ''
                        for element in article['originalContent'].split("。"):
                            CONTENT_ENG += translate(element) + ' '
                        article['content'] = CONTENT_ENG
                        article['site'] = "Ministry of Finance"
                        article['originalSite'] = "财政部"
                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
                        article['title'] = translate(article['originalTitle'])
                        article['url'] = url
                        article['category'] = "Financial News"
                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
                        # Deterministic id so repeated crawls upsert rather than duplicate.
                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                        upsert_content(article)
                    except Exception as error:
                        print(error)
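# ---------------------------------------------------------------------------
# The crawl above and the "Policy Interpretation" crawl below repeat the same
# pattern: walk the paginated index, stop at the 183-day cutoff, then fetch,
# translate, score, and upsert each article. A minimal consolidation sketch
# follows, assuming the utils helpers keep their current signatures.
# crawl_category, process_article, and link_prefixes are hypothetical names
# introduced here for illustration, not part of the original script.
# ---------------------------------------------------------------------------
def process_article(url, category):
    """Fetch one article page, translate it, score it, and upsert the record."""
    page = etree.HTML(urllib.request.urlopen(url).read().decode("utf-8"))
    article = {'url': url, 'category': category,
               'site': "Ministry of Finance", 'originalSite': "财政部"}
    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
    if len(article['originalContent']) < 10:
        return  # Skip near-empty pages.
    # Translate sentence by sentence, splitting on the Chinese full stop.
    article['content'] = ' '.join(translate(s) for s in article['originalContent'].split("。"))
    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
    article['title'] = translate(article['originalTitle'])
    article['publishDate'] = datemodifier(
        page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
    upsert_content(article)

def crawl_category(base_url, category, link_prefixes):
    """Walk one paginated MOF category until listings are older than 183 days."""
    i = 0
    while i > -1:
        category_url = base_url if i == 0 else f"{base_url}index_{i}.htm"
        i += 1
        page = etree.HTML(urllib.request.urlopen(category_url).read().decode("utf-8"))
        for entry in page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]"):
            subpage = etree.HTML(etree.tostring(entry).decode())
            date = subpage.xpath("//span/text()")[0]
            if datetime.strptime(date, "%Y-%m-%d") < datetime.today() - timedelta(days=183):
                i = -1  # Sentinel: stop paginating once entries age out.
                continue
            for url in subpage.xpath("//a[contains(@target, '_blank')]/@href"):
                try:
                    # Insertion order matters: "../" must be rewritten before "./".
                    for prefix, absolute in link_prefixes.items():
                        url = url.replace(prefix, absolute)
                    process_article(url, category)
                except Exception as error:
                    print(error)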
# Crawl the "Policy Interpretation" category with the same pagination and
# cutoff scheme.
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
    else:
        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
    i = i + 1
    req = urllib.request.urlopen(CATEGORY_URL)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for article in articlelist:
        if isinstance(article, etree._Element):
            subelement = etree.tostring(article).decode()
            subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                i = -1  # Listing is older than the window; stop paginating.
            else:
                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                for url in urls:
                    try:
                        article = {}
                        # Resolve "./" against the category root rather than
                        # CATEGORY_URL: on later pages CATEGORY_URL ends in
                        # index_N.htm and would produce broken article links.
                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/")
                        req = urllib.request.urlopen(url)
                        text = req.read()
                        html_text = text.decode("utf-8")
                        page = etree.HTML(html_text)
                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
                        if len(article['originalContent']) < 10:
                            continue  # Skip near-empty pages.
                        # Translate sentence by sentence, splitting on the Chinese full stop.
                        CONTENT_ENG = ''
                        for element in article['originalContent'].split("。"):
                            CONTENT_ENG += translate(element) + ' '
                        article['content'] = CONTENT_ENG
                        article['site'] = "Ministry of Finance"
                        article['originalSite'] = "财政部"
                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
                        article['title'] = translate(article['originalTitle'])
                        article['url'] = url
                        article['category'] = "Policy Interpretation"
                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                        upsert_content(article)
                    except Exception as error:
                        print(error)

# Legacy "Policy Release" crawl, kept for reference. It predates the utils
# helpers: translation went through a googletrans-style `translator` and
# sentiment through a bare `analyzer` pipeline, neither of which is defined
# in this module.
# categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"]
# for categoryu_url in categoryu_urls:
#     req = urllib.request.urlopen(categoryu_url)
#     text = req.read()
#     html_text = text.decode("utf-8")
#     page = etree.HTML(html_text)
#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
#     for article in articlelist:
#         if isinstance(article, etree._Element):
#             subelement = etree.tostring(article).decode()
#             subpage = etree.HTML(subelement)
#             date = subpage.xpath("//span/text()")[0]
#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
#             if parsed_datetime > (datetime.today() - timedelta(days=183)):
#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
#                 for url in urls:
#                     try:
#                         article = {}
#                         url = url.replace("./", categoryu_url)
#                         req = urllib.request.urlopen(url)
#                         text = req.read()
#                         html_text = text.decode("utf-8")
#                         page = etree.HTML(html_text)
#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
#                         content_eng = ''
#                         for element in article['originalContent'].split("。"):
#                             content_eng += translator.translate(element, dest='en').text + ' '
#                         article['content'] = content_eng
#                         article['site'] = "Ministry of Finance"
#                         article['originalSite'] = "财政部"
#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
#                         article['title'] = translator.translate(article['originalTitle'], dest='en').text
#                         article['url'] = url
#                         article['category'] = "Policy Release"
#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
#                         label_dict = {
#                             "positive": "+",
#                             "negative": "-",
#                             "neutral": "0",
#                         }
#                         sentiment_score = 0
#                         maximum_value = 0
#                         raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
#                         sentiment_label = None
#                         for sentiment_dict in raw_sentiment[0]:
#                             value = sentiment_dict["score"]
#                             if value > maximum_value:
#                                 sentiment_label = sentiment_dict["label"]
#                                 maximum_value = value
#                             if sentiment_dict["label"] == "positive":
#                                 sentiment_score = sentiment_score + value
#                             if sentiment_dict["label"] == "negative":
#                                 sentiment_score = sentiment_score - value
#                             else:
#                                 sentiment_score = sentiment_score + 0
#                         article['sentimentScore'] = sentiment_score
#                         article['sentimentLabel'] = label_dict[sentiment_label]
#                         upsert_content(article)
#                     except Exception as error:
#                         print(error)
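# ---------------------------------------------------------------------------
# Usage sketch for the consolidation above: each category, including the
# legacy "Policy Release" crawl kept in comments, reduces to one call. This
# is illustrative only and assumes the hypothetical crawl_category defined
# earlier; it is left commented out so the script's behavior is unchanged.
# crawl_category("https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/",
#                "Financial News",
#                {"../": "https://www.mof.gov.cn/zhengwuxinxi/",
#                 "./": "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"})
# crawl_category("https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/",
#                "Policy Interpretation",
#                {"./": "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"})
# crawl_category("https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/",
#                "Policy Release",
#                {"./": "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"})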