|
import time
import urllib.request
import uuid

import requests
from googletrans import Translator
from lxml import etree
from PyPDF2 import PdfReader
from transformers import pipeline
|
# FinBERT scores financial text as positive/negative/neutral;
# googletrans handles the Chinese-to-English translation.
analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
translator = Translator()
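# For one input string, analyzer(text, return_all_scores=True) returns a
# one-element list whose entry is a list of {"label": ..., "score": ...} dicts
# covering all three labels; the crawl code below indexes [0] for that reason.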
|
|
|
def datemodifier(date_string):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp to 'YYYY-MM-DD', or return False."""
    try:
        to_date = time.strptime(date_string, "%Y-%m-%d %H:%M:%S")
        return time.strftime("%Y-%m-%d", to_date)
    except (ValueError, TypeError):
        return False
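# e.g. datemodifier("2024-03-15 09:30:00") returns "2024-03-15" (illustrative
# input); any string that does not match the expected format returns False.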
|
|
|
def fetch_url(url):
    """Return the body of url, or None on a non-200 response."""
    response = requests.get(url, timeout=30)  # timeout so a stalled connection can't hang the crawl
    if response.status_code == 200:
        return response.text
    return None
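# e.g. html = fetch_url("https://www.mof.gov.cn/") yields the page HTML or
# None. (This helper is defined but unused below, where the crawl loops call
# urllib.request directly.)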
|
|
|
def translist(infolist):
    """Strip each entry and drop the empty strings that remain."""
    return [s for s in (i.strip() for i in infolist) if s]
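# e.g. translist(["  新闻 ", "", " \n"]) returns ["新闻"].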
|
|
|
def encode(content):
    """Concatenate the visible text of a list of lxml elements into one string."""
    text = ''
    for element in content:
        if isinstance(element, etree._Element):
            subelement = etree.tostring(element).decode()
            subpage = etree.HTML(subelement)
            tree = subpage.xpath('//text()')
            # Chinese copy carries no meaningful word spacing, so spaces and
            # line breaks are stripped outright.
            line = (''.join(translist(tree))
                    .replace('\n', '').replace('\t', '')
                    .replace('\r', '').replace(' ', '').strip())
        else:
            line = element
        text += line
    return text
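# e.g. for two <p> nodes whose text is "财政 " and "新闻", encode returns
# "财政新闻": whitespace inside each element is removed before joining.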
|
|
|
def extract_from_pdf(url):
    """Download the PDF at url; return (original text, English translation)."""
    response = requests.get(url, timeout=30)
    pdf_content = response.content

    # Save the download so PyPDF2 can read it back from disk.
    with open("downloaded_file.pdf", "wb") as f:
        f.write(pdf_content)

    with open("downloaded_file.pdf", "rb") as f:
        pdf_reader = PdfReader(f)
        extracted_text = ""
        extracted_text_eng = ""
        for page in pdf_reader.pages:
            text = page.extract_text()
            if not text:
                continue  # skip pages with no extractable text
            # Drop a leading page number if present.
            if text[0].isdigit():
                text = text[1:]
            # Keep the first (title) line intact, then strip line breaks from the body.
            first_newline_index = text.find('\n')
            text = (text[:first_newline_index + 1].replace('\n', ' ')
                    + text[first_newline_index + 1:].replace('\n', ''))
            extracted_text_eng += translator.translate(text, dest='en').text
            extracted_text += text
    return extracted_text, extracted_text_eng
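# A usage sketch (the URL is a placeholder, not one fetched by this script):
#
#   original, english = extract_from_pdf("https://www.mof.gov.cn/sample.pdf")
#   print(english[:200])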
|
|
|
"""Upload file to dynamoDB""" |
|
|
|
from datetime import datetime, timedelta |
|
from decimal import Decimal |
|
import boto3 |
|
|
|
AWS_ACCESS_KEY_ID = "AKIAQFXZMGHQYXKWUDWR" |
|
AWS_SECRET_ACCESS_KEY = "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" |
|
|
|
print(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) |
|
|
|
def get_db_connection():
    """Get a DynamoDB resource handle."""
    return boto3.resource(
        service_name='dynamodb',
        region_name='us-east-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
|
|
|
def upsert_content(report):
    """Upsert one content record into the article_test table."""
    dynamodb = get_db_connection()
    table = dynamodb.Table('article_test')
    item = {
        'id': str(report['id']),
        'site': report['site'],
        'title': report['title'],
        'originalSite': report['originalSite'],
        'originalTitle': report['originalTitle'],
        'originalContent': report['originalContent'],
        'category': report['category'],
        'content': report['content'],
        'publishDate': report['publishDate'],
        'link': report['url'],
        'sentimentScore': str(Decimal(report['sentimentScore']).quantize(Decimal('0.01'))),
        'sentimentLabel': report['sentimentLabel'],
        'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
    }
    response = table.put_item(Item=item)
    print(response)
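# A minimal direct-call sketch (every field value here is a placeholder):
#
#   upsert_content({
#       'id': uuid.uuid4(), 'site': "Ministry of Finance", 'title': "Example",
#       'originalSite': "财政部", 'originalTitle': "示例", 'originalContent': "…",
#       'content': "…", 'category': "Finance News", 'publishDate': "2024-01-01",
#       'url': "https://www.mof.gov.cn/example", 'sentimentScore': 0.12,
#       'sentimentLabel': "+",
#   })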
|
|
|
label_dict = {
    "positive": "+",
    "negative": "-",
    "neutral": "0",
}


def crawl_category(category_url, category_name):
    """Crawl one MOF listing page; return records for articles from the last 180 days."""
    reports = []
    req = urllib.request.urlopen(category_url)
    html_text = req.read().decode("utf-8")
    page = etree.HTML(html_text)
    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        subpage = etree.HTML(etree.tostring(item).decode())
        date = subpage.xpath("//span/text()")[0]
        parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
        if parsed_datetime <= datetime.today() - timedelta(days=180):
            continue
        for url in subpage.xpath("//a[contains(@target, '_blank')]/@href"):
            try:
                print(url)
                url = url.replace("./", category_url)  # resolve relative links
                req = urllib.request.urlopen(url)
                page = etree.HTML(req.read().decode("utf-8"))
                article = {}
                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
                article['content'] = translator.translate(article['originalContent'], dest='en').text
                article['site'] = "Ministry of Finance"
                article['originalSite'] = "财政部"
                article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
                article['title'] = translator.translate(article['originalTitle'], dest='en').text
                article['url'] = url
                article['category'] = category_name
                article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
                # Deterministic id: re-crawling the same article upserts rather than duplicates.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
                # FinBERT yields one score per label; keep the top label and a
                # signed net score (positive probability minus negative probability).
                raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
                sentiment_score = 0
                maximum_value = 0
                sentiment_label = None
                for sentiment_dict in raw_sentiment[0]:
                    value = sentiment_dict["score"]
                    if value > maximum_value:
                        sentiment_label = sentiment_dict["label"]
                        maximum_value = value
                    if sentiment_dict["label"] == "positive":
                        sentiment_score += value
                    elif sentiment_dict["label"] == "negative":
                        sentiment_score -= value
                article['sentimentScore'] = sentiment_score
                article['sentimentLabel'] = label_dict[sentiment_label]
                print(article)
                reports.append(article)
                upsert_content(article)  # persist the record to DynamoDB
            except Exception as error:
                print(error)
    return reports


# The three listing pages share one layout, so a single parameterized crawl covers them.
reportList = []
for category_url, category_name in [
    ("https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/", "Finance News"),
    ("https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/", "Policy Release"),
    ("https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/", "Policy Interpretation"),
]:
    reportList += crawl_category(category_url, category_name)