OxbridgeEconomics
committed on
Update pbc.py
Browse files
pbc.py
CHANGED
@@ -84,8 +84,6 @@ import boto3
|
|
84 |
AWS_ACCESS_KEY_ID = "AKIAQFXZMGHQYXKWUDWR"
|
85 |
AWS_SECRET_ACCESS_KEY = "D2A0IEVl5g3Ljbu0Y5iq9WuFETpDeoEpl69C+6xo"
|
86 |
|
87 |
-
print(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
|
88 |
-
|
89 |
def get_db_connection():
|
90 |
"""Get dynamoDB connection"""
|
91 |
dynamodb = boto3.resource(
|
@@ -105,9 +103,9 @@ def upsert_content(report):
|
|
105 |
'id': str(report['id']),
|
106 |
'site': report['site'],
|
107 |
'title': report['title'],
|
108 |
-
'originalSite': report['originalSite'],
|
109 |
-
'originalTitle': report['originalTitle'],
|
110 |
-
'originalContent': report['originalContent'],
|
111 |
'category': report['category'],
|
112 |
# 'author': report['author'],
|
113 |
'content': report['content'],
|
@@ -132,7 +130,6 @@ for categoryu_url in categoryu_urls:
|
|
132 |
for url in urls:
|
133 |
try:
|
134 |
url = "http://www.pbc.gov.cn" + url
|
135 |
-
print(url)
|
136 |
article = {}
|
137 |
response = requests.get(url)
|
138 |
response.encoding = 'utf-8'
|
@@ -142,18 +139,15 @@ for categoryu_url in categoryu_urls:
|
|
142 |
for element in article['originalContent'].split("。"):
|
143 |
content_eng += translator.translate(element, dest='en').text + ' '
|
144 |
article['content'] = content_eng
|
145 |
-
print(article['content'])
|
146 |
article['site'] = "The People's Bank of China"
|
147 |
article['originalSite'] = "中国人民银行"
|
148 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
149 |
-
print(article['originalTitle'])
|
150 |
article['title'] = translator.translate(article['originalTitle'], dest='en').text
|
151 |
article['url'] = url
|
152 |
article['category']= "Policy Interpretation"
|
153 |
article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0])
|
154 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d")), "%Y-%m-%d")
|
155 |
-
if parsed_datetime < (datetime.today() - timedelta(days=
|
156 |
-
print(article['publishDate'])
|
157 |
continue
|
158 |
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
|
159 |
label_dict = {
|
@@ -178,7 +172,6 @@ for categoryu_url in categoryu_urls:
|
|
178 |
sentiment_score = sentiment_score + 0
|
179 |
article['sentimentScore'] = sentiment_score
|
180 |
article['sentimentLabel'] = label_dict[sentiment_label]
|
181 |
-
print(article)
|
182 |
upsert_content(article)
|
183 |
except Exception as error:
|
184 |
print(error)
|
|
|
84 |
AWS_ACCESS_KEY_ID = "AKIAQFXZMGHQYXKWUDWR"
|
85 |
AWS_SECRET_ACCESS_KEY = "D2A0IEVl5g3Ljbu0Y5iq9WuFETpDeoEpl69C+6xo"
|
86 |
|
|
|
|
|
87 |
def get_db_connection():
|
88 |
"""Get dynamoDB connection"""
|
89 |
dynamodb = boto3.resource(
|
|
|
103 |
'id': str(report['id']),
|
104 |
'site': report['site'],
|
105 |
'title': report['title'],
|
106 |
+
# 'originalSite': report['originalSite'],
|
107 |
+
# 'originalTitle': report['originalTitle'],
|
108 |
+
# 'originalContent': report['originalContent'],
|
109 |
'category': report['category'],
|
110 |
# 'author': report['author'],
|
111 |
'content': report['content'],
|
|
|
130 |
for url in urls:
|
131 |
try:
|
132 |
url = "http://www.pbc.gov.cn" + url
|
|
|
133 |
article = {}
|
134 |
response = requests.get(url)
|
135 |
response.encoding = 'utf-8'
|
|
|
139 |
for element in article['originalContent'].split("。"):
|
140 |
content_eng += translator.translate(element, dest='en').text + ' '
|
141 |
article['content'] = content_eng
|
|
|
142 |
article['site'] = "The People's Bank of China"
|
143 |
article['originalSite'] = "中国人民银行"
|
144 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
|
|
145 |
article['title'] = translator.translate(article['originalTitle'], dest='en').text
|
146 |
article['url'] = url
|
147 |
article['category']= "Policy Interpretation"
|
148 |
article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0])
|
149 |
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d")), "%Y-%m-%d")
|
150 |
+
if parsed_datetime < (datetime.today() - timedelta(days=183)):
|
|
|
151 |
continue
|
152 |
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
|
153 |
label_dict = {
|
|
|
172 |
sentiment_score = sentiment_score + 0
|
173 |
article['sentimentScore'] = sentiment_score
|
174 |
article['sentimentLabel'] = label_dict[sentiment_label]
|
|
|
175 |
upsert_content(article)
|
176 |
except Exception as error:
|
177 |
print(error)
|