OxbridgeEconomics committed
Commit b2dbbaf · Parent(s): 24d2e54

Files changed:
- eastmoney.py (+8 -10)
- mof.py (+29 -31)
- ndrc.py (+8 -13)
eastmoney.py
CHANGED
@@ -10,6 +10,10 @@ import os
 from datetime import datetime, timedelta
 from decimal import Decimal
 from transformers import pipeline
+
+AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+
 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 
 translator = Translator()
@@ -50,9 +54,6 @@ def encode(content):
         text += line
     return text
 
-AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
-AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
-
 def get_db_connection():
     """Get dynamoDB connection"""
     dynamodb = boto3.resource(
@@ -90,10 +91,6 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-
-reportList = []
-
-
 today = datetime.today().strftime('%Y-%m-%d')
 beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
 i = 0
@@ -141,7 +138,10 @@ while i > -1:
             report['author'] = translator.translate(report['researcher'], dest='en').text
             report['originalAuthor'] = report['researcher']
             report['originalContent'] = content
-
+            content_eng = ''
+            for element in article['originalContent'].split("。"):
+                content_eng += translator.translate(element, dest='en').text + ' '
+            article['content'] = content_eng
             report['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, report['author'])
             report['publishDate'] = datemodifier(report['publishDate'])
             report['id'] = uuid.uuid5(uuid.NAMESPACE_OID, report['title']+report['publishDate'])
@@ -167,9 +167,7 @@ while i > -1:
                 sentiment_score = sentiment_score + 0
             report['sentimentScore'] = sentiment_score
             report['sentimentLabel'] = label_dict[sentiment_label]
-            print(report)
             upsert_content(report)
-            reportList.append(report)
         except Exception as error:
             print(error)
     else:
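The hunk at new line 138 adds the same sentence-by-sentence translation loop used in mof.py, though the inserted lines write to `article` while the surrounding loop builds `report`, which looks like a carry-over from mof.py. A minimal standalone sketch of the pattern, assuming the synchronous googletrans API these scripts already use:

```python
from googletrans import Translator

translator = Translator()

def translate_content(original: str) -> str:
    """Translate Chinese text to English one sentence at a time."""
    content_eng = ''
    # Splitting on the Chinese full stop "。" keeps each request short,
    # which tends to be more reliable than sending the whole article at once.
    for sentence in original.split("。"):
        if sentence.strip():  # skip the empty chunk after a trailing "。"
            content_eng += translator.translate(sentence, dest='en').text + ' '
    return content_eng.strip()
```

The per-sentence requests trade speed for robustness: a failure or length limit on one sentence does not lose the rest of the article.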
mof.py
CHANGED
@@ -6,6 +6,14 @@ from lxml import etree
 from googletrans import Translator
 from transformers import pipeline
 from PyPDF2 import PdfReader
+from datetime import datetime, timedelta
+from decimal import Decimal
+import boto3
+import os
+
+AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+
 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
 
 translator = Translator()
@@ -71,16 +79,6 @@ def extract_from_pdf(url):
         extracted_text += text
     return extracted_text, extracted_text_eng
 
-"""Upload file to dynamoDB"""
-# import datetime
-from datetime import datetime, timedelta
-from decimal import Decimal
-import boto3
-import os
-
-AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
-AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
-
 def get_db_connection():
     """Get dynamoDB connection"""
     dynamodb = boto3.resource(
@@ -100,9 +98,9 @@ def upsert_content(report):
         'id': str(report['id']),
         'site': report['site'],
         'title': report['title'],
-        'originalSite': report['originalSite'],
-        'originalTitle': report['originalTitle'],
-        'originalContent': report['originalContent'],
+        # 'originalSite': report['originalSite'],
+        # 'originalTitle': report['originalTitle'],
+        # 'originalContent': report['originalContent'],
         'category': report['category'],
         # 'author': report['author'],
         'content': report['content'],
@@ -117,7 +115,6 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-reportList = []
 categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"]
 for categoryu_url in categoryu_urls:
     req = urllib.request.urlopen(categoryu_url)
@@ -131,11 +128,10 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
-                    print(url)
                     article = {}
                     url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                     req = urllib.request.urlopen(url)
@@ -143,7 +139,10 @@ for categoryu_url in categoryu_urls:
                     html_text = text.decode("utf-8")
                     page = etree.HTML(html_text)
                     article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-
+                    content_eng = ''
+                    for element in article['originalContent'].split("。"):
+                        content_eng += translator.translate(element, dest='en').text + ' '
+                    article['content'] = content_eng
                     article['site'] = "Ministry of Finance"
                     article['originalSite'] = "财政部"
                     article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -174,12 +173,10 @@ for categoryu_url in categoryu_urls:
                         sentiment_score = sentiment_score + 0
                     article['sentimentScore'] = sentiment_score
                     article['sentimentLabel'] = label_dict[sentiment_label]
-
-                    # upsert_content(article)
+                    upsert_content(article)
                 except Exception as error:
                     print(error)
 
-reportList = []
 categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"]
 for categoryu_url in categoryu_urls:
     req = urllib.request.urlopen(categoryu_url)
@@ -193,11 +190,10 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
-                    print(url)
                     article = {}
                     url = url.replace("./", categoryu_url)
                     req = urllib.request.urlopen(url)
@@ -205,7 +201,10 @@ for categoryu_url in categoryu_urls:
                     html_text = text.decode("utf-8")
                     page = etree.HTML(html_text)
                     article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-
+                    content_eng = ''
+                    for element in article['originalContent'].split("。"):
+                        content_eng += translator.translate(element, dest='en').text + ' '
+                    article['content'] = content_eng
                     article['site'] = "Ministry of Finance"
                     article['originalSite'] = "财政部"
                     article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -236,12 +235,10 @@ for categoryu_url in categoryu_urls:
                         sentiment_score = sentiment_score + 0
                     article['sentimentScore'] = sentiment_score
                     article['sentimentLabel'] = label_dict[sentiment_label]
-
-                    # upsert_content(article)
+                    upsert_content(article)
                 except Exception as error:
                     print(error)
 
-reportList = []
 categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"]
 for categoryu_url in categoryu_urls:
     req = urllib.request.urlopen(categoryu_url)
@@ -255,11 +252,10 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
            urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
-                    print(url)
                    article = {}
                     url = url.replace("./", categoryu_url)
                     req = urllib.request.urlopen(url)
@@ -267,7 +263,10 @@ for categoryu_url in categoryu_urls:
                     html_text = text.decode("utf-8")
                     page = etree.HTML(html_text)
                     article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-
+                    content_eng = ''
+                    for element in article['originalContent'].split("。"):
+                        content_eng += translator.translate(element, dest='en').text + ' '
+                    article['content'] = content_eng
                     article['site'] = "Ministry of Finance"
                     article['originalSite'] = "财政部"
                     article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -298,7 +297,6 @@ for categoryu_url in categoryu_urls:
                         sentiment_score = sentiment_score + 0
                     article['sentimentScore'] = sentiment_score
                     article['sentimentLabel'] = label_dict[sentiment_label]
-
-                    # upsert_content(article)
+                    upsert_content(article)
                 except Exception as error:
                     print(error)
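mof.py now reads AWS credentials at module level and re-enables the previously commented-out `upsert_content(article)` calls, so every scraped article is written straight to DynamoDB. A hedged sketch of that write path, assuming boto3; the region and table name below are placeholders, since the diff does not show them:

```python
import os
import boto3

AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']

def get_db_connection():
    """Get dynamoDB connection"""
    return boto3.resource(
        'dynamodb',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name='us-east-1')  # region is an assumption, not shown in the diff

def upsert_content(article):
    """Write one scraped article to DynamoDB."""
    table = get_db_connection().Table('articles')  # hypothetical table name
    item = {
        'id': str(article['id']),
        'site': article['site'],
        'title': article['title'],
        'category': article['category'],
        'content': article['content'],
        # boto3 rejects Python floats; scores should be Decimal
        # (hence the `from decimal import Decimal` these scripts carry).
        'sentimentScore': article['sentimentScore'],
        'sentimentLabel': article['sentimentLabel'],
    }
    return table.put_item(Item=item)
```

`put_item` overwrites any existing item with the same key, which is what makes repeated scraping runs idempotent per article id.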
ndrc.py
CHANGED
@@ -75,16 +75,14 @@ def extract_from_pdf(url):
         pdf_reader = PdfReader(f)
         num_pages = len(pdf_reader.pages)
         extracted_text = ""
-        extracted_text_eng = ""
         for page in range(num_pages):
             text = pdf_reader.pages[page].extract_text()
             if text and text[0].isdigit():
                 text = text[1:]
             first_newline_index = text.find('\n')
             text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:].replace('\n', '')
-            extracted_text_eng += translator.translate(text, dest='en').text
             extracted_text += text
-        return extracted_text
+    return extracted_text
 
 def get_db_connection():
     """Get dynamoDB connection"""
@@ -135,7 +133,7 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
@@ -144,7 +142,6 @@ for categoryu_url in categoryu_urls:
                         url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/")
                     else:
                         url = url.replace("./", categoryu_url)
-                    print(url)
                     req = urllib.request.urlopen(url)
                     text = req.read()
                     html_text = text.decode("utf-8")
@@ -153,9 +150,12 @@ for categoryu_url in categoryu_urls:
                     for attachment_url in attachment_urls:
                         if ".pdf" in attachment_url:
                             pdf_url = url.rsplit('/', 1)[0] + attachment_url.replace('./','/')
-                            pdf_content
-                            article['content'] = extracted_text_eng
+                            pdf_content = extract_from_pdf(pdf_url)
                             article['originalContent'] = pdf_content
+                            content_eng = ''
+                            for element in article['originalContent'].split("。"):
+                                content_eng += translator.translate(element, dest='en').text + ' '
+                            article['content'] = content_eng
                             article['site'] = "National Development and Reform Commission"
                             article['originalSite'] = "国家发展和改革委员会"
                             article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -186,7 +186,6 @@ for categoryu_url in categoryu_urls:
                         sentiment_score = sentiment_score + 0
                     article['sentimentScore'] = sentiment_score
                     article['sentimentLabel'] = label_dict[sentiment_label]
-                    print(article)
                     upsert_content(article)
                 except Exception as error:
                     print(error)
@@ -204,13 +203,12 @@ for categoryu_url in categoryu_urls:
         subpage = etree.HTML(subelement)
         date = subpage.xpath("//span/text()")[0]
         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
-        if parsed_datetime > (datetime.today() - timedelta(days=
+        if parsed_datetime > (datetime.today() - timedelta(days=183)):
             urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
             for url in urls:
                 try:
                     article = {}
                     if "https://www.gov.cn" in url:
-                        print(url)
                         req = urllib.request.urlopen(url)
                         text = req.read()
                         html_text = text.decode("utf-8")
@@ -220,7 +218,6 @@ for categoryu_url in categoryu_urls:
                         for element in article['originalContent'].split("。"):
                             content_eng += translator.translate(element, dest='en').text + ' '
                         article['content'] = content_eng
-                        print(article['content'])
                         article['site'] = "State Council"
                         article['originalSite'] = "国务院"
                         article['originalTitle'] = page.xpath("//title/text()")[0]
@@ -240,7 +237,6 @@ for categoryu_url in categoryu_urls:
                         for element in article['originalContent'].split("。"):
                             content_eng += translator.translate(element, dest='en').text + ' '
                         article['content'] = content_eng
-                        print(article['content'])
                         article['site'] = "National Development and Reform Commission"
                         article['originalSite'] = "国家发展和改革委员会"
                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
@@ -271,7 +267,6 @@ for categoryu_url in categoryu_urls:
                         sentiment_score = sentiment_score + 0
                     article['sentimentScore'] = sentiment_score
                     article['sentimentLabel'] = label_dict[sentiment_label]
-                    print(article)
                     upsert_content(article)
                 except Exception as error:
                     print(error)
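After this commit `extract_from_pdf` returns only the original Chinese text, and translation happens downstream through the same sentence-splitting loop used for HTML articles. A sketch of the reworked helper under that reading, assuming PyPDF2's `PdfReader` API; the diff does not show how the file handle `f` is obtained, so the download step here is illustrative:

```python
import urllib.request
from PyPDF2 import PdfReader

def extract_from_pdf(url):
    """Download a PDF and return its text with page numbers and line breaks cleaned up."""
    local_path, _ = urllib.request.urlretrieve(url)  # illustrative; download step not shown in the diff
    extracted_text = ""
    with open(local_path, 'rb') as f:
        pdf_reader = PdfReader(f)
        for page in range(len(pdf_reader.pages)):
            text = pdf_reader.pages[page].extract_text()
            if text and text[0].isdigit():  # drop a leading page number
                text = text[1:]
            first_newline_index = text.find('\n')
            # keep the first line (usually the title) space-separated,
            # then join the remaining lines into one block
            text = (text[:first_newline_index + 1].replace('\n', ' ')
                    + text[first_newline_index + 1:].replace('\n', ''))
            extracted_text += text
    return extracted_text
```

Dropping the in-loop `translator.translate(text, ...)` call also removes one full translation pass per page, which the old tuple-returning version paid for even when the English text went unused.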