OxbridgeEconomics committed
Commit a6d7194 · 1 Parent(s): 66cae81
.github/workflows/eastmoney.yml ADDED
@@ -0,0 +1,37 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Data Collection - EastMoney
+
+on:
+  # schedule:
+  #   - cron: '0 16 * * *'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        pip install transformers
+        pip install tensorflow
+        pip install tf-keras
+    - name: Data Collection
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      run: |
+        python eastmoney.py
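
The workflow injects the AWS secrets as environment variables rather than baking them into code. boto3's default credential chain reads `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` from the environment, so the connection helper in the scripts below could drop its hardcoded (and printed) keys entirely — a minimal sketch, reusing the region and table name from those scripts:

```python
import boto3

def get_db_connection():
    # Credentials are resolved from the AWS_ACCESS_KEY_ID /
    # AWS_SECRET_ACCESS_KEY environment variables set by the workflow step.
    return boto3.resource('dynamodb', region_name='us-east-1')

table = get_db_connection().Table('article_test')
```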
.github/workflows/{python-app.yml → mof.yml} RENAMED
@@ -1,7 +1,7 @@
 # This workflow will install Python dependencies, run tests and lint with a single version of Python
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 
-name: Security Data Collection
+name: Data Collection - MOF
 
 on:
   # schedule:
@@ -34,4 +34,4 @@ jobs:
         AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
       run: |
-        python main.py
+        python mof.py
.github/workflows/ndrc.yml ADDED
@@ -0,0 +1,37 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Data Collection - NDRC
+
+on:
+  # schedule:
+  #   - cron: '0 16 * * *'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        pip install transformers
+        pip install tensorflow
+        pip install tf-keras
+    - name: Data Collection
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      run: |
+        python ndrc.py
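
All three workflows carry the same commented-out schedule; if re-enabled, `cron: '0 16 * * *'` fires once a day at 16:00 UTC, i.e. midnight China Standard Time (UTC+8). A quick local sanity check of a cron expression — a sketch assuming the third-party `croniter` package:

```python
from datetime import datetime, timezone
from croniter import croniter

# Next firing time of the workflows' (currently disabled) schedule.
nxt = croniter('0 16 * * *', datetime.now(timezone.utc)).get_next(datetime)
print(nxt)  # 16:00 UTC == 00:00 the next day in UTC+8
```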
main.ipynb → eastmoney.ipynb RENAMED
File without changes
main.py → eastmoney.py RENAMED
@@ -74,6 +74,7 @@ def upsert_content(report):
         'site': report['site'],
         'title': report['title'],
         'originalSite': report['originalSite'],
+        'originalTitle': report['originalTitle'],
         'originalContent': report['originalContent'],
         'category': "Macroeconomic Research",
         'author': report['author'],
@@ -94,7 +95,7 @@ reportList = []
 
 
 today = datetime.today().strftime('%Y-%m-%d')
-beginDate = (datetime.today() - timedelta(days=90)).strftime('%Y-%m-%d')
+beginDate = (datetime.today() - timedelta(days=180)).strftime('%Y-%m-%d')
 i = 0
 while i > -1:
     url = "https://reportapi.eastmoney.com/report/jg"
@@ -135,8 +136,8 @@ while i > -1:
         report['site'] = translator.translate(report['orgName'], dest='en').text
         report['originalSite'] = report['orgSName']
         report['reporturl'] = reporturl
-        report['title'] = translator.translate(report['title'], dest='en').text
         report['originalTitle'] = report['title']
+        report['title'] = translator.translate(report['title'], dest='en').text
         report['author'] = translator.translate(report['researcher'], dest='en').text
         report['originalAuthor'] = report['researcher']
         report['originalContent'] = content
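
The last hunk is a genuine bug fix, not a cosmetic swap: previously `report['title']` was overwritten with the English translation first, so `report['originalTitle']` captured the already-translated string and the Chinese title was lost. Capture-then-translate is the pattern — a minimal sketch with a hypothetical title, using the same googletrans call as the script:

```python
from googletrans import Translator

translator = Translator()
report = {'title': '宏观经济研究周报'}  # hypothetical example title

report['originalTitle'] = report['title']  # keep the Chinese original first
report['title'] = translator.translate(report['title'], dest='en').text
```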
mof.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mof.py ADDED
@@ -0,0 +1,305 @@
+import requests
+import uuid
+import time
+import urllib.request
+from lxml import etree
+from googletrans import Translator
+from transformers import pipeline
+from PyPDF2 import PdfReader
+analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
+
+translator = Translator()
+
+def datemodifier(date_string):
+    """Date Modifier Function"""
+    try:
+        to_date = time.strptime(date_string,"%Y-%m-%d %H:%M:%S")
+        return time.strftime("%Y-%m-%d",to_date)
+    except:
+        return False
+
+def fetch_url(url):
+    response = requests.get(url)
+    if response.status_code == 200:
+        return response.text
+    else:
+        return None
+
+def translist(infolist):
+    """Translist Function"""
+    out = list(filter(lambda s: s and
+                      (isinstance (s,str) or len(s.strip()) > 0), [i.strip() for i in infolist]))
+    return out
+
+def encode(content):
+    """Encode Function"""
+    text = ''
+    for element in content:
+        if isinstance(element, etree._Element):
+            subelement = etree.tostring(element).decode()
+            subpage = etree.HTML(subelement)
+            tree = subpage.xpath('//text()')
+            line = ''.join(translist(tree)).\
+                replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
+        else:
+            line = element
+        text += line
+    return text
+
+def extract_from_pdf(url):
+    # Send a GET request to the URL and retrieve the PDF content
+    response = requests.get(url)
+    pdf_content = response.content
+
+    # Save the PDF content to a local file
+    with open("downloaded_file.pdf", "wb") as f:
+        f.write(pdf_content)
+
+    # Open the downloaded PDF file and extract the text
+    with open("downloaded_file.pdf", "rb") as f:
+        pdf_reader = PdfReader(f)
+        num_pages = len(pdf_reader.pages)
+        extracted_text = ""
+        extracted_text_eng = ""
+        for page in range(num_pages):
+            text = pdf_reader.pages[page].extract_text()
+            if text and text[0].isdigit():
+                text = text[1:]
+            first_newline_index = text.find('\n')
+            text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:].replace('\n', '')
+            extracted_text_eng += translator.translate(text, dest='en').text
+            extracted_text += text
+    return extracted_text, extracted_text_eng
+
+"""Upload file to dynamoDB"""
+# import datetime
+from datetime import datetime, timedelta
+from decimal import Decimal
+import boto3
+
+AWS_ACCESS_KEY_ID = "AKIAQFXZMGHQYXKWUDWR"
+AWS_SECRET_ACCESS_KEY = "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"
+
+print(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+
+def get_db_connection():
+    """Get dynamoDB connection"""
+    dynamodb = boto3.resource(
+        service_name='dynamodb',
+        region_name='us-east-1',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+    )
+    return dynamodb
+
+def upsert_content(report):
+    """Upsert the content records"""
+    dynamodb = get_db_connection()
+    table = dynamodb.Table('article_test')
+    # Define the item data
+    item = {
+        'id': str(report['id']),
+        'site': report['site'],
+        'title': report['title'],
+        'originalSite': report['originalSite'],
+        'originalTitle': report['originalTitle'],
+        'originalContent': report['originalContent'],
+        'category': report['category'],
+        # 'author': report['author'],
+        'content': report['content'],
+        'publishDate': report['publishDate'],
+        'link': report['url'],
+        # 'attachment': report['reporturl'],
+        # 'authorID': str(report['authorid']),
+        'sentimentScore': str(Decimal(report['sentimentScore']).quantize(Decimal('0.01'))),
+        'sentimentLabel': report['sentimentLabel'],
+        'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+    }
+    response = table.put_item(Item=item)
+    print(response)
+
+reportList = []
+categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"]
+for categoryu_url in categoryu_urls:
+    req = urllib.request.urlopen(categoryu_url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime > (datetime.today() - timedelta(days=180)):
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        print(url)
+                        article = {}
+                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        article['content'] = translator.translate(article['originalContent'], dest='en').text
+                        article['site'] = "Ministry of Finance"
+                        article['originalSite'] = "财政部"
+                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                        article['title'] = translator.translate(article['originalTitle'], dest='en').text
+                        article['url'] = url
+                        article['category']= "Finance News"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        label_dict = {
+                            "positive": "+",
+                            "negative": "-",
+                            "neutral": "0",
+                        }
+                        sentiment_score = 0
+                        maximum_value = 0
+                        raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+                        sentiment_label = None
+                        for sentiment_dict in raw_sentiment[0]:
+                            value = sentiment_dict["score"]
+                            if value > maximum_value:
+                                sentiment_label = sentiment_dict["label"]
+                                maximum_value = value
+                            if sentiment_dict["label"] == "positive":
+                                sentiment_score = sentiment_score + value
+                            if sentiment_dict["label"] == "negative":
+                                sentiment_score = sentiment_score - value
+                            else:
+                                sentiment_score = sentiment_score + 0
+                        article['sentimentScore'] = sentiment_score
+                        article['sentimentLabel'] = label_dict[sentiment_label]
+                        print(article)
+                        # upsert_content(article)
+                    except Exception as error:
+                        print(error)
+
+reportList = []
+categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"]
+for categoryu_url in categoryu_urls:
+    req = urllib.request.urlopen(categoryu_url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime > (datetime.today() - timedelta(days=180)):
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        print(url)
+                        article = {}
+                        url = url.replace("./", categoryu_url)
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        article['content'] = translator.translate(article['originalContent'], dest='en').text
+                        article['site'] = "Ministry of Finance"
+                        article['originalSite'] = "财政部"
+                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                        article['title'] = translator.translate(article['originalTitle'], dest='en').text
+                        article['url'] = url
+                        article['category']= "Policy Release"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        label_dict = {
+                            "positive": "+",
+                            "negative": "-",
+                            "neutral": "0",
+                        }
+                        sentiment_score = 0
+                        maximum_value = 0
+                        raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+                        sentiment_label = None
+                        for sentiment_dict in raw_sentiment[0]:
+                            value = sentiment_dict["score"]
+                            if value > maximum_value:
+                                sentiment_label = sentiment_dict["label"]
+                                maximum_value = value
+                            if sentiment_dict["label"] == "positive":
+                                sentiment_score = sentiment_score + value
+                            if sentiment_dict["label"] == "negative":
+                                sentiment_score = sentiment_score - value
+                            else:
+                                sentiment_score = sentiment_score + 0
+                        article['sentimentScore'] = sentiment_score
+                        article['sentimentLabel'] = label_dict[sentiment_label]
+                        print(article)
+                        # upsert_content(article)
+                    except Exception as error:
+                        print(error)
+
+reportList = []
+categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"]
+for categoryu_url in categoryu_urls:
+    req = urllib.request.urlopen(categoryu_url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime > (datetime.today() - timedelta(days=180)):
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        print(url)
+                        article = {}
+                        url = url.replace("./", categoryu_url)
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        article['content'] = translator.translate(article['originalContent'], dest='en').text
+                        article['site'] = "Ministry of Finance"
+                        article['originalSite'] = "财政部"
+                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                        article['title'] = translator.translate(article['originalTitle'], dest='en').text
+                        article['url'] = url
+                        article['category']= "Policy Interpretation"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        label_dict = {
+                            "positive": "+",
+                            "negative": "-",
+                            "neutral": "0",
+                        }
+                        sentiment_score = 0
+                        maximum_value = 0
+                        raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+                        sentiment_label = None
+                        for sentiment_dict in raw_sentiment[0]:
+                            value = sentiment_dict["score"]
+                            if value > maximum_value:
+                                sentiment_label = sentiment_dict["label"]
+                                maximum_value = value
+                            if sentiment_dict["label"] == "positive":
+                                sentiment_score = sentiment_score + value
+                            if sentiment_dict["label"] == "negative":
+                                sentiment_score = sentiment_score - value
+                            else:
+                                sentiment_score = sentiment_score + 0
+                        article['sentimentScore'] = sentiment_score
+                        article['sentimentLabel'] = label_dict[sentiment_label]
+                        print(article)
+                        # upsert_content(article)
+                    except Exception as error:
+                        print(error)
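
mof.py repeats the FinBERT scoring block verbatim for each of its three category pages. Note also that the dangling `else` binds to the `negative` check, so positive and neutral labels fall through to a `+ 0` no-op; the net score is simply positives minus negatives. A hedged refactor sketch that keeps that net effect in one helper (`score_sentiment` is my own name, not in the commit):

```python
def score_sentiment(analyzer, text):
    """Signed FinBERT score and +/-/0 label for the first 512 chars of text."""
    label_map = {"positive": "+", "negative": "-", "neutral": "0"}
    best_label, best_value, score = None, 0.0, 0.0
    for d in analyzer(text[:512], return_all_scores=True)[0]:
        if d["score"] > best_value:      # track the highest-probability label
            best_label, best_value = d["label"], d["score"]
        if d["label"] == "positive":     # net score = positive - negative
            score += d["score"]
        elif d["label"] == "negative":
            score -= d["score"]
    return score, label_map[best_label]
```

Each of the three loops could then reduce to `article['sentimentScore'], article['sentimentLabel'] = score_sentiment(analyzer, article['content'])`.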
ndrc.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
ndrc.py ADDED
@@ -0,0 +1,259 @@
+import requests
+import uuid
+import time
+import urllib.request
+from lxml import etree
+from googletrans import Translator
+from transformers import pipeline
+from PyPDF2 import PdfReader
+analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
+
+translator = Translator()
+
+def datemodifier(date_string):
+    """Date Modifier Function"""
+    try:
+        to_date = time.strptime(date_string,"%Y-%m-%d %H:%M:%S")
+        return time.strftime("%Y-%m-%d",to_date)
+    except:
+        return False
+
+def fetch_url(url):
+    response = requests.get(url)
+    if response.status_code == 200:
+        return response.text
+    else:
+        return None
+
+def translist(infolist):
+    """Translist Function"""
+    out = list(filter(lambda s: s and
+                      (isinstance (s,str) or len(s.strip()) > 0), [i.strip() for i in infolist]))
+    return out
+
+def encode(content):
+    """Encode Function"""
+    text = ''
+    for element in content:
+        if isinstance(element, etree._Element):
+            subelement = etree.tostring(element).decode()
+            subpage = etree.HTML(subelement)
+            tree = subpage.xpath('//text()')
+            line = ''.join(translist(tree)).\
+                replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
+        else:
+            line = element
+        text += line
+    return text
+
+def extract_from_pdf(url):
+    # Send a GET request to the URL and retrieve the PDF content
+    response = requests.get(url)
+    pdf_content = response.content
+
+    # Save the PDF content to a local file
+    with open("downloaded_file.pdf", "wb") as f:
+        f.write(pdf_content)
+
+    # Open the downloaded PDF file and extract the text
+    with open("downloaded_file.pdf", "rb") as f:
+        pdf_reader = PdfReader(f)
+        num_pages = len(pdf_reader.pages)
+        extracted_text = ""
+        extracted_text_eng = ""
+        for page in range(num_pages):
+            text = pdf_reader.pages[page].extract_text()
+            if text and text[0].isdigit():
+                text = text[1:]
+            first_newline_index = text.find('\n')
+            text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:].replace('\n', '')
+            extracted_text_eng += translator.translate(text, dest='en').text
+            extracted_text += text
+    return extracted_text, extracted_text_eng
+
+"""Upload file to dynamoDB"""
+# import datetime
+from datetime import datetime, timedelta
+from decimal import Decimal
+import boto3
+
+AWS_ACCESS_KEY_ID = "AKIAQFXZMGHQYXKWUDWR"
+AWS_SECRET_ACCESS_KEY = "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"
+
+print(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+
+def get_db_connection():
+    """Get dynamoDB connection"""
+    dynamodb = boto3.resource(
+        service_name='dynamodb',
+        region_name='us-east-1',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+    )
+    return dynamodb
+
+def upsert_content(report):
+    """Upsert the content records"""
+    dynamodb = get_db_connection()
+    table = dynamodb.Table('article_test')
+    # Define the item data
+    item = {
+        'id': str(report['id']),
+        'site': report['site'],
+        'title': report['title'],
+        'originalSite': report['originalSite'],
+        'originalTitle': report['originalTitle'],
+        'originalContent': report['originalContent'],
+        'category': report['category'],
+        # 'author': report['author'],
+        'content': report['content'],
+        'publishDate': report['publishDate'],
+        'link': report['url'],
+        # 'attachment': report['reporturl'],
+        # 'authorID': str(report['authorid']),
+        'sentimentScore': str(Decimal(report['sentimentScore']).quantize(Decimal('0.01'))),
+        'sentimentLabel': report['sentimentLabel'],
+        'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+    }
+    response = table.put_item(Item=item)
+    print(response)
+
+reportList = []
+categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/zcfb/fzggwl/", "https://www.ndrc.gov.cn/xxgk/zcfb/ghxwj/","https://www.ndrc.gov.cn/xxgk/zcfb/ghwb/","https://www.ndrc.gov.cn/xxgk/zcfb/gg/","https://www.ndrc.gov.cn/xxgk/zcfb/tz/","https://www.ndrc.gov.cn/xxgk/zcfb/pifu/","https://www.ndrc.gov.cn/xxgk/zcfb/qt/"]
+for categoryu_url in categoryu_urls:
+    req = urllib.request.urlopen(categoryu_url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
+            if parsed_datetime > (datetime.today() - timedelta(days=180)):
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        if "/jd/jd" in url:
+                            url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/")
+                        else:
+                            url = url.replace("./", categoryu_url)
+                        print(url)
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        attachment_urls = page.xpath("//div[contains(@class, 'attachment_r')]//a/@href")
+                        for attachment_url in attachment_urls:
+                            if ".pdf" in attachment_url:
+                                pdf_url = url.rsplit('/', 1)[0] + attachment_url.replace('./','/')
+                                pdf_content, extracted_text_eng = extract_from_pdf(pdf_url)
+                                article['content'] = extracted_text_eng
+                                article['originalContent'] = pdf_content
+                        article['site'] = "National Development and Reform Commission"
+                        article['originalSite'] = "国家发展和改革委员会"
+                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                        article['title'] = translator.translate(article['originalTitle'], dest='en').text
+                        article['url'] = url
+                        article['category']= "Policy Release"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        label_dict = {
+                            "positive": "+",
+                            "negative": "-",
+                            "neutral": "0",
+                        }
+                        sentiment_score = 0
+                        maximum_value = 0
+                        raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+                        sentiment_label = None
+                        for sentiment_dict in raw_sentiment[0]:
+                            value = sentiment_dict["score"]
+                            if value > maximum_value:
+                                sentiment_label = sentiment_dict["label"]
+                                maximum_value = value
+                            if sentiment_dict["label"] == "positive":
+                                sentiment_score = sentiment_score + value
+                            if sentiment_dict["label"] == "negative":
+                                sentiment_score = sentiment_score - value
+                            else:
+                                sentiment_score = sentiment_score + 0
+                        article['sentimentScore'] = sentiment_score
+                        article['sentimentLabel'] = label_dict[sentiment_label]
+                        print(article)
+                        upsert_content(article)
+                    except Exception as error:
+                        print(error)
+
+reportList = []
+categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"]
+for categoryu_url in categoryu_urls:
+    req = urllib.request.urlopen(categoryu_url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
+            if parsed_datetime > (datetime.today() - timedelta(days=180)):
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        print(url)
+                        article = {}
+                        url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                        url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                        print(url)
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        split_text = article['originalContent'].split("。")
+                        half_length = len(split_text) // 2
+                        part1 = "。".join(split_text[:half_length])
+                        part2 = "。".join(split_text[half_length:])
+                        article['content'] = translator.translate(part1, dest='en').text + translator.translate(part2, dest='en').text
+                        print(len(article['originalContent']),article['content'])
+                        article['site'] = "National Development and Reform Commission"
+                        article['originalSite'] = "国家发展和改革委员会"
+                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                        article['title'] = translator.translate(article['originalTitle'], dest='en').text
+                        article['url'] = url
+                        article['category']= "Policy Interpretation"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        label_dict = {
+                            "positive": "+",
+                            "negative": "-",
+                            "neutral": "0",
+                        }
+                        sentiment_score = 0
+                        maximum_value = 0
+                        raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+                        sentiment_label = None
+                        for sentiment_dict in raw_sentiment[0]:
+                            value = sentiment_dict["score"]
+                            if value > maximum_value:
+                                sentiment_label = sentiment_dict["label"]
+                                maximum_value = value
+                            if sentiment_dict["label"] == "positive":
+                                sentiment_score = sentiment_score + value
+                            if sentiment_dict["label"] == "negative":
+                                sentiment_score = sentiment_score - value
+                            else:
+                                sentiment_score = sentiment_score + 0
+                        article['sentimentScore'] = sentiment_score
+                        article['sentimentLabel'] = label_dict[sentiment_label]
+                        print(article)
+                        upsert_content(article)
+                    except Exception as error:
+                        print(error)
+
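
The interpretation loop in ndrc.py splits each article in half at "。" before translating, presumably to stay under googletrans's per-request size limit. A slightly more general sketch that chunks on the same sentence delimiter — the 4,500-character ceiling is an assumption, not a limit documented in this repo:

```python
from googletrans import Translator

translator = Translator()

def translate_long(text, max_chars=4500):
    """Translate Chinese text in sentence-aligned chunks (delimiter: 。)."""
    chunks, current = [], ""
    for sentence in text.split("。"):
        piece = sentence + "。"  # note: re-adds a trailing 。 on the last chunk
        if current and len(current) + len(piece) > max_chars:
            chunks.append(current)
            current = ""
        current += piece
    if current:
        chunks.append(current)
    return "".join(translator.translate(c, dest='en').text for c in chunks)
```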
requirements.txt CHANGED
@@ -21,3 +21,4 @@ s3transfer==0.10.0
 six==1.16.0
 sniffio==1.3.1
 urllib3==2.0.7
+PyPDF2
sample.csv DELETED
The diff for this file is too large to render. See raw diff