OxbridgeEconomics committed
Commit 0fc522e · 1 Parent(s): ae6aa5f
Files changed (7)
  1. eastmoney.ipynb +0 -0
  2. gov.py +254 -0
  3. mof.ipynb +0 -0
  4. mof.py +74 -66
  5. ndrc.ipynb +0 -0
  6. ndrc.py +75 -71
  7. pbc.py +8 -3
eastmoney.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
gov.py ADDED
@@ -0,0 +1,254 @@
+import requests
+from datetime import datetime, timedelta
+from decimal import Decimal
+import boto3
+import uuid
+import time
+import urllib.request
+from lxml import etree
+from googletrans import Translator
+from transformers import pipeline
+from PyPDF2 import PdfReader
+import os
+
+AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+
+analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
+
+translator = Translator()
+
+def datemodifier(date_string):
+    """Normalize a '%Y-%m-%d %H:%M:%S' timestamp to '%Y-%m-%d'."""
+    try:
+        to_date = time.strptime(date_string, "%Y-%m-%d %H:%M:%S")
+        return time.strftime("%Y-%m-%d", to_date)
+    except ValueError:
+        return False
+
+def fetch_url(url):
+    """Return the response body for url, or None on a non-200 status."""
+    response = requests.get(url)
+    if response.status_code == 200:
+        return response.text
+    else:
+        return None
+
+def translist(infolist):
+    """Strip each text node and drop empty strings."""
+    out = list(filter(lambda s: isinstance(s, str) and len(s.strip()) > 0,
+                      [i.strip() for i in infolist]))
+    return out
+
+def encode(content):
+    """Flatten a list of lxml elements (or strings) into one cleaned string."""
+    text = ''
+    for element in content:
+        if isinstance(element, etree._Element):
+            subelement = etree.tostring(element).decode()
+            subpage = etree.HTML(subelement)
+            tree = subpage.xpath('//text()')
+            line = ''.join(translist(tree)).replace('\n', '').replace('\t', '').replace('\r', '').replace(' ', '').strip()
+        else:
+            line = element
+        text += line
+    # Cut everything from the "打印本页" ("Print this page") footer onwards
+    index = text.find('打印本页')
+    if index != -1:
+        text = text[:index]
+
+    return text
+
+def extract_from_pdf(url):
+    # Send a GET request to the URL and retrieve the PDF content
+    response = requests.get(url)
+    pdf_content = response.content
+
+    # Save the PDF content to a local file
+    with open("downloaded_file.pdf", "wb") as f:
+        f.write(pdf_content)
+
+    # Open the downloaded PDF file and extract the text, page by page
+    with open("downloaded_file.pdf", "rb") as f:
+        pdf_reader = PdfReader(f)
+        num_pages = len(pdf_reader.pages)
+        extracted_text = ""
+        extracted_text_eng = ""
+        for page in range(num_pages):
+            text = pdf_reader.pages[page].extract_text()
+            if text and text[0].isdigit():
+                # Drop a leading page number
+                text = text[1:]
+            # Keep the first (title) line separated by a space, unwrap the rest
+            first_newline_index = text.find('\n')
+            text = text[:first_newline_index + 1].replace('\n', ' ') + text[first_newline_index + 1:].replace('\n', '')
+            extracted_text_eng += translator.translate(text, dest='en').text
+            extracted_text += text
+    return extracted_text, extracted_text_eng
+
+def get_db_connection():
+    """Get DynamoDB connection"""
+    dynamodb = boto3.resource(
+        service_name='dynamodb',
+        region_name='us-east-1',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+    )
+    return dynamodb
+
+def upsert_content(report):
+    """Upsert one article record into the article_china table."""
+    dynamodb = get_db_connection()
+    table = dynamodb.Table('article_china')
+    # Define the item data
+    item = {
+        'id': str(report['id']),
+        'site': report['site'],
+        'title': report['title'],
+        # 'originalSite': report['originalSite'],
+        # 'originalTitle': report['originalTitle'],
+        # 'originalContent': report['originalContent'],
+        'category': report['category'],
+        # 'author': report['author'],
+        'content': report['content'],
+        'publishDate': report['publishDate'],
+        'link': report['url'],
+        # 'attachment': report['reporturl'],
+        # 'authorID': str(report['authorid']),
+        'sentimentScore': str(Decimal(report['sentimentScore']).quantize(Decimal('0.01'))),
+        'sentimentLabel': report['sentimentLabel'],
+        'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+    }
+    response = table.put_item(Item=item)
+    print(response)
+
+# Page through the "Policy Interpretation" index (home.htm, home_1.htm, ...)
+# until an index page reaches articles older than 183 days.
+i = 0
+while i > -1:
+    if i == 0:
+        categoryu_url = "https://www.gov.cn/zhengce/jiedu/home.htm"
+    else:
+        categoryu_url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
+    i += 1  # advance to the next index page on the following pass
+    req = urllib.request.urlopen(categoryu_url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+                # Reached articles outside the window: stop paging
+                print(categoryu_url)
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    article = {}
+                    url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                    if "https://www.gov.cn" in url:
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
+                        # Translate sentence by sentence to stay within API limits
+                        content_eng = ''
+                        for element in article['originalContent'].split("。"):
+                            content_eng += translator.translate(element, dest='en').text + ' '
+                        article['content'] = content_eng
+                        article['site'] = "State Council"
+                        article['originalSite'] = "国务院"
+                        article['originalTitle'] = page.xpath("//title/text()")[0]
+                        article['title'] = translator.translate(article['originalTitle'], dest='en').text
+                        article['url'] = url
+                        article['category'] = "Policy Interpretation"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0])
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
+                        label_dict = {
+                            "positive": "+",
+                            "negative": "-",
+                            "neutral": "0",
+                        }
+                        # Signed score: positive probability minus negative probability;
+                        # the label is the highest-probability class
+                        sentiment_score = 0
+                        maximum_value = 0
+                        raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+                        sentiment_label = None
+                        for sentiment_dict in raw_sentiment[0]:
+                            value = sentiment_dict["score"]
+                            if value > maximum_value:
+                                sentiment_label = sentiment_dict["label"]
+                                maximum_value = value
+                            if sentiment_dict["label"] == "positive":
+                                sentiment_score = sentiment_score + value
+                            elif sentiment_dict["label"] == "negative":
+                                sentiment_score = sentiment_score - value
+                        article['sentimentScore'] = sentiment_score
+                        article['sentimentLabel'] = label_dict[sentiment_label]
+                        upsert_content(article)
+
+# Page through the "Policy Release" index over the same six-month window.
+i = 0
+while i > -1:
+    if i == 0:
+        categoryu_url = "https://www.gov.cn/zhengce/zuixin/home.htm"
+    else:
+        categoryu_url = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
+    i += 1  # advance to the next index page on the following pass
+    req = urllib.request.urlopen(categoryu_url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime > (datetime.today() - timedelta(days=183)):
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                        if "https://www.gov.cn" in url:
+                            req = urllib.request.urlopen(url)
+                            text = req.read()
+                            html_text = text.decode("utf-8")
+                            page = etree.HTML(html_text)
+                            article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
+                            content_eng = ''
+                            for element in article['originalContent'].split("。"):
+                                content_eng += translator.translate(element, dest='en').text + ' '
+                            article['content'] = content_eng
+                            article['site'] = "State Council"
+                            article['originalSite'] = "国务院"
+                            article['originalTitle'] = page.xpath("//title/text()")[0]
+                            article['title'] = translator.translate(article['originalTitle'], dest='en').text
+                            article['url'] = url
+                            article['category'] = "Policy Release"
+                            article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0])
+                            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
+                            label_dict = {
+                                "positive": "+",
+                                "negative": "-",
+                                "neutral": "0",
+                            }
+                            sentiment_score = 0
+                            maximum_value = 0
+                            raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+                            sentiment_label = None
+                            for sentiment_dict in raw_sentiment[0]:
+                                value = sentiment_dict["score"]
+                                if value > maximum_value:
+                                    sentiment_label = sentiment_dict["label"]
+                                    maximum_value = value
+                                if sentiment_dict["label"] == "positive":
+                                    sentiment_score = sentiment_score + value
+                                elif sentiment_dict["label"] == "negative":
+                                    sentiment_score = sentiment_score - value
+                            article['sentimentScore'] = sentiment_score
+                            article['sentimentLabel'] = label_dict[sentiment_label]
+                            upsert_content(article)
+                    except Exception as error:
+                        print(error)
+            else:
+                # Reached articles outside the window: stop paging
+                i = -1
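
Note: both loops above page through home.htm, home_1.htm, ... and stop once an index page reaches articles older than 183 days. A minimal sketch of that pagination pattern as one reusable helper; `crawl_category`, `make_url`, and `oldest_date_on` are illustrative names, not part of this commit:

```python
from datetime import datetime, timedelta

def crawl_category(make_url, oldest_date_on, max_age_days=183):
    """Walk numbered index pages until the oldest article on a page
    falls outside the max_age_days window."""
    cutoff = datetime.today() - timedelta(days=max_age_days)
    i = 0
    while True:
        page_url = make_url(i)             # home.htm for i == 0, home_{i}.htm after
        oldest = oldest_date_on(page_url)  # scrape the page, return its oldest date
        if oldest is None or oldest < cutoff:
            break                          # reached stale articles: stop paging
        i += 1

# e.g. crawl_category(
#     lambda i: "https://www.gov.cn/zhengce/jiedu/home.htm" if i == 0
#               else f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm",
#     oldest_date_on)
```

Keeping the stop condition in one place would make the paging behavior identical across the four crawlers in this commit.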
mof.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
mof.py CHANGED
@@ -115,8 +115,12 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"]
-for categoryu_url in categoryu_urls:
+i = 0
+while i > -1:
+    if i == 0:
+        categoryu_url = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+    else:
+        categoryu_url = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
     req = urllib.request.urlopen(categoryu_url)
     text = req.read()
     html_text = text.decode("utf-8")
@@ -148,7 +152,7 @@ for categoryu_url in categoryu_urls:
                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
                         article['title'] = translator.translate(article['originalTitle'], dest='en').text
                         article['url'] = url
-                        article['category'] = "Finance News"
+                        article['category'] = "Financial News"
                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
                         label_dict = {
@@ -177,70 +181,74 @@ for categoryu_url in categoryu_urls:
                     except Exception as error:
                         print(error)
 
-categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"]
-for categoryu_url in categoryu_urls:
-    req = urllib.request.urlopen(categoryu_url)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime > (datetime.today() - timedelta(days=183)):
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace("./", categoryu_url)
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                        content_eng = ''
-                        for element in article['originalContent'].split("。"):
-                            content_eng += translator.translate(element, dest='en').text + ' '
-                        article['content'] = content_eng
-                        article['site'] = "Ministry of Finance"
-                        article['originalSite'] = "财政部"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translator.translate(article['originalTitle'], dest='en').text
-                        article['url'] = url
-                        article['category'] = "Policy Release"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
-                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                        label_dict = {
-                            "positive": "+",
-                            "negative": "-",
-                            "neutral": "0",
-                        }
-                        sentiment_score = 0
-                        maximum_value = 0
-                        raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
-                        sentiment_label = None
-                        for sentiment_dict in raw_sentiment[0]:
-                            value = sentiment_dict["score"]
-                            if value > maximum_value:
-                                sentiment_label = sentiment_dict["label"]
-                                maximum_value = value
-                            if sentiment_dict["label"] == "positive":
-                                sentiment_score = sentiment_score + value
-                            if sentiment_dict["label"] == "negative":
-                                sentiment_score = sentiment_score - value
-                            else:
-                                sentiment_score = sentiment_score + 0
-                        article['sentimentScore'] = sentiment_score
-                        article['sentimentLabel'] = label_dict[sentiment_label]
-                        upsert_content(article)
-                    except Exception as error:
-                        print(error)
+# categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"]
+# for categoryu_url in categoryu_urls:
+#     req = urllib.request.urlopen(categoryu_url)
+#     text = req.read()
+#     html_text = text.decode("utf-8")
+#     page = etree.HTML(html_text)
+#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+#     for article in articlelist:
+#         if isinstance(article, etree._Element):
+#             subelement = etree.tostring(article).decode()
+#             subpage = etree.HTML(subelement)
+#             date = subpage.xpath("//span/text()")[0]
+#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+#             if parsed_datetime > (datetime.today() - timedelta(days=183)):
+#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+#                 for url in urls:
+#                     try:
+#                         article = {}
+#                         url = url.replace("./", categoryu_url)
+#                         req = urllib.request.urlopen(url)
+#                         text = req.read()
+#                         html_text = text.decode("utf-8")
+#                         page = etree.HTML(html_text)
+#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+#                         content_eng = ''
+#                         for element in article['originalContent'].split("。"):
+#                             content_eng += translator.translate(element, dest='en').text + ' '
+#                         article['content'] = content_eng
+#                         article['site'] = "Ministry of Finance"
+#                         article['originalSite'] = "财政部"
+#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+#                         article['title'] = translator.translate(article['originalTitle'], dest='en').text
+#                         article['url'] = url
+#                         article['category'] = "Policy Release"
+#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
+#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+#                         label_dict = {
+#                             "positive": "+",
+#                             "negative": "-",
+#                             "neutral": "0",
+#                         }
+#                         sentiment_score = 0
+#                         maximum_value = 0
+#                         raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+#                         sentiment_label = None
+#                         for sentiment_dict in raw_sentiment[0]:
+#                             value = sentiment_dict["score"]
+#                             if value > maximum_value:
+#                                 sentiment_label = sentiment_dict["label"]
+#                                 maximum_value = value
+#                             if sentiment_dict["label"] == "positive":
+#                                 sentiment_score = sentiment_score + value
+#                             if sentiment_dict["label"] == "negative":
+#                                 sentiment_score = sentiment_score - value
+#                             else:
+#                                 sentiment_score = sentiment_score + 0
+#                         article['sentimentScore'] = sentiment_score
+#                         article['sentimentLabel'] = label_dict[sentiment_label]
+#                         upsert_content(article)
+#                     except Exception as error:
+#                         print(error)
 
-categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"]
-for categoryu_url in categoryu_urls:
+i = 0
+while i > -1:
+    if i == 0:
+        categoryu_url = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+    else:
+        categoryu_url = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
     req = urllib.request.urlopen(categoryu_url)
     text = req.read()
     html_text = text.decode("utf-8")
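
Note: the FinBERT scoring block (from `label_dict` down to `label_dict[sentiment_label]`) is repeated verbatim in gov.py, mof.py, and ndrc.py. A sketch of that same logic factored into a single helper; `compute_sentiment` is a hypothetical name, not something this commit defines:

```python
from transformers import pipeline

analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
LABEL_DICT = {"positive": "+", "negative": "-", "neutral": "0"}

def compute_sentiment(text):
    """Score the first 512 characters; return (signed score, compact label)."""
    scores = analyzer(text[:512], return_all_scores=True)[0]
    best = max(scores, key=lambda d: d["score"])  # highest-probability class
    signed = 0.0
    for d in scores:
        if d["label"] == "positive":
            signed += d["score"]      # add positive probability
        elif d["label"] == "negative":
            signed -= d["score"]      # subtract negative probability
    return signed, LABEL_DICT[best["label"]]
```

Each crawler could then replace its ~20-line inline block with `article['sentimentScore'], article['sentimentLabel'] = compute_sentiment(article['content'])`.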
ndrc.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
ndrc.py CHANGED
@@ -120,78 +120,82 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/zcfb/fzggwl/", "https://www.ndrc.gov.cn/xxgk/zcfb/ghxwj/","https://www.ndrc.gov.cn/xxgk/zcfb/ghwb/","https://www.ndrc.gov.cn/xxgk/zcfb/gg/","https://www.ndrc.gov.cn/xxgk/zcfb/tz/","https://www.ndrc.gov.cn/xxgk/zcfb/pifu/","https://www.ndrc.gov.cn/xxgk/zcfb/qt/"]
-for categoryu_url in categoryu_urls:
-    req = urllib.request.urlopen(categoryu_url)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
-            if parsed_datetime > (datetime.today() - timedelta(days=183)):
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        if "/jd/jd" in url:
-                            url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/")
-                        else:
-                            url = url.replace("./", categoryu_url)
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        attachment_urls = page.xpath("//div[contains(@class, 'attachment_r')]//a/@href")
-                        for attachment_url in attachment_urls:
-                            if ".pdf" in attachment_url:
-                                pdf_url = url.rsplit('/', 1)[0] + attachment_url.replace('./','/')
-                                pdf_content = extract_from_pdf(pdf_url)
-                                article['originalContent'] = pdf_content
-                                content_eng = ''
-                                for element in article['originalContent'].split("。"):
-                                    content_eng += translator.translate(element, dest='en').text + ' '
-                                article['content'] = content_eng
-                        article['site'] = "National Development and Reform Commission"
-                        article['originalSite'] = "国家发展和改革委员会"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translator.translate(article['originalTitle'], dest='en').text
-                        article['url'] = url
-                        article['category'] = "Policy Release"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
-                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                        label_dict = {
-                            "positive": "+",
-                            "negative": "-",
-                            "neutral": "0",
-                        }
-                        sentiment_score = 0
-                        maximum_value = 0
-                        raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
-                        sentiment_label = None
-                        for sentiment_dict in raw_sentiment[0]:
-                            value = sentiment_dict["score"]
-                            if value > maximum_value:
-                                sentiment_label = sentiment_dict["label"]
-                                maximum_value = value
-                            if sentiment_dict["label"] == "positive":
-                                sentiment_score = sentiment_score + value
-                            if sentiment_dict["label"] == "negative":
-                                sentiment_score = sentiment_score - value
-                            else:
-                                sentiment_score = sentiment_score + 0
-                        article['sentimentScore'] = sentiment_score
-                        article['sentimentLabel'] = label_dict[sentiment_label]
-                        upsert_content(article)
-                    except Exception as error:
-                        print(error)
+# categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/zcfb/fzggwl/", "https://www.ndrc.gov.cn/xxgk/zcfb/ghxwj/","https://www.ndrc.gov.cn/xxgk/zcfb/ghwb/","https://www.ndrc.gov.cn/xxgk/zcfb/gg/","https://www.ndrc.gov.cn/xxgk/zcfb/tz/","https://www.ndrc.gov.cn/xxgk/zcfb/pifu/","https://www.ndrc.gov.cn/xxgk/zcfb/qt/"]
+# for categoryu_url in categoryu_urls:
+#     req = urllib.request.urlopen(categoryu_url)
+#     text = req.read()
+#     html_text = text.decode("utf-8")
+#     page = etree.HTML(html_text)
+#     articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
+#     for article in articlelist:
+#         if isinstance(article, etree._Element):
+#             subelement = etree.tostring(article).decode()
+#             subpage = etree.HTML(subelement)
+#             date = subpage.xpath("//span/text()")[0]
+#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
+#             if parsed_datetime > (datetime.today() - timedelta(days=183)):
+#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+#                 for url in urls:
+#                     try:
+#                         article = {}
+#                         if "/jd/jd" in url:
+#                             url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/")
+#                         else:
+#                             url = url.replace("./", categoryu_url)
+#                         req = urllib.request.urlopen(url)
+#                         text = req.read()
+#                         html_text = text.decode("utf-8")
+#                         page = etree.HTML(html_text)
+#                         attachment_urls = page.xpath("//div[contains(@class, 'attachment_r')]//a/@href")
+#                         for attachment_url in attachment_urls:
+#                             if ".pdf" in attachment_url:
+#                                 pdf_url = url.rsplit('/', 1)[0] + attachment_url.replace('./','/')
+#                                 pdf_content = extract_from_pdf(pdf_url)
+#                                 article['originalContent'] = pdf_content
+#                                 content_eng = ''
+#                                 for element in article['originalContent'].split("。"):
+#                                     content_eng += translator.translate(element, dest='en').text + ' '
+#                                 article['content'] = content_eng
+#                         article['site'] = "National Development and Reform Commission"
+#                         article['originalSite'] = "国家发展和改革委员会"
+#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+#                         article['title'] = translator.translate(article['originalTitle'], dest='en').text
+#                         article['url'] = url
+#                         article['category'] = "Policy Release"
+#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
+#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+#                         label_dict = {
+#                             "positive": "+",
+#                             "negative": "-",
+#                             "neutral": "0",
+#                         }
+#                         sentiment_score = 0
+#                         maximum_value = 0
+#                         raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
+#                         sentiment_label = None
+#                         for sentiment_dict in raw_sentiment[0]:
+#                             value = sentiment_dict["score"]
+#                             if value > maximum_value:
+#                                 sentiment_label = sentiment_dict["label"]
+#                                 maximum_value = value
+#                             if sentiment_dict["label"] == "positive":
+#                                 sentiment_score = sentiment_score + value
+#                             if sentiment_dict["label"] == "negative":
+#                                 sentiment_score = sentiment_score - value
+#                             else:
+#                                 sentiment_score = sentiment_score + 0
+#                         article['sentimentScore'] = sentiment_score
+#                         article['sentimentLabel'] = label_dict[sentiment_label]
+#                         upsert_content(article)
+#                     except Exception as error:
+#                         print(error)
 
-categoryu_urls = ["https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"]
-for categoryu_url in categoryu_urls:
+i = 0
+while i > -1:
+    if i == 0:
+        categoryu_url = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
+    else:
+        categoryu_url = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
     req = urllib.request.urlopen(categoryu_url)
     text = req.read()
     html_text = text.decode("utf-8")
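
Note: the commented-out NDRC crawler builds PDF attachment URLs by string splicing (`url.rsplit('/', 1)[0] + attachment_url.replace('./','/')`), and it stores the return value of `extract_from_pdf` in a single field even though the gov.py version of that function returns a (Chinese, English) tuple; if ndrc.py's copy does the same, the assignment would need unpacking when the block is revived. For the URL part, `urllib.parse.urljoin` resolves `./` and `../` links in one step; `resolve_attachment` below is an illustrative helper, not part of the commit:

```python
from urllib.parse import urljoin

def resolve_attachment(article_url, attachment_href):
    """Resolve a relative attachment link against the article page URL."""
    return urljoin(article_url, attachment_href)

# e.g. resolve_attachment("https://www.ndrc.gov.cn/xxgk/zcfb/tz/page.html", "./doc.pdf")
# -> "https://www.ndrc.gov.cn/xxgk/zcfb/tz/doc.pdf"
```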
pbc.py CHANGED
@@ -120,9 +120,14 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-reportList = []
-categoryu_urls = ["http://www.pbc.gov.cn/rmyh/3963412/index.html"]
-for categoryu_url in categoryu_urls:
+i = 0
+while i > -1:
+    if i == 0:
+        categoryu_url = "http://www.pbc.gov.cn/rmyh/3963412/3963426/index.html"
+    else:
+        j = i + 1
+        categoryu_url = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{j}.html"
+
     response = requests.get(categoryu_url)
     page = etree.HTML(response.text)
     urls = page.xpath("//td[contains(@height,'22')]//a[contains(@target, '_blank')]/@href")
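
Note: the pbc.py hunk only changes how the index URLs are built (because of `j = i + 1`, the first paginated page fetched is index_2.html); the context lines still call `requests.get` with no status check or delay between requests. A guarded fetch in the same style; `fetch_index` and the pause value are assumptions, not part of the commit:

```python
import time
import requests

def fetch_index(url, pause=1.0):
    """GET one index page; return its HTML, or None on a non-200 response."""
    response = requests.get(url, timeout=30)
    time.sleep(pause)  # brief pause between requests to avoid hammering the site
    if response.status_code == 200:
        return response.text
    return None
```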