OxbridgeEconomics committed on
Commit
4a8b338
·
1 Parent(s): 48eb6ea
Files changed (3) hide show
  1. .gitignore +2 -1
  2. gov.py +42 -200
  3. utils.py +146 -0
.gitignore CHANGED
@@ -1 +1,2 @@
1
- ./env
 
 
1
+ env
2
+ __pycache__
gov.py CHANGED
@@ -1,134 +1,18 @@
1
- import requests
2
  from datetime import datetime, timedelta
3
- from decimal import Decimal
4
- import boto3
5
  import uuid
6
  import time
7
  import urllib.request
8
  from lxml import etree
9
- from googletrans import Translator
10
- from transformers import pipeline
11
- from PyPDF2 import PdfReader
12
- import os
13
-
14
- # AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
15
- # AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
16
- AWS_ACCESS_KEY_ID="AKIAQFXZMGHQYXKWUDWR"
17
- AWS_SECRET_ACCESS_KEY="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"
18
-
19
- analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
20
-
21
- translator = Translator()
22
-
23
- def datemodifier(date_string):
24
- """Date Modifier Function"""
25
- try:
26
- to_date = time.strptime(date_string,"%Y-%m-%d-%H:%M:%S")
27
- return time.strftime("%Y-%m-%d",to_date)
28
- except:
29
- return False
30
-
31
- def fetch_url(url):
32
- response = requests.get(url)
33
- if response.status_code == 200:
34
- return response.text
35
- else:
36
- return None
37
-
38
- def translist(infolist):
39
- """Translist Function"""
40
- out = list(filter(lambda s: s and
41
- (isinstance (s,str) or len(s.strip()) > 0), [i.strip() for i in infolist]))
42
- return out
43
-
44
- def encode(content):
45
- """Encode Function"""
46
- text = ''
47
- for element in content[:1]:
48
- if isinstance(element, etree._Element):
49
- subelement = etree.tostring(element).decode()
50
- subpage = etree.HTML(subelement)
51
- tree = subpage.xpath('//text()')
52
- line = ''.join(translist(tree)).\
53
- replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
54
- else:
55
- line = element
56
- text += line
57
- index = text.find('打印本页')
58
- if index != -1:
59
- text = text[:index]
60
-
61
- return text
62
-
63
- def extract_from_pdf(url):
64
- # Send a GET request to the URL and retrieve the PDF content
65
- response = requests.get(url)
66
- pdf_content = response.content
67
-
68
- # Save the PDF content to a local file
69
- with open("downloaded_file.pdf", "wb") as f:
70
- f.write(pdf_content)
71
-
72
- # Open the downloaded PDF file and extract the text
73
- with open("downloaded_file.pdf", "rb") as f:
74
- pdf_reader = PdfReader(f)
75
- num_pages = len(pdf_reader.pages)
76
- extracted_text = ""
77
- extracted_text_eng = ""
78
- for page in range(num_pages):
79
- text = pdf_reader.pages[page].extract_text()
80
- if text and text[0].isdigit():
81
- text = text[1:]
82
- first_newline_index = text.find('\n')
83
- text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:].replace('\n', '')
84
- extracted_text_eng += translator.translate(text, dest='en').text
85
- extracted_text += text
86
- return extracted_text, extracted_text_eng
87
-
88
- def get_db_connection():
89
- """Get dynamoDB connection"""
90
- dynamodb = boto3.resource(
91
- service_name='dynamodb',
92
- region_name='us-east-1',
93
- aws_access_key_id=AWS_ACCESS_KEY_ID,
94
- aws_secret_access_key=AWS_SECRET_ACCESS_KEY
95
- )
96
- return dynamodb
97
-
98
- def upsert_content(report):
99
- """Upsert the content records"""
100
- dynamodb = get_db_connection()
101
- table = dynamodb.Table('article_china')
102
- # Define the item data
103
- item = {
104
- 'id': str(report['id']),
105
- 'site': report['site'],
106
- 'title': report['title'],
107
- # 'originalSite': report['originalSite'],
108
- # 'originalTitle': report['originalTitle'],
109
- # 'originalContent': report['originalContent'],
110
- 'category': report['category'],
111
- # 'author': report['author'],
112
- 'content': report['content'],
113
- 'publishDate': report['publishDate'],
114
- 'link': report['url'],
115
- # 'attachment': report['reporturl'],
116
- # 'authorID': str(report['authorid']),
117
- 'sentimentScore': str(Decimal(report['sentimentScore']).quantize(Decimal('0.01'))),
118
- 'sentimentLabel': report['sentimentLabel'],
119
- 'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
120
- }
121
- response = table.put_item(Item=item)
122
- print(response)
123
 
124
  i = 0
125
  while i > -1:
126
  if i == 0:
127
- categoryu_url = "https://www.gov.cn/zhengce/jiedu/home.htm"
128
  else:
129
- categoryu_url = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
130
  i = i + 1
131
- req = urllib.request.urlopen(categoryu_url)
132
  text = req.read()
133
  html_text = text.decode("utf-8")
134
  page = etree.HTML(html_text)
@@ -148,46 +32,25 @@ while i > -1:
148
  article = {}
149
  url = url.replace('../', 'https://www.gov.cn/zhengce/')
150
  if "https://www.gov.cn" in url:
151
- req = urllib.request.urlopen(url)
152
- text = req.read()
153
- html_text = text.decode("utf-8")
154
- page = etree.HTML(html_text)
155
- article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
156
- content_eng = ''
157
- for element in article['originalContent'].split("。"):
158
- content_eng += translator.translate(element, dest='en').text + ' '
159
- article['content'] = content_eng
160
- article['site'] = "State Council"
161
- article['originalSite'] = "国务院"
162
- article['originalTitle'] = page.xpath("//title/text()")[0]
163
- article['title'] = translator.translate(article['originalTitle'], dest='en').text
164
- article['url'] = url
165
- article['category']= "Policy Interpretation"
166
- article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0])
167
- article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
168
- label_dict = {
169
- "positive": "+",
170
- "negative": "-",
171
- "neutral": "0",
172
- }
173
- sentiment_score = 0
174
- maximum_value = 0
175
- raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
176
- sentiment_label = None
177
- for sentiment_dict in raw_sentiment[0]:
178
- value = sentiment_dict["score"]
179
- if value > maximum_value:
180
- sentiment_label = sentiment_dict["label"]
181
- maximum_value = value
182
- if sentiment_dict["label"] == "positive":
183
- sentiment_score = sentiment_score + value
184
- if sentiment_dict["label"] == "negative":
185
- sentiment_score = sentiment_score - value
186
- else:
187
- sentiment_score = sentiment_score + 0
188
- article['sentimentScore'] = sentiment_score
189
- article['sentimentLabel'] = label_dict[sentiment_label]
190
- upsert_content(article)
191
  except Exception as error:
192
  print(error)
193
 
@@ -218,45 +81,24 @@ while i > -1:
218
  article = {}
219
  url = url.replace('../', 'https://www.gov.cn/zhengce/')
220
  if "https://www.gov.cn" in url:
221
- req = urllib.request.urlopen(url)
222
- text = req.read()
223
- html_text = text.decode("utf-8")
224
- page = etree.HTML(html_text)
225
- article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
226
- content_eng = ''
227
- for element in article['originalContent'].split("。"):
228
- content_eng += translator.translate(element, dest='en').text + ' '
229
- article['content'] = content_eng
230
- article['site'] = "State Council"
231
- article['originalSite'] = "国务院"
232
- article['originalTitle'] = page.xpath("//title/text()")[0]
233
- article['title'] = translator.translate(article['originalTitle'], dest='en').text
234
- article['url'] = url
235
- article['category']= "Policy Release"
236
- article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0])
237
- article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
238
- label_dict = {
239
- "positive": "+",
240
- "negative": "-",
241
- "neutral": "0",
242
- }
243
- sentiment_score = 0
244
- maximum_value = 0
245
- raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
246
- sentiment_label = None
247
- for sentiment_dict in raw_sentiment[0]:
248
- value = sentiment_dict["score"]
249
- if value > maximum_value:
250
- sentiment_label = sentiment_dict["label"]
251
- maximum_value = value
252
- if sentiment_dict["label"] == "positive":
253
- sentiment_score = sentiment_score + value
254
- if sentiment_dict["label"] == "negative":
255
- sentiment_score = sentiment_score - value
256
- else:
257
- sentiment_score = sentiment_score + 0
258
- article['sentimentScore'] = sentiment_score
259
- article['sentimentLabel'] = label_dict[sentiment_label]
260
- upsert_content(article)
261
  except Exception as error:
262
  print(error)
 
 
1
  from datetime import datetime, timedelta
 
 
2
  import uuid
3
  import time
4
  import urllib.request
5
  from lxml import etree
6
+ from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  i = 0
9
  while i > -1:
10
  if i == 0:
11
+ CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
12
  else:
13
+ CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
14
  i = i + 1
15
+ req = urllib.request.urlopen(CATEGORY_URL)
16
  text = req.read()
17
  html_text = text.decode("utf-8")
18
  page = etree.HTML(html_text)
 
32
  article = {}
33
  url = url.replace('../', 'https://www.gov.cn/zhengce/')
34
  if "https://www.gov.cn" in url:
35
+ req = urllib.request.urlopen(url)
36
+ text = req.read()
37
+ html_text = text.decode("utf-8")
38
+ page = etree.HTML(html_text)
39
+ article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
40
+ CONTENT_ENG = ''
41
+ for element in article['originalContent'].split("。"):
42
+ CONTENT_ENG += translate(element) + ' '
43
+ article['content'] = CONTENT_ENG
44
+ article['site'] = "State Council"
45
+ article['originalSite'] = "国务院"
46
+ article['originalTitle'] = page.xpath("//title/text()")[0]
47
+ article['title'] = translate(article['originalTitle'])
48
+ article['url'] = url
49
+ article['category']= "Policy Interpretation"
50
+ article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0], "%Y-%m-%d-%H:%M:%S")
51
+ article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
52
+ article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
53
+ upsert_content(article)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  except Exception as error:
55
  print(error)
56
 
 
81
  article = {}
82
  url = url.replace('../', 'https://www.gov.cn/zhengce/')
83
  if "https://www.gov.cn" in url:
84
+ req = urllib.request.urlopen(url)
85
+ text = req.read()
86
+ html_text = text.decode("utf-8")
87
+ page = etree.HTML(html_text)
88
+ article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
89
+ CONTENT_ENG = ''
90
+ for element in article['originalContent'].split("。"):
91
+ CONTENT_ENG += translate(article['originalContent']) + ' '
92
+ article['content'] = CONTENT_ENG
93
+ article['site'] = "State Council"
94
+ article['originalSite'] = "国务院"
95
+ article['originalTitle'] = page.xpath("//title/text()")[0]
96
+ article['title'] = translate(article['originalTitle'])
97
+ article['url'] = url
98
+ article['category']= "Policy Release"
99
+ article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0], "%Y-%m-%d-%H:%M:%S")
100
+ article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
101
+ article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
102
+ upsert_content(article)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  except Exception as error:
104
  print(error)
utils.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Utility functions shared by the gov.cn scrapers."""
import os
import time
from datetime import datetime
from decimal import Decimal
import requests
import boto3
from lxml import etree
from googletrans import Translator
from transformers import pipeline
from PyPDF2 import PdfReader

# SECURITY: credentials must come from the environment, never from source
# control.  The key pair previously hard-coded here was committed to the
# repository and must be treated as compromised — rotate it in AWS IAM.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')

# FinBERT sentiment pipeline, loaded once at import time (downloads the
# model on first use, so importing this module is heavyweight).
analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# Shared googletrans client used by translate() and extract_from_pdf().
translator = Translator()
20
+
def translate(text):
    """Return the English translation of *text* via the shared googletrans client."""
    translated = translator.translate(text, dest='en')
    return translated.text
23
+
def datemodifier(date_string, date_format):
    """Normalize *date_string* (given in *date_format*) to 'YYYY-MM-DD'.

    Returns the reformatted date string, or False when *date_string*
    cannot be parsed with *date_format* (callers treat False as "skip").
    """
    try:
        parsed = time.strptime(date_string, date_format)
        return time.strftime("%Y-%m-%d", parsed)
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` so that unrelated errors
        # (e.g. KeyboardInterrupt, SystemExit) are no longer swallowed.
        return False
31
+
def fetch_url(url, timeout=30):
    """GET *url* and return the response body as text.

    Returns None for any non-200 status.  *timeout* (seconds, default 30)
    guards against the request hanging forever — the original call passed
    no timeout at all, which blocks indefinitely on a stalled server.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
38
+
def translist(infolist):
    """Strip every entry of *infolist* and drop the ones left empty."""
    stripped = (item.strip() for item in infolist)
    return [entry for entry in stripped if entry]
44
+
def encode(content):
    """Encode Function"""
    # Flattens an lxml element (or raw string) list into one cleaned text
    # string, truncating at the '打印本页' ("print this page") marker that
    # gov.cn appends after the article body.
    text = ''
    # NOTE(review): only the first element is processed (content[:1]) —
    # presumably the XPath yields a single container node; confirm the
    # remaining elements are safe to ignore.
    for element in content[:1]:
        if isinstance(element, etree._Element):
            # Serialize and re-parse the element so //text() collects every
            # nested text node, then strip newlines/tabs/spaces.
            subelement = etree.tostring(element).decode()
            subpage = etree.HTML(subelement)
            tree = subpage.xpath('//text()')
            line = ''.join(translist(tree)).\
                replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
        else:
            # Non-element entries are assumed to already be plain strings.
            line = element
        text += line
    # Cut everything from the "print this page" boilerplate onwards.
    index = text.find('打印本页')
    if index != -1:
        text = text[:index]

    return text
63
+
def extract_from_pdf(url):
    """Download the PDF at *url* and return (original_text, english_text).

    *original_text* is the concatenated page text (with a leading
    page-number digit stripped from each page); *english_text* is its
    googletrans translation, accumulated page by page.
    """
    import io  # local import: only needed here, keeps module deps unchanged

    # Fetch the PDF and parse it straight from memory.  PdfReader accepts
    # any binary file-like object, so the previous write-to-cwd temp file
    # ("downloaded_file.pdf" — race-prone and left behind) is unnecessary.
    response = requests.get(url)
    pdf_reader = PdfReader(io.BytesIO(response.content))

    extracted_text = ""
    extracted_text_eng = ""
    for page in pdf_reader.pages:
        text = page.extract_text()
        if text and text[0].isdigit():
            # Drop the leading page-number digit.
            text = text[1:]
        # Keep the first line (title) separated by a space; join the rest
        # of the lines into one continuous paragraph.
        first_newline_index = text.find('\n')
        text = text[:first_newline_index + 1].replace('\n', ' ') + \
            text[first_newline_index + 1:].replace('\n', '')
        extracted_text_eng += translator.translate(text, dest='en').text
        extracted_text += text
    return extracted_text, extracted_text_eng
88
+
def get_db_connection():
    """Build and return a DynamoDB resource handle for us-east-1."""
    return boto3.resource(
        service_name='dynamodb',
        region_name='us-east-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
98
+
def sentiment_computation(content):
    """Score *content* with FinBERT and return (score, label_symbol).

    The score is the positive-class probability minus the negative-class
    probability (neutral contributes nothing); the symbol is '+', '-' or
    '0' for whichever class has the highest probability.
    """
    label_dict = {
        "positive": "+",
        "negative": "-",
        "neutral": "0",
    }
    sentiment_score = 0
    maximum_value = 0
    # Truncate to 512 characters to stay within the model's input limit.
    raw_sentiment = analyzer(content[:512], return_all_scores=True)
    sentiment_label = None
    for sentiment_dict in raw_sentiment[0]:
        value = sentiment_dict["score"]
        # Track the highest-probability class for the label symbol.
        if value > maximum_value:
            sentiment_label = sentiment_dict["label"]
            maximum_value = value
        # Net score: add positive, subtract negative, ignore neutral.
        # (Rewritten from a dangling if/if/else in which the `else`
        # silently paired with the second `if` — same results, but the
        # original shape invited a future misedit.)
        if sentiment_dict["label"] == "positive":
            sentiment_score += value
        elif sentiment_dict["label"] == "negative":
            sentiment_score -= value
    return sentiment_score, label_dict[sentiment_label]
121
+
def upsert_content(report):
    """Upsert the content records"""
    # Writes one article record into the 'article_china' DynamoDB table.
    # put_item overwrites any existing item with the same primary key,
    # which is what makes this an upsert.
    dynamodb = get_db_connection()
    table = dynamodb.Table('article_china')
    # Define the item data
    item = {
        'id': str(report['id']),
        'site': report['site'],
        'title': report['title'],
        # 'originalSite': report['originalSite'],
        # 'originalTitle': report['originalTitle'],
        # 'originalContent': report['originalContent'],
        'category': report['category'],
        # 'author': report['author'],
        'content': report['content'],
        'publishDate': report['publishDate'],
        'link': report['url'],
        # 'attachment': report['reporturl'],
        # 'authorID': str(report['authorid']),
        # DynamoDB rejects Python floats; quantize to two decimal places
        # and store as a string.
        'sentimentScore': str(Decimal(report['sentimentScore']).quantize(Decimal('0.01'))),
        'sentimentLabel': report['sentimentLabel'],
        'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    }
    response = table.put_item(Item=item)
    print(response)