OxbridgeEconomics
committed on
Commit
·
b2a3d45
1
Parent(s):
f801221
commit
Browse files
.gitignore
CHANGED
@@ -1,3 +1,5 @@
|
|
1 |
env
|
2 |
__pycache__
|
3 |
-
downloaded_file.pdf
|
|
|
|
|
|
1 |
env
|
2 |
__pycache__
|
3 |
+
downloaded_file.pdf
|
4 |
+
downloaded_file.docx
|
5 |
+
downloaded_file.doc
|
cbirc.py
CHANGED
@@ -26,7 +26,7 @@ while i > -1:
|
|
26 |
for element in article['originalContent'].split("。"):
|
27 |
CONTENT_ENG += translate(element) + ' '
|
28 |
article['content'] = CONTENT_ENG
|
29 |
-
article['site'] = "National Financial Regulatory Administration"
|
30 |
article['originalSite'] = "国家金融监督管理总局"
|
31 |
article['originalTitle'] = article['docSubtitle']
|
32 |
article['title'] = translate(article['originalTitle'])
|
@@ -62,7 +62,7 @@ while i > -1:
|
|
62 |
for element in article['originalContent'].split("。"):
|
63 |
CONTENT_ENG += translate(element) + ' '
|
64 |
article['content'] = CONTENT_ENG
|
65 |
-
article['site'] = "State Taxation Administration"
|
66 |
article['originalSite'] = "国家税务总局"
|
67 |
article['originalTitle'] = article['title']
|
68 |
article['title'] = translate(article['originalTitle'])
|
|
|
26 |
for element in article['originalContent'].split("。"):
|
27 |
CONTENT_ENG += translate(element) + ' '
|
28 |
article['content'] = CONTENT_ENG
|
29 |
+
article['site'] = "National Financial Regulatory Administration of China"
|
30 |
article['originalSite'] = "国家金融监督管理总局"
|
31 |
article['originalTitle'] = article['docSubtitle']
|
32 |
article['title'] = translate(article['originalTitle'])
|
|
|
62 |
for element in article['originalContent'].split("。"):
|
63 |
CONTENT_ENG += translate(element) + ' '
|
64 |
article['content'] = CONTENT_ENG
|
65 |
+
article['site'] = "State Taxation Administration of China"
|
66 |
article['originalSite'] = "国家税务总局"
|
67 |
article['originalTitle'] = article['title']
|
68 |
article['title'] = translate(article['originalTitle'])
|
chinatax.py
CHANGED
@@ -32,7 +32,7 @@ while i > -1:
|
|
32 |
for element in article['originalContent'].split("。"):
|
33 |
CONTENT_ENG += translate(element) + ' '
|
34 |
article['content'] = CONTENT_ENG
|
35 |
-
article['site'] = "State Taxation Administration"
|
36 |
article['originalSite'] = "国家税务总局"
|
37 |
article['originalTitle'] = article['title']
|
38 |
article['title'] = translate(article['originalTitle'])
|
@@ -80,7 +80,7 @@ while i > -1:
|
|
80 |
for element in article['originalContent'].split("。"):
|
81 |
CONTENT_ENG += translate(element) + ' '
|
82 |
article['content'] = CONTENT_ENG
|
83 |
-
article['site'] = "State Taxation Administration"
|
84 |
article['originalSite'] = "国家税务总局"
|
85 |
article['originalTitle'] = article['title']
|
86 |
article['title'] = translate(article['originalTitle'])
|
|
|
32 |
for element in article['originalContent'].split("。"):
|
33 |
CONTENT_ENG += translate(element) + ' '
|
34 |
article['content'] = CONTENT_ENG
|
35 |
+
article['site'] = "State Taxation Administration of China"
|
36 |
article['originalSite'] = "国家税务总局"
|
37 |
article['originalTitle'] = article['title']
|
38 |
article['title'] = translate(article['originalTitle'])
|
|
|
80 |
for element in article['originalContent'].split("。"):
|
81 |
CONTENT_ENG += translate(element) + ' '
|
82 |
article['content'] = CONTENT_ENG
|
83 |
+
article['site'] = "State Taxation Administration of China"
|
84 |
article['originalSite'] = "国家税务总局"
|
85 |
article['originalTitle'] = article['title']
|
86 |
article['title'] = translate(article['originalTitle'])
|
csrc.py
CHANGED
@@ -43,7 +43,7 @@ while i > -1:
|
|
43 |
for element in article['originalContent'].split("。"):
|
44 |
CONTENT_ENG += translate(element) + ' '
|
45 |
article['content'] = CONTENT_ENG
|
46 |
-
article['site'] = "Securities Regulatory Commission"
|
47 |
article['originalSite'] = "证监会"
|
48 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
49 |
article['title'] = translate(article['originalTitle'])
|
@@ -75,7 +75,7 @@ while i > -1:
|
|
75 |
for element in article['originalContent'].split("。"):
|
76 |
CONTENT_ENG += translate(element) + ' '
|
77 |
article['content'] = CONTENT_ENG
|
78 |
-
article['site'] = "Securities Regulatory Commission"
|
79 |
article['originalSite'] = "证监会"
|
80 |
article['originalTitle'] = article['title']
|
81 |
article['title'] = translate(article['originalTitle'])
|
|
|
43 |
for element in article['originalContent'].split("。"):
|
44 |
CONTENT_ENG += translate(element) + ' '
|
45 |
article['content'] = CONTENT_ENG
|
46 |
+
article['site'] = "Securities Regulatory Commission of China"
|
47 |
article['originalSite'] = "证监会"
|
48 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
49 |
article['title'] = translate(article['originalTitle'])
|
|
|
75 |
for element in article['originalContent'].split("。"):
|
76 |
CONTENT_ENG += translate(element) + ' '
|
77 |
article['content'] = CONTENT_ENG
|
78 |
+
article['site'] = "Securities Regulatory Commission of China"
|
79 |
article['originalSite'] = "证监会"
|
80 |
article['originalTitle'] = article['title']
|
81 |
article['title'] = translate(article['originalTitle'])
|
gov.py
CHANGED
@@ -43,7 +43,7 @@ while i > -1:
|
|
43 |
for element in article['originalContent'].split("。"):
|
44 |
CONTENT_ENG += translate(element) + ' '
|
45 |
article['content'] = CONTENT_ENG
|
46 |
-
article['site'] = "State Council"
|
47 |
article['originalSite'] = "国务院"
|
48 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
49 |
article['title'] = translate(article['originalTitle'])
|
@@ -94,7 +94,7 @@ while i > -1:
|
|
94 |
for element in article['originalContent'].split("。"):
|
95 |
CONTENT_ENG += translate(article['originalContent']) + ' '
|
96 |
article['content'] = CONTENT_ENG
|
97 |
-
article['site'] = "State Council"
|
98 |
article['originalSite'] = "国务院"
|
99 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
100 |
article['title'] = translate(article['originalTitle'])
|
|
|
43 |
for element in article['originalContent'].split("。"):
|
44 |
CONTENT_ENG += translate(element) + ' '
|
45 |
article['content'] = CONTENT_ENG
|
46 |
+
article['site'] = "State Council of China"
|
47 |
article['originalSite'] = "国务院"
|
48 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
49 |
article['title'] = translate(article['originalTitle'])
|
|
|
94 |
for element in article['originalContent'].split("。"):
|
95 |
CONTENT_ENG += translate(article['originalContent']) + ' '
|
96 |
article['content'] = CONTENT_ENG
|
97 |
+
article['site'] = "State Council of China"
|
98 |
article['originalSite'] = "国务院"
|
99 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
100 |
article['title'] = translate(article['originalTitle'])
|
mof.py
CHANGED
@@ -43,7 +43,7 @@ while i > -1:
|
|
43 |
for element in article['originalContent'].split("。"):
|
44 |
CONTENT_ENG += translate(element) + ' '
|
45 |
article['content'] = CONTENT_ENG
|
46 |
-
article['site'] = "Ministry of Finance"
|
47 |
article['originalSite'] = "财政部"
|
48 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
49 |
article['title'] = translate(article['originalTitle'])
|
@@ -93,7 +93,7 @@ while i > -1:
|
|
93 |
for element in article['originalContent'].split("。"):
|
94 |
CONTENT_ENG += translate(element) + ' '
|
95 |
article['content'] = CONTENT_ENG
|
96 |
-
article['site'] = "Ministry of Finance"
|
97 |
article['originalSite'] = "财政部"
|
98 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
99 |
article['title'] = translate(article['originalTitle'])
|
|
|
43 |
for element in article['originalContent'].split("。"):
|
44 |
CONTENT_ENG += translate(element) + ' '
|
45 |
article['content'] = CONTENT_ENG
|
46 |
+
article['site'] = "Ministry of Finance of China"
|
47 |
article['originalSite'] = "财政部"
|
48 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
49 |
article['title'] = translate(article['originalTitle'])
|
|
|
93 |
for element in article['originalContent'].split("。"):
|
94 |
CONTENT_ENG += translate(element) + ' '
|
95 |
article['content'] = CONTENT_ENG
|
96 |
+
article['site'] = "Ministry of Finance of China"
|
97 |
article['originalSite'] = "财政部"
|
98 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
99 |
article['title'] = translate(article['originalTitle'])
|
mofcom.py
CHANGED
@@ -48,7 +48,7 @@ for category in categories:
|
|
48 |
for element in article['originalContent'].split("。"):
|
49 |
CONTENT_ENG += translate(element) + ' '
|
50 |
article['content'] = CONTENT_ENG
|
51 |
-
article['site'] = "Ministry of Commerce"
|
52 |
article['originalSite'] = "商务部"
|
53 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
54 |
article['title'] = translate(article['originalTitle'])
|
|
|
48 |
for element in article['originalContent'].split("。"):
|
49 |
CONTENT_ENG += translate(element) + ' '
|
50 |
article['content'] = CONTENT_ENG
|
51 |
+
article['site'] = "Ministry of Commerce of China"
|
52 |
article['originalSite'] = "商务部"
|
53 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
54 |
article['title'] = translate(article['originalTitle'])
|
ndrc.py
CHANGED
@@ -43,7 +43,7 @@ while i > -1:
|
|
43 |
for element in article['originalContent'].split("。"):
|
44 |
CONTENT_ENG += translate(element) + ' '
|
45 |
article['content'] = CONTENT_ENG
|
46 |
-
article['site'] = "State Council"
|
47 |
article['originalSite'] = "国务院"
|
48 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
49 |
article['title'] = translate(article['originalTitle'])
|
@@ -64,7 +64,7 @@ while i > -1:
|
|
64 |
for element in article['originalContent'].split("。"):
|
65 |
CONTENT_ENG += translate(element) + ' '
|
66 |
article['content'] = CONTENT_ENG
|
67 |
-
article['site'] = "National Development and Reform Commission"
|
68 |
article['originalSite'] = "国家发展和改革委员会"
|
69 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
70 |
article['title'] = translate(article['originalTitle'])
|
@@ -86,7 +86,7 @@ while i > -1:
|
|
86 |
for element in article['originalContent'].split("。"):
|
87 |
CONTENT_ENG += translate(element) + ' '
|
88 |
article['content'] = CONTENT_ENG
|
89 |
-
article['site'] = "National Development and Reform Commission"
|
90 |
article['originalSite'] = "国家发展和改革委员会"
|
91 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
92 |
article['title'] = translate(article['originalTitle'])
|
|
|
43 |
for element in article['originalContent'].split("。"):
|
44 |
CONTENT_ENG += translate(element) + ' '
|
45 |
article['content'] = CONTENT_ENG
|
46 |
+
article['site'] = "State Council of China"
|
47 |
article['originalSite'] = "国务院"
|
48 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
49 |
article['title'] = translate(article['originalTitle'])
|
|
|
64 |
for element in article['originalContent'].split("。"):
|
65 |
CONTENT_ENG += translate(element) + ' '
|
66 |
article['content'] = CONTENT_ENG
|
67 |
+
article['site'] = "National Development and Reform Commission of China"
|
68 |
article['originalSite'] = "国家发展和改革委员会"
|
69 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
70 |
article['title'] = translate(article['originalTitle'])
|
|
|
86 |
for element in article['originalContent'].split("。"):
|
87 |
CONTENT_ENG += translate(element) + ' '
|
88 |
article['content'] = CONTENT_ENG
|
89 |
+
article['site'] = "National Development and Reform Commission of China"
|
90 |
article['originalSite'] = "国家发展和改革委员会"
|
91 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
92 |
article['title'] = translate(article['originalTitle'])
|
safe.py
CHANGED
@@ -42,7 +42,7 @@ while i > -1:
|
|
42 |
for element in article['originalContent'].split("。"):
|
43 |
CONTENT_ENG += translate(element) + ' '
|
44 |
article['content'] = CONTENT_ENG
|
45 |
-
article['site'] = "State Administration of Foregin Exchange"
|
46 |
article['originalSite'] = "外汇管理局"
|
47 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
48 |
article['title'] = translate(article['originalTitle'])
|
@@ -92,7 +92,7 @@ while i > -1:
|
|
92 |
for element in article['originalContent'].split("。"):
|
93 |
CONTENT_ENG += translate(element) + ' '
|
94 |
article['content'] = CONTENT_ENG
|
95 |
-
article['site'] = "State Administration of Foregin Exchange"
|
96 |
article['originalSite'] = "外汇管理局"
|
97 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
98 |
article['title'] = translate(article['originalTitle'])
|
|
|
42 |
for element in article['originalContent'].split("。"):
|
43 |
CONTENT_ENG += translate(element) + ' '
|
44 |
article['content'] = CONTENT_ENG
|
45 |
+
article['site'] = "State Administration of Foregin Exchange of China"
|
46 |
article['originalSite'] = "外汇管理局"
|
47 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
48 |
article['title'] = translate(article['originalTitle'])
|
|
|
92 |
for element in article['originalContent'].split("。"):
|
93 |
CONTENT_ENG += translate(element) + ' '
|
94 |
article['content'] = CONTENT_ENG
|
95 |
+
article['site'] = "State Administration of Foregin Exchange of China"
|
96 |
article['originalSite'] = "外汇管理局"
|
97 |
article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
|
98 |
article['title'] = translate(article['originalTitle'])
|
stats.py
CHANGED
@@ -30,7 +30,7 @@ while i > -1:
|
|
30 |
for url in urls:
|
31 |
try:
|
32 |
article = {}
|
33 |
-
url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/"
|
34 |
req = urllib.request.urlopen(url)
|
35 |
text = req.read()
|
36 |
html_text = text.decode("utf-8")
|
@@ -42,7 +42,7 @@ while i > -1:
|
|
42 |
for element in article['originalContent'].split("。"):
|
43 |
CONTENT_ENG += translate(element) + ' '
|
44 |
article['content'] = CONTENT_ENG
|
45 |
-
article['site'] = "National Bureau of Statistics"
|
46 |
article['originalSite'] = "国家统计局"
|
47 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
48 |
article['title'] = translate(article['originalTitle'])
|
|
|
30 |
for url in urls:
|
31 |
try:
|
32 |
article = {}
|
33 |
+
url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
|
34 |
req = urllib.request.urlopen(url)
|
35 |
text = req.read()
|
36 |
html_text = text.decode("utf-8")
|
|
|
42 |
for element in article['originalContent'].split("。"):
|
43 |
CONTENT_ENG += translate(element) + ' '
|
44 |
article['content'] = CONTENT_ENG
|
45 |
+
article['site'] = "National Bureau of Statistics of China"
|
46 |
article['originalSite'] = "国家统计局"
|
47 |
article['originalTitle'] = page.xpath("//title/text()")[0]
|
48 |
article['title'] = translate(article['originalTitle'])
|
utils.py
CHANGED
@@ -55,12 +55,16 @@ def encode(content):
|
|
55 |
replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
|
56 |
else:
|
57 |
line = element
|
|
|
58 |
text += line
|
59 |
index = text.find('打印本页')
|
60 |
if index != -1:
|
61 |
text = text[:index]
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
64 |
|
65 |
def extract_from_pdf(url):
|
66 |
# Send a GET request to the URL and retrieve the PDF content
|
@@ -131,17 +135,54 @@ def upsert_content(report):
|
|
131 |
'title': report['title'],
|
132 |
# 'originalSite': report['originalSite'],
|
133 |
# 'originalTitle': report['originalTitle'],
|
134 |
-
|
135 |
'category': report['category'],
|
136 |
# 'author': report['author'],
|
137 |
'content': report['content'],
|
138 |
-
'publishDate': report['
|
139 |
-
'link': report['
|
140 |
# 'attachment': report['reporturl'],
|
141 |
# 'authorID': str(report['authorid']),
|
142 |
-
'
|
143 |
-
'
|
144 |
-
'
|
|
|
|
|
145 |
}
|
146 |
response = table.put_item(Item=item)
|
147 |
print(response)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
|
56 |
else:
|
57 |
line = element
|
58 |
+
line = line + '\n'
|
59 |
text += line
|
60 |
index = text.find('打印本页')
|
61 |
if index != -1:
|
62 |
text = text[:index]
|
63 |
+
try:
|
64 |
+
summary = '\n'.join(text.split('\n')[:2])
|
65 |
+
except:
|
66 |
+
summary = text
|
67 |
+
return text, summary
|
68 |
|
69 |
def extract_from_pdf(url):
|
70 |
# Send a GET request to the URL and retrieve the PDF content
|
|
|
135 |
'title': report['title'],
|
136 |
# 'originalSite': report['originalSite'],
|
137 |
# 'originalTitle': report['originalTitle'],
|
138 |
+
'originContent': report['originContent'],
|
139 |
'category': report['category'],
|
140 |
# 'author': report['author'],
|
141 |
'content': report['content'],
|
142 |
+
'publishDate': report['publishdate'],
|
143 |
+
'link': report['link'],
|
144 |
# 'attachment': report['reporturl'],
|
145 |
# 'authorID': str(report['authorid']),
|
146 |
+
'entityList': report['entitylist'],
|
147 |
+
'sentimentScore': Decimal(str(report['sentimentscore'])).quantize(Decimal('0.01')),
|
148 |
+
'sentimentLabel': report['sentimentlabel'],
|
149 |
+
'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
|
150 |
+
'subtitle': report['subtitle']
|
151 |
}
|
152 |
response = table.put_item(Item=item)
|
153 |
print(response)
|
154 |
+
|
155 |
+
def get_client_connection():
|
156 |
+
"""Get dynamoDB connection"""
|
157 |
+
dynamodb = boto3.client(
|
158 |
+
service_name='dynamodb',
|
159 |
+
region_name='us-east-1',
|
160 |
+
aws_access_key_id=AWS_ACCESS_KEY_ID,
|
161 |
+
aws_secret_access_key=AWS_SECRET_ACCESS_KEY
|
162 |
+
)
|
163 |
+
return dynamodb
|
164 |
+
|
165 |
+
def delete_records(item):
|
166 |
+
dynamodb_client = get_client_connection()
|
167 |
+
dynamodb_client.delete_item(
|
168 |
+
TableName="article_china",
|
169 |
+
Key={
|
170 |
+
'id': {'S': item['id']},
|
171 |
+
'site': {'S': item['site']}
|
172 |
+
}
|
173 |
+
)
|
174 |
+
|
175 |
+
def update_content(report):
|
176 |
+
dynamodb = get_client_connection()
|
177 |
+
response = dynamodb.update_item(
|
178 |
+
TableName="article_china",
|
179 |
+
Key={
|
180 |
+
'id': {'S': report['id']},
|
181 |
+
'site': {'S': report['site']}
|
182 |
+
},
|
183 |
+
UpdateExpression='SET sentimentScore = :sentimentScore, sentimentLabel = :sentimentLabel',
|
184 |
+
ExpressionAttributeValues={
|
185 |
+
':sentimentScore': {'N': str(Decimal(str(report['sentimentscore'])).quantize(Decimal('0.01')))},
|
186 |
+
':sentimentLabel': {'S': report['sentimentlabel']}
|
187 |
+
}
|
188 |
+
)
|