OxbridgeEconomics committed on
Commit
b2a3d45
·
1 Parent(s): f801221
Files changed (11) hide show
  1. .gitignore +3 -1
  2. cbirc.py +2 -2
  3. chinatax.py +2 -2
  4. csrc.py +2 -2
  5. gov.py +2 -2
  6. mof.py +2 -2
  7. mofcom.py +1 -1
  8. ndrc.py +3 -3
  9. safe.py +2 -2
  10. stats.py +2 -2
  11. utils.py +49 -8
.gitignore CHANGED
@@ -1,3 +1,5 @@
1
  env
2
  __pycache__
3
- downloaded_file.pdf
 
 
 
1
  env
2
  __pycache__
3
+ downloaded_file.pdf
4
+ downloaded_file.docx
5
+ downloaded_file.doc
cbirc.py CHANGED
@@ -26,7 +26,7 @@ while i > -1:
26
  for element in article['originalContent'].split("。"):
27
  CONTENT_ENG += translate(element) + ' '
28
  article['content'] = CONTENT_ENG
29
- article['site'] = "National Financial Regulatory Administration"
30
  article['originalSite'] = "国家金融监督管理总局"
31
  article['originalTitle'] = article['docSubtitle']
32
  article['title'] = translate(article['originalTitle'])
@@ -62,7 +62,7 @@ while i > -1:
62
  for element in article['originalContent'].split("。"):
63
  CONTENT_ENG += translate(element) + ' '
64
  article['content'] = CONTENT_ENG
65
- article['site'] = "State Taxation Administration"
66
  article['originalSite'] = "国家税务总局"
67
  article['originalTitle'] = article['title']
68
  article['title'] = translate(article['originalTitle'])
 
26
  for element in article['originalContent'].split("。"):
27
  CONTENT_ENG += translate(element) + ' '
28
  article['content'] = CONTENT_ENG
29
+ article['site'] = "National Financial Regulatory Administration of China"
30
  article['originalSite'] = "国家金融监督管理总局"
31
  article['originalTitle'] = article['docSubtitle']
32
  article['title'] = translate(article['originalTitle'])
 
62
  for element in article['originalContent'].split("。"):
63
  CONTENT_ENG += translate(element) + ' '
64
  article['content'] = CONTENT_ENG
65
+ article['site'] = "State Taxation Administration of China"
66
  article['originalSite'] = "国家税务总局"
67
  article['originalTitle'] = article['title']
68
  article['title'] = translate(article['originalTitle'])
chinatax.py CHANGED
@@ -32,7 +32,7 @@ while i > -1:
32
  for element in article['originalContent'].split("。"):
33
  CONTENT_ENG += translate(element) + ' '
34
  article['content'] = CONTENT_ENG
35
- article['site'] = "State Taxation Administration"
36
  article['originalSite'] = "国家税务总局"
37
  article['originalTitle'] = article['title']
38
  article['title'] = translate(article['originalTitle'])
@@ -80,7 +80,7 @@ while i > -1:
80
  for element in article['originalContent'].split("。"):
81
  CONTENT_ENG += translate(element) + ' '
82
  article['content'] = CONTENT_ENG
83
- article['site'] = "State Taxation Administration"
84
  article['originalSite'] = "国家税务总局"
85
  article['originalTitle'] = article['title']
86
  article['title'] = translate(article['originalTitle'])
 
32
  for element in article['originalContent'].split("。"):
33
  CONTENT_ENG += translate(element) + ' '
34
  article['content'] = CONTENT_ENG
35
+ article['site'] = "State Taxation Administration of China"
36
  article['originalSite'] = "国家税务总局"
37
  article['originalTitle'] = article['title']
38
  article['title'] = translate(article['originalTitle'])
 
80
  for element in article['originalContent'].split("。"):
81
  CONTENT_ENG += translate(element) + ' '
82
  article['content'] = CONTENT_ENG
83
+ article['site'] = "State Taxation Administration of China"
84
  article['originalSite'] = "国家税务总局"
85
  article['originalTitle'] = article['title']
86
  article['title'] = translate(article['originalTitle'])
csrc.py CHANGED
@@ -43,7 +43,7 @@ while i > -1:
43
  for element in article['originalContent'].split("。"):
44
  CONTENT_ENG += translate(element) + ' '
45
  article['content'] = CONTENT_ENG
46
- article['site'] = "Securities Regulatory Commission"
47
  article['originalSite'] = "证监会"
48
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
49
  article['title'] = translate(article['originalTitle'])
@@ -75,7 +75,7 @@ while i > -1:
75
  for element in article['originalContent'].split("。"):
76
  CONTENT_ENG += translate(element) + ' '
77
  article['content'] = CONTENT_ENG
78
- article['site'] = "Securities Regulatory Commission"
79
  article['originalSite'] = "证监会"
80
  article['originalTitle'] = article['title']
81
  article['title'] = translate(article['originalTitle'])
 
43
  for element in article['originalContent'].split("。"):
44
  CONTENT_ENG += translate(element) + ' '
45
  article['content'] = CONTENT_ENG
46
+ article['site'] = "Securities Regulatory Commission of China"
47
  article['originalSite'] = "证监会"
48
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
49
  article['title'] = translate(article['originalTitle'])
 
75
  for element in article['originalContent'].split("。"):
76
  CONTENT_ENG += translate(element) + ' '
77
  article['content'] = CONTENT_ENG
78
+ article['site'] = "Securities Regulatory Commission of China"
79
  article['originalSite'] = "证监会"
80
  article['originalTitle'] = article['title']
81
  article['title'] = translate(article['originalTitle'])
gov.py CHANGED
@@ -43,7 +43,7 @@ while i > -1:
43
  for element in article['originalContent'].split("。"):
44
  CONTENT_ENG += translate(element) + ' '
45
  article['content'] = CONTENT_ENG
46
- article['site'] = "State Council"
47
  article['originalSite'] = "国务院"
48
  article['originalTitle'] = page.xpath("//title/text()")[0]
49
  article['title'] = translate(article['originalTitle'])
@@ -94,7 +94,7 @@ while i > -1:
94
  for element in article['originalContent'].split("。"):
95
  CONTENT_ENG += translate(article['originalContent']) + ' '
96
  article['content'] = CONTENT_ENG
97
- article['site'] = "State Council"
98
  article['originalSite'] = "国务院"
99
  article['originalTitle'] = page.xpath("//title/text()")[0]
100
  article['title'] = translate(article['originalTitle'])
 
43
  for element in article['originalContent'].split("。"):
44
  CONTENT_ENG += translate(element) + ' '
45
  article['content'] = CONTENT_ENG
46
+ article['site'] = "State Council of China"
47
  article['originalSite'] = "国务院"
48
  article['originalTitle'] = page.xpath("//title/text()")[0]
49
  article['title'] = translate(article['originalTitle'])
 
94
  for element in article['originalContent'].split("。"):
95
  CONTENT_ENG += translate(article['originalContent']) + ' '
96
  article['content'] = CONTENT_ENG
97
+ article['site'] = "State Council of China"
98
  article['originalSite'] = "国务院"
99
  article['originalTitle'] = page.xpath("//title/text()")[0]
100
  article['title'] = translate(article['originalTitle'])
mof.py CHANGED
@@ -43,7 +43,7 @@ while i > -1:
43
  for element in article['originalContent'].split("。"):
44
  CONTENT_ENG += translate(element) + ' '
45
  article['content'] = CONTENT_ENG
46
- article['site'] = "Ministry of Finance"
47
  article['originalSite'] = "财政部"
48
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
49
  article['title'] = translate(article['originalTitle'])
@@ -93,7 +93,7 @@ while i > -1:
93
  for element in article['originalContent'].split("。"):
94
  CONTENT_ENG += translate(element) + ' '
95
  article['content'] = CONTENT_ENG
96
- article['site'] = "Ministry of Finance"
97
  article['originalSite'] = "财政部"
98
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
99
  article['title'] = translate(article['originalTitle'])
 
43
  for element in article['originalContent'].split("。"):
44
  CONTENT_ENG += translate(element) + ' '
45
  article['content'] = CONTENT_ENG
46
+ article['site'] = "Ministry of Finance of China"
47
  article['originalSite'] = "财政部"
48
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
49
  article['title'] = translate(article['originalTitle'])
 
93
  for element in article['originalContent'].split("。"):
94
  CONTENT_ENG += translate(element) + ' '
95
  article['content'] = CONTENT_ENG
96
+ article['site'] = "Ministry of Finance of China"
97
  article['originalSite'] = "财政部"
98
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
99
  article['title'] = translate(article['originalTitle'])
mofcom.py CHANGED
@@ -48,7 +48,7 @@ for category in categories:
48
  for element in article['originalContent'].split("。"):
49
  CONTENT_ENG += translate(element) + ' '
50
  article['content'] = CONTENT_ENG
51
- article['site'] = "Ministry of Commerce"
52
  article['originalSite'] = "商务部"
53
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
54
  article['title'] = translate(article['originalTitle'])
 
48
  for element in article['originalContent'].split("。"):
49
  CONTENT_ENG += translate(element) + ' '
50
  article['content'] = CONTENT_ENG
51
+ article['site'] = "Ministry of Commerce of China"
52
  article['originalSite'] = "商务部"
53
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
54
  article['title'] = translate(article['originalTitle'])
ndrc.py CHANGED
@@ -43,7 +43,7 @@ while i > -1:
43
  for element in article['originalContent'].split("。"):
44
  CONTENT_ENG += translate(element) + ' '
45
  article['content'] = CONTENT_ENG
46
- article['site'] = "State Council"
47
  article['originalSite'] = "国务院"
48
  article['originalTitle'] = page.xpath("//title/text()")[0]
49
  article['title'] = translate(article['originalTitle'])
@@ -64,7 +64,7 @@ while i > -1:
64
  for element in article['originalContent'].split("。"):
65
  CONTENT_ENG += translate(element) + ' '
66
  article['content'] = CONTENT_ENG
67
- article['site'] = "National Development and Reform Commission"
68
  article['originalSite'] = "国家发展和改革委员会"
69
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
70
  article['title'] = translate(article['originalTitle'])
@@ -86,7 +86,7 @@ while i > -1:
86
  for element in article['originalContent'].split("。"):
87
  CONTENT_ENG += translate(element) + ' '
88
  article['content'] = CONTENT_ENG
89
- article['site'] = "National Development and Reform Commission"
90
  article['originalSite'] = "国家发展和改革委员会"
91
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
92
  article['title'] = translate(article['originalTitle'])
 
43
  for element in article['originalContent'].split("。"):
44
  CONTENT_ENG += translate(element) + ' '
45
  article['content'] = CONTENT_ENG
46
+ article['site'] = "State Council of China"
47
  article['originalSite'] = "国务院"
48
  article['originalTitle'] = page.xpath("//title/text()")[0]
49
  article['title'] = translate(article['originalTitle'])
 
64
  for element in article['originalContent'].split("。"):
65
  CONTENT_ENG += translate(element) + ' '
66
  article['content'] = CONTENT_ENG
67
+ article['site'] = "National Development and Reform Commission of China"
68
  article['originalSite'] = "国家发展和改革委员会"
69
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
70
  article['title'] = translate(article['originalTitle'])
 
86
  for element in article['originalContent'].split("。"):
87
  CONTENT_ENG += translate(element) + ' '
88
  article['content'] = CONTENT_ENG
89
+ article['site'] = "National Development and Reform Commission of China"
90
  article['originalSite'] = "国家发展和改革委员会"
91
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
92
  article['title'] = translate(article['originalTitle'])
safe.py CHANGED
@@ -42,7 +42,7 @@ while i > -1:
42
  for element in article['originalContent'].split("。"):
43
  CONTENT_ENG += translate(element) + ' '
44
  article['content'] = CONTENT_ENG
45
- article['site'] = "State Administration of Foregin Exchange"
46
  article['originalSite'] = "外汇管理局"
47
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
48
  article['title'] = translate(article['originalTitle'])
@@ -92,7 +92,7 @@ while i > -1:
92
  for element in article['originalContent'].split("。"):
93
  CONTENT_ENG += translate(element) + ' '
94
  article['content'] = CONTENT_ENG
95
- article['site'] = "State Administration of Foregin Exchange"
96
  article['originalSite'] = "外汇管理局"
97
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
98
  article['title'] = translate(article['originalTitle'])
 
42
  for element in article['originalContent'].split("。"):
43
  CONTENT_ENG += translate(element) + ' '
44
  article['content'] = CONTENT_ENG
45
+ article['site'] = "State Administration of Foregin Exchange of China"
46
  article['originalSite'] = "外汇管理局"
47
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
48
  article['title'] = translate(article['originalTitle'])
 
92
  for element in article['originalContent'].split("。"):
93
  CONTENT_ENG += translate(element) + ' '
94
  article['content'] = CONTENT_ENG
95
+ article['site'] = "State Administration of Foregin Exchange of China"
96
  article['originalSite'] = "外汇管理局"
97
  article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
98
  article['title'] = translate(article['originalTitle'])
stats.py CHANGED
@@ -30,7 +30,7 @@ while i > -1:
30
  for url in urls:
31
  try:
32
  article = {}
33
- url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/", )
34
  req = urllib.request.urlopen(url)
35
  text = req.read()
36
  html_text = text.decode("utf-8")
@@ -42,7 +42,7 @@ while i > -1:
42
  for element in article['originalContent'].split("。"):
43
  CONTENT_ENG += translate(element) + ' '
44
  article['content'] = CONTENT_ENG
45
- article['site'] = "National Bureau of Statistics"
46
  article['originalSite'] = "国家统计局"
47
  article['originalTitle'] = page.xpath("//title/text()")[0]
48
  article['title'] = translate(article['originalTitle'])
 
30
  for url in urls:
31
  try:
32
  article = {}
33
+ url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
34
  req = urllib.request.urlopen(url)
35
  text = req.read()
36
  html_text = text.decode("utf-8")
 
42
  for element in article['originalContent'].split("。"):
43
  CONTENT_ENG += translate(element) + ' '
44
  article['content'] = CONTENT_ENG
45
+ article['site'] = "National Bureau of Statistics of China"
46
  article['originalSite'] = "国家统计局"
47
  article['originalTitle'] = page.xpath("//title/text()")[0]
48
  article['title'] = translate(article['originalTitle'])
utils.py CHANGED
@@ -55,12 +55,16 @@ def encode(content):
55
  replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
56
  else:
57
  line = element
 
58
  text += line
59
  index = text.find('打印本页')
60
  if index != -1:
61
  text = text[:index]
62
-
63
- return text
 
 
 
64
 
65
  def extract_from_pdf(url):
66
  # Send a GET request to the URL and retrieve the PDF content
@@ -131,17 +135,54 @@ def upsert_content(report):
131
  'title': report['title'],
132
  # 'originalSite': report['originalSite'],
133
  # 'originalTitle': report['originalTitle'],
134
- # 'originalContent': report['originalContent'],
135
  'category': report['category'],
136
  # 'author': report['author'],
137
  'content': report['content'],
138
- 'publishDate': report['publishDate'],
139
- 'link': report['url'],
140
  # 'attachment': report['reporturl'],
141
  # 'authorID': str(report['authorid']),
142
- 'sentimentScore': str(Decimal(report['sentimentScore']).quantize(Decimal('0.01'))),
143
- 'sentimentLabel': report['sentimentLabel'],
144
- 'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
 
 
145
  }
146
  response = table.put_item(Item=item)
147
  print(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
56
  else:
57
  line = element
58
+ line = line + '\n'
59
  text += line
60
  index = text.find('打印本页')
61
  if index != -1:
62
  text = text[:index]
63
+ try:
64
+ summary = '\n'.join(text.split('\n')[:2])
65
+ except:
66
+ summary = text
67
+ return text, summary
68
 
69
  def extract_from_pdf(url):
70
  # Send a GET request to the URL and retrieve the PDF content
 
135
  'title': report['title'],
136
  # 'originalSite': report['originalSite'],
137
  # 'originalTitle': report['originalTitle'],
138
+ 'originContent': report['originContent'],
139
  'category': report['category'],
140
  # 'author': report['author'],
141
  'content': report['content'],
142
+ 'publishDate': report['publishdate'],
143
+ 'link': report['link'],
144
  # 'attachment': report['reporturl'],
145
  # 'authorID': str(report['authorid']),
146
+ 'entityList': report['entitylist'],
147
+ 'sentimentScore': Decimal(str(report['sentimentscore'])).quantize(Decimal('0.01')),
148
+ 'sentimentLabel': report['sentimentlabel'],
149
+ 'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
150
+ 'subtitle': report['subtitle']
151
  }
152
  response = table.put_item(Item=item)
153
  print(response)
154
+
155
+ def get_client_connection():
156
+ """Get dynamoDB connection"""
157
+ dynamodb = boto3.client(
158
+ service_name='dynamodb',
159
+ region_name='us-east-1',
160
+ aws_access_key_id=AWS_ACCESS_KEY_ID,
161
+ aws_secret_access_key=AWS_SECRET_ACCESS_KEY
162
+ )
163
+ return dynamodb
164
+
165
+ def delete_records(item):
166
+ dynamodb_client = get_client_connection()
167
+ dynamodb_client.delete_item(
168
+ TableName="article_china",
169
+ Key={
170
+ 'id': {'S': item['id']},
171
+ 'site': {'S': item['site']}
172
+ }
173
+ )
174
+
175
+ def update_content(report):
176
+ dynamodb = get_client_connection()
177
+ response = dynamodb.update_item(
178
+ TableName="article_china",
179
+ Key={
180
+ 'id': {'S': report['id']},
181
+ 'site': {'S': report['site']}
182
+ },
183
+ UpdateExpression='SET sentimentScore = :sentimentScore, sentimentLabel = :sentimentLabel',
184
+ ExpressionAttributeValues={
185
+ ':sentimentScore': {'N': str(Decimal(str(report['sentimentscore'])).quantize(Decimal('0.01')))},
186
+ ':sentimentLabel': {'S': report['sentimentlabel']}
187
+ }
188
+ )