OxbridgeEconomics committed on
Commit
b6dcee5
·
1 Parent(s): b2a3d45
Files changed (5)
  1. cbirc.py +7 -7
  2. eastmoney.py +38 -32
  3. gov.py +36 -41
  4. utils.py +49 -17
  5. xpath.json +19 -0
cbirc.py CHANGED
@@ -19,17 +19,17 @@ while i > -1:
     if parsed_datetime < (datetime.today() - timedelta(days=183)):
         i = -1
     else:
-        article['originalContent'] = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
-        if len(article['originalContent']) < 10:
+        article['originContent'] = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+        if len(article['originContent']) < 10:
             continue
         CONTENT_ENG = ''
-        for element in article['originalContent'].split("怂"):
-            CONTENT_ENG += translate(element) + ' '
+        for element in article['originContent'].split("\n"):
+            CONTENT_ENG += translate(element) + '\n'
         article['content'] = CONTENT_ENG
         article['site'] = "National Financial Regulatory Administration of China"
-        article['originalSite'] = "国家金融监督管理总局"
-        article['originalTitle'] = article['docSubtitle']
-        article['title'] = translate(article['originalTitle'])
+        article['originSite'] = "国家金融监督管理总局"
+        article['originTitle'] = article['docSubtitle']
+        article['title'] = translate(article['originTitle'])
         article['url'] = "https://www.cbirc.gov.cn" + article['pdfFileUrl']
         article['category']= "Policy Interpretation"
         article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
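
Note: this change renames the original* fields to origin* (matching the item keys upsert_content in utils.py now writes) and chunks translation on "\n" instead of the Chinese full stop "怂", so line breaks from extract_from_pdf survive into the English text. A minimal sketch of the new per-line pattern, with a stub standing in for the googletrans-backed translate() helper in utils.py:

def translate(text):
    # Stub for illustration; the real helper in utils.py calls googletrans.
    return text

def translate_by_line(origin_content):
    # Translate each newline-delimited chunk and keep the line breaks,
    # mirroring the loop above.
    content_eng = ''
    for element in origin_content.split("\n"):
        content_eng += translate(element) + '\n'
    return content_eng

print(translate_by_line("第一行\n第二行"))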
eastmoney.py CHANGED
@@ -1,9 +1,45 @@
 import uuid
 import json
 import urllib.request
+from urllib.parse import urlparse
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url
+from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content
+
+with open('xpath.json', 'r', encoding='UTF-8') as f:
+    xpath_dict = json.load(f)
+
+def crawl(url, article):
+    domain = urlparse(url).netloc
+    req = urllib.request.urlopen(url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
+    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
+    article['link'] = url
+    if article['orgSName'] == "''":
+        article['site'] = translate(article['orgSName'])
+    else:
+        article['site'] = translate(article['orgName'])
+    article['originTitle'] = article['title']
+    article['title'] = translate(article['title'])
+    article['author'] = translate(article['researcher'])
+    article['originAuthor'] = article['researcher']
+    article['originContent'] = repr(originContent)
+    article['subtitle'] = translate(summary)
+    article['category'] = "Macroeconomic Research"
+    if len(article['originContent']) < 10:
+        return None
+    CONTENT_ENG = ''
+    for element in originContent.split("\n"):
+        CONTENT_ENG += translate(element) + '\n'
+    article['content'] = repr(CONTENT_ENG)
+    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
+    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime']['format_string'])
+    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n",""))
+    upsert_content(article)

 today = datetime.today().strftime('%Y-%m-%d')
 beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
@@ -33,37 +69,7 @@ while i > -1:
     for article in reportinfo['data']:
         try:
             url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
-            req = urllib.request.urlopen(url)
-            text = req.read()
-            html_text = text.decode("utf-8")
-            page = etree.HTML(html_text)
-            content = encode(page.xpath("//div[contains(@class, 'ctx-content')]//p"))
-            reporturl = encode(page.xpath("//a[contains(@class, 'pdf-link')]/@href"))
-            article['url'] = url
-            if article['orgSName'] == "''":
-                article['site'] = translate(article['orgSName'])
-                article['originalSite'] = article['orgSName']
-            else:
-                article['site'] = translate(article['orgName'])
-                article['originalSite'] = article['orgSName']
-            article['reporturl'] = reporturl
-            article['originalTitle'] = article['title']
-            article['title'] = translate(article['title'])
-            article['author'] = translate(article['researcher'])
-            article['originalAuthor'] = article['researcher']
-            article['originalContent'] = content
-            article['category'] = "Macroeconomic Research"
-            if len(article['originalContent']) < 10:
-                continue
-            CONTENT_ENG = ''
-            for element in article['originalContent'].split("怂"):
-                CONTENT_ENG += translate(element) + ' '
-            article['content'] = CONTENT_ENG
-            article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
-            article['publishDate'] = datemodifier(article['publishDate'], "%Y-%m-%d %H:%M:%S.%f")
-            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-            upsert_content(article)
+            crawl(url, article)
         except Exception as error:
             print(error)
         else:
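
The eastmoney.py scraping body moves into a crawl(url, article) helper that picks its XPath expressions out of xpath.json by the URL's host, so supporting a new page layout becomes a config edit rather than a code edit. A minimal sketch of that dispatch, assuming the xpath.json added in this commit is in the working directory:

import json
from urllib.parse import urlparse

with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)

url = "https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl=..."
domain = urlparse(url).netloc              # "data.eastmoney.com"
rules = xpath_dict[domain]                 # per-site XPath expressions
print(rules['content'])                    # //div[contains(@class, 'ctx-content')]//p
print(rules['datetime']['format_string'])  # %Y-%m-%d %H:%M:%S.%f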
gov.py CHANGED
@@ -1,9 +1,42 @@
 from datetime import datetime, timedelta
 import uuid
 import time
+from urllib.parse import urlparse
 import urllib.request
 from lxml import etree
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
+from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, encode_content
+import json
+
+with open('xpath.json', 'r', encoding='UTF-8') as f:
+    xpath_dict = json.load(f)
+
+def crawl(url, article):
+    domain = urlparse(url).netloc
+    req = urllib.request.urlopen(url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
+    article['originContent'] = repr(originContent)
+    article['subtitle'] = translate(summary)
+    if len(article['originContent']) < 10:
+        return None
+    CONTENT_ENG = ''
+    for element in originContent.split("\n"):
+        print(element)
+        CONTENT_ENG += translate(element) + '\n'
+    article['content'] = repr(CONTENT_ENG)
+    article['site'] = "State Council of China"
+    article['originSite'] = "国劔院"
+    article['originTitle'] = encode(page.xpath(xpath_dict[domain]['title']))
+    article['title'] = translate(article['originTitle'])
+    article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
+    article['link'] = url
+    article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime']['format_string'])
+    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n",""))
+    article['attachment'] = ""
+    upsert_content(article)

 i = 0
 while i > -1:
@@ -32,27 +65,8 @@ while i > -1:
             article = {}
             url = url.replace('../', 'https://www.gov.cn/zhengce/')
             if "https://www.gov.cn" in url:
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("怂"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "State Council of China"
-                article['originalSite'] = "国劔院"
-                article['originalTitle'] = page.xpath("//title/text()")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
                 article['category']= "Policy Interpretation"
-                article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0], "%Y-%m-%d-%H:%M:%S")
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
         except Exception as error:
             print(error)
@@ -83,26 +97,7 @@ while i > -1:
             article = {}
             url = url.replace('../', 'https://www.gov.cn/zhengce/')
             if "https://www.gov.cn" in url:
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("怂"):
-                    CONTENT_ENG += translate(article['originalContent']) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "State Council of China"
-                article['originalSite'] = "国劔院"
-                article['originalTitle'] = page.xpath("//title/text()")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
                 article['category']= "Policy Release"
-                article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0], "%Y-%m-%d-%H:%M:%S")
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
         except Exception as error:
             print(error)
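
Both crawlers keep deriving the record id deterministically, uuid.uuid5 over title plus publishDate, so re-crawling the same article makes upsert_content overwrite the existing DynamoDB item instead of creating a duplicate. A small illustration of that determinism:

import uuid

title = "Some translated title"   # illustrative values
publish_date = "2024-01-31"
first = uuid.uuid5(uuid.NAMESPACE_OID, title + publish_date)
again = uuid.uuid5(uuid.NAMESPACE_OID, title + publish_date)
assert first == again  # stable across runs, unlike uuid.uuid4()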
utils.py CHANGED
@@ -10,10 +10,10 @@ from googletrans import Translator
 from transformers import pipeline
 from PyPDF2 import PdfReader

-AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
-AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
-# AWS_ACCESS_KEY_ID="AKIAQFXZMGHQYXKWUDWR"
-# AWS_SECRET_ACCESS_KEY="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"
+# AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
+# AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
+AWS_ACCESS_KEY_ID="AKIAQFXZMGHQYXKWUDWR"
+AWS_SECRET_ACCESS_KEY="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"

 analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
@@ -44,6 +44,38 @@ def translist(infolist):
     return out

 def encode(content):
+    """Encode Function"""
+    text = ''
+    for element in content:
+        if isinstance(element, etree._Element):
+            subelement = etree.tostring(element).decode()
+            subpage = etree.HTML(subelement)
+            tree = subpage.xpath('//text()')
+            line = ''.join(translist(tree)).\
+                replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
+        else:
+            line = element
+        text += line
+    return text
+
+# def encode(content):
+#     """Encode Function"""
+#     text = ''
+#     for element in content:
+#         if isinstance(element, etree._Element):
+#             subelement = etree.tostring(element).decode()
+#             subpage = etree.HTML(subelement)
+#             tree = subpage.xpath('//text()')
+#             line = ''.join(translist(tree)).\
+#                 replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
+#         else:
+#             line = element
+#         text += line
+#     index = text.find('打印本页')
+#     if index != -1:
+#         text = text[:index]
+
+def encode_content(content):
     """Encode Function"""
     text = ''
     for element in content:
@@ -109,9 +141,9 @@ def sentiment_computation(content):
     }
     sentiment_score = 0
     maximum_value = 0
-    raw_sentiment = analyzer(content[:512], return_all_scores=True)
+    raw_sentiment = analyzer(content[:512], top_k=None)
     sentiment_label = None
-    for sentiment_dict in raw_sentiment[0]:
+    for sentiment_dict in raw_sentiment:
         value = sentiment_dict["score"]
         if value > maximum_value:
             sentiment_label = sentiment_dict["label"]
@@ -127,25 +159,25 @@ def sentiment_computation(content):
 def upsert_content(report):
     """Upsert the content records"""
     dynamodb = get_db_connection()
-    table = dynamodb.Table('article_china')
+    table = dynamodb.Table('article_test')
     # Define the item data
     item = {
         'id': str(report['id']),
         'site': report['site'],
         'title': report['title'],
-        # 'originalSite': report['originalSite'],
-        # 'originalTitle': report['originalTitle'],
+        # 'originSite': report['originSite'],
+        'originTitle': report['originTitle'],
         'originContent': report['originContent'],
         'category': report['category'],
-        # 'author': report['author'],
+        'author': report['author'],
         'content': report['content'],
-        'publishDate': report['publishdate'],
+        'publishDate': report['publishDate'],
         'link': report['link'],
-        # 'attachment': report['reporturl'],
+        'attachment': report['attachment'],
         # 'authorID': str(report['authorid']),
-        'entityList': report['entitylist'],
-        'sentimentScore': Decimal(str(report['sentimentscore'])).quantize(Decimal('0.01')),
-        'sentimentLabel': report['sentimentlabel'],
+        # 'entityList': report['entitylist'],
+        'sentimentScore': Decimal(str(report['sentimentScore'])).quantize(Decimal('0.01')),
+        'sentimentLabel': report['sentimentLabel'],
         'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
         'subtitle': report['subtitle']
     }
@@ -165,7 +197,7 @@ def get_client_connection():
 def delete_records(item):
     dynamodb_client = get_client_connection()
     dynamodb_client.delete_item(
-        TableName="article_china",
+        TableName="article_test",
         Key={
             'id': {'S': item['id']},
             'site': {'S': item['site']}
@@ -175,7 +207,7 @@ def delete_records(item):
 def update_content(report):
     dynamodb = get_client_connection()
     response = dynamodb.update_item(
-        TableName="article_china",
+        TableName="article_test",
         Key={
             'id': {'S': report['id']},
             'site': {'S': report['site']}
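
The sentiment hunk tracks a transformers API change: return_all_scores=True is deprecated in favor of top_k, and with top_k=None a single-string call returns a flat list of label/score dicts, which is why the loop now iterates raw_sentiment instead of raw_sentiment[0]. A quick check of the new output shape (scores shown are illustrative):

from transformers import pipeline

analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")
scores = analyzer("Credit conditions improved this quarter."[:512], top_k=None)
# e.g. [{'label': 'positive', 'score': 0.93}, {'label': 'neutral', ...}, {'label': 'negative', ...}]
best = max(scores, key=lambda d: d["score"])
print(best["label"], best["score"])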
xpath.json ADDED
@@ -0,0 +1,19 @@
+{
+    "data.eastmoney.com": {
+        "attachment": "//a[contains(@class, 'pdf-link')]/@href",
+        "content": "//div[contains(@class, 'ctx-content')]//p",
+        "datetime": {
+            "format_string": "%Y-%m-%d %H:%M:%S.%f"
+        }
+    },
+    "www.gov.cn": {
+        "title": "//title/text()",
+        "subtitle": "//meta[@name = 'description']/@content",
+        "author": "//meta[@name = 'author']/@content",
+        "publishdate": "//meta[@name = 'firstpublishedtime']/@content",
+        "content": "//div[contains(@id, 'UCAP-CONTENT')]//p",
+        "datetime": {
+            "format_string": "%Y-%m-%d-%H:%M:%S"
+        }
+    }
+}
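
xpath.json is keyed by hostname, which is exactly what crawl() looks up via urlparse(url).netloc; each entry only needs the keys its crawler reads (content and datetime.format_string at minimum, plus title/author/publishdate for gov.cn-style pages). A sketch of registering another site; the domain and XPath expressions below are hypothetical, not part of this commit:

import json

with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)

# Hypothetical new site; adjust the expressions to the real page structure.
xpath_dict['www.example.gov.cn'] = {
    "title": "//title/text()",
    "content": "//div[@id = 'article-body']//p",
    "datetime": {"format_string": "%Y-%m-%d %H:%M:%S"},
}

with open('xpath.json', 'w', encoding='UTF-8') as f:
    json.dump(xpath_dict, f, ensure_ascii=False, indent=4)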