OxbridgeEconomics committed · Commit 422b41b · Parent: cd41775
cbirc.py CHANGED

@@ -22,20 +22,20 @@ while i > -1:
                 i = -1
             else:
                 contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
-                article['contentCN'] = repr(contentCN)
+                article['contentCN'] = repr(contentCN)[1:-1].strip()
                 if len(contentCN) < 10:
                     continue
                 CONTENT_ENG = ''
                 for element in article['contentCN'].split("\n"):
                     CONTENT_ENG += translate(element) + '\n'
-                article['content'] = repr(CONTENT_ENG)
+                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                 article['site'] = "National Financial Regulatory Administration of China"
                 article['originSite'] = "国家金融监督管理总局"
                 article['titleCN'] = article['docSubtitle']
                 article['title'] = translate(article['docSubtitle'])
                 article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
                 article['category']= "Policy Interpretation"
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['
+                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                 article['attachment'] = ''
                 article['author'] = ''
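
The recurring edit in this hunk swaps repr(x) for repr(x)[1:-1].strip(): repr() escapes control characters (a real newline becomes the two-character sequence \n) but also wraps the result in quotes, so the [1:-1] slice drops the added quotes and strip() trims leftover whitespace. A minimal standalone sketch of the effect, separate from the project code:

# The repr-escape idiom used throughout this commit: flatten a multi-line
# string onto one line with literal backslash escapes, minus repr's quotes.
text = "第一行\n第二行  "             # real newline plus trailing spaces
quoted = repr(text)                    # escaped, but wrapped in quotes
flat = repr(text)[1:-1].strip()        # escaped, quotes sliced off, trimmed
print(quoted)
print(flat)                            # 第一行\n第二行  (a single line)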

@@ -45,40 +45,40 @@ while i > -1:
             print(error)
 
 
-ssl._create_default_https_context = ssl._create_stdlib_context
-i = 0
-while i > -1:
-    CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
-    i = i + 1
-    urllib3.disable_warnings()
-    try:
-        req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
-    except:
-        break
-    content = req.read().decode("utf-8")
-    reportinfo = json.loads(content)
-    for article in reportinfo['searchResultAll']['searchTotal']:
-        try:
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=183)):
-                i = -1
-            else:
-                article['originalContent'] = article['content'].replace('\\u','')
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "State Taxation Administration of China"
-                article['originalSite'] = "国家税务总局"
-                article['originalTitle'] = article['title']
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = article['snapshotUrl']
-                article['category']= "Policy Interpretation"
-                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S"))
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
-        except Exception as error:
-            print(error)
+# ssl._create_default_https_context = ssl._create_stdlib_context
+# i = 0
+# while i > -1:
+# CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
+# i = i + 1
+# urllib3.disable_warnings()
+# try:
+# req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
+# except:
+# break
+# content = req.read().decode("utf-8")
+# reportinfo = json.loads(content)
+# for article in reportinfo['searchResultAll']['searchTotal']:
+# try:
+# parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+# if parsed_datetime < (datetime.today() - timedelta(days=183)):
+# i = -1
+# else:
+# article['originalContent'] = article['content'].replace('\\u','')
+# if len(article['originalContent']) < 10:
+# continue
+# CONTENT_ENG = ''
+# for element in article['originalContent'].split("。"):
+# CONTENT_ENG += translate(element) + ' '
+# article['content'] = CONTENT_ENG
+# article['site'] = "State Taxation Administration of China"
+# article['originalSite'] = "国家税务总局"
+# article['originalTitle'] = article['title']
+# article['title'] = translate(article['originalTitle'])
+# article['url'] = article['snapshotUrl']
+# article['category']= "Policy Interpretation"
+# article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S"))
+# article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+# article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+# upsert_content(article)
+# except Exception as error:
+# print(error)
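
Both the live loop above and the commented-out block page until an article falls outside a 183-day window, round-tripping the source timestamp through time.strptime/strftime to discard the time of day before comparing. A standalone sketch of that cutoff, with a hypothetical timestamp:

from datetime import datetime, timedelta
import time

cwrq = "2024-01-15 09:30:00"   # hypothetical source timestamp
# Normalize "YYYY-MM-DD HH:MM:SS" to a date-only datetime, as the crawler does.
parsed_datetime = datetime.strptime(
    time.strftime("%Y-%m-%d", time.strptime(cwrq, "%Y-%m-%d %H:%M:%S")),
    "%Y-%m-%d")
# Once an article is older than ~6 months, the loop sets i = -1 to stop paging.
if parsed_datetime < (datetime.today() - timedelta(days=183)):
    print("older than 183 days - stop paging")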
chinatax.py CHANGED

@@ -6,7 +6,7 @@ import time
 import urllib.request
 import urllib3
 from lxml import etree
-from utils import encode, translate, sentiment_computation, upsert_content
+from utils import encode, translate, sentiment_computation, upsert_content, encode_content
 
 ssl._create_default_https_context = ssl._create_stdlib_context
 
@@ -25,22 +25,25 @@ while i > -1:
                 print(parsed_datetime)
                 i = -1
             else:
-                article['originalContent'] = article['content'].replace('\\u','')
-                if len(article['originalContent']) < 10:
+                article['category']= "Policy Interpretation"
+                contentCN = article['content'].replace('\\u','')
+                article['contentCN'] = repr(contentCN)[1:-1].strip()
+                if len(contentCN) < 10:
                     continue
                 CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
+                for element in contentCN.split("。"):
                     CONTENT_ENG += translate(element) + ' '
                 article['content'] = CONTENT_ENG
                 article['site'] = "State Taxation Administration of China"
                 article['originalSite'] = "国家税务总局"
-                article['originalTitle'] = article['title']
+                article['titleCN'] = article['title']
                 article['title'] = translate(article['originalTitle'])
                 article['url'] = article['snapshotUrl']
-                article['category']= "Policy Interpretation"
+                article['author'] = ""
+                article['attachment'] = ""
                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S"))
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n",""))
+                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
                 upsert_content(article)
         except Exception as error:
             print(error)
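
Both hunks in this file move the record ID to uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate']). uuid5 is name-based and deterministic, so re-crawling the same article reproduces the same ID and upsert_content updates in place rather than inserting a duplicate. A standalone sketch with hypothetical values:

import uuid

# Same namespace + same name -> same UUID on every run and every machine,
# which makes repeated crawls idempotent under an upsert.
title_cn = "某项政策解读"        # hypothetical title
publish_date = "2024-03-01"      # hypothetical date
id_a = uuid.uuid5(uuid.NAMESPACE_OID, title_cn + publish_date)
id_b = uuid.uuid5(uuid.NAMESPACE_OID, title_cn + publish_date)
assert id_a == id_b
print(id_a)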

@@ -73,21 +76,24 @@ while i > -1:
             text = req.read()
             html_text = text.decode("utf-8")
             page = etree.HTML(html_text)
-
-            if len(
+            contentCN= encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
+            if len(contentCN) < 10:
                 continue
             CONTENT_ENG = ''
-            for element in
+            for element in contentCN.split("。"):
                 CONTENT_ENG += translate(element) + ' '
-            article['
+            article['contentCN'] = repr(contentCN)[1:-1].strip()
+            article['content'] = repr(CONTENT_ENG)[1:-1].strip()
             article['site'] = "State Taxation Administration of China"
             article['originalSite'] = "国家税务总局"
-            article['originalTitle'] = article['title']
+            article['titleCN'] = article['title']
             article['title'] = translate(article['originalTitle'])
             article['url'] = article['url']
+            article['attachment'] = ""
+            article['author'] = ""
             article['category']= "Policy Interpretation"
             article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
-            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['
+            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
             article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
             upsert_content(article)
         except Exception as error:
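
The second crawl path now builds contentCN with encode_content over an XPath node set. encode_content lives in utils.py and is not shown in this diff; below is a hedged stand-in that joins the text of the selected <p> nodes with lxml (the joining behavior is an assumption):

from lxml import etree

# Hypothetical stand-in for utils.encode_content: concatenate the text
# content of the <p> nodes matched by the same XPath the diff uses.
html = "<div class='article'><p>第一段。</p><p>第二段。</p></div>"
page = etree.HTML(html)
paragraphs = page.xpath("//div[contains(@class, 'article')]//p")
contentCN = "".join(p.xpath("string(.)") for p in paragraphs)
print(contentCN)   # 第一段。第二段。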
csrc.py CHANGED

@@ -52,22 +52,22 @@ while i > -1:
                 article['category']= "Financial News"
                 article['site'] = "Securities Regulatory Commission of China"
                 article['originSite'] = "证监会"
-                article['
-                article['title'] = translate(article['
+                article['titleCN'] = article['title']
+                article['title'] = translate(article['titleCN'])
                 article['author'] = ''
-                article['
-                if len(article['
+                article['contentCN'] = repr(article['content'])[1:-1].strip()
+                if len(article['contentCN']) < 10:
                     continue
                 CONTENT_ENG = ''
-                for element in article['
+                for element in article['contentCN'].split("。"):
                     CONTENT_ENG += translate(element) + ' '
-                article['content'] = repr(CONTENT_ENG)
+                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                 article['subtitle'] = article['memo']
                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
                 article['link'] = article['url']
                 article['attachment'] = ""
                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['
+                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
                 upsert_content(article)
         except Exception as error:
             print(error)
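
csrc.py uses the same per-sentence translation pattern as the other crawlers: contentCN is split on the Chinese full stop 。 so each translate call receives one sentence. A sketch with a stand-in translate (the real function is in utils.py; a str -> str signature is assumed):

def translate(text: str) -> str:
    # Stand-in for utils.translate; tags the input instead of calling a
    # translation service.
    return f"<en:{text}>" if text else ""

content_cn = "第一句。第二句。"
CONTENT_ENG = ''
for element in content_cn.split("。"):   # note: the final split element is empty
    CONTENT_ENG += translate(element) + ' '
print(CONTENT_ENG.strip())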
utils.py CHANGED

@@ -124,9 +124,12 @@ def extract_from_pdf(url):
         text = pdf_reader.pages[page].extract_text()
         if text and text[0].isdigit():
             text = text[1:]
-        first_newline_index = text.find('。\n')
-        text = text[:first_newline_index+1].replace('\n', '') + text[first_newline_index+1:]
-        extracted_text += text
+        # first_newline_index = text.find('。\n')
+        # text = text[:first_newline_index+1].replace('\n', '') + text[first_newline_index+1:]
+        text = text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n').replace('\n','').replace('?-','?\n').replace('!-','!\n').replace('。-','。\n')
+        print(text)
+        if text != '':
+            extracted_text += text
     try:
         summary = '\n'.join(extracted_text.split('\n')[:2])
     except:
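
The new replacement chain keeps line breaks that end a sentence (after ?, !, or 。) and removes the rest, which are usually PDF layout wraps: sentence-final breaks are first tagged with a '-' marker, every remaining newline is stripped, then the tagged breaks are restored. A standalone sketch of the transformation:

# Keep sentence-ending newlines, drop PDF line-wrap newlines, using '-' as a
# temporary marker exactly as in the commit. Note the marker collides if the
# source text already contains sequences like '。-'.
text = "这一句被\n折行了。\n下一句?\n"
text = (text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n')
            .replace('\n', '')
            .replace('?-', '?\n').replace('!-', '!\n').replace('。-', '。\n'))
print(text)   # 这一句被折行了。\n下一句?\n -- wrap removed, sentence breaks kept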

@@ -202,7 +205,7 @@ def crawl(url, article):
 def upsert_content(report):
     """Upsert the content records"""
     dynamodb = get_db_connection()
-    table = dynamodb.Table('
+    table = dynamodb.Table('article_test')
     # Define the item data
     item = {
         'id': str(report['id']),
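
upsert_content now writes to the article_test table. Assuming get_db_connection returns a boto3 DynamoDB resource, put_item replaces any existing item with the same primary key, so the deterministic uuid5 IDs above make repeated crawls behave as upserts. A hedged sketch (table name from the diff; the region and key schema are assumptions):

import boto3

# put_item overwrites an item that has the same primary key, which is what
# turns deterministic ids into an upsert.
dynamodb = boto3.resource("dynamodb", region_name="us-east-1")  # region assumed
table = dynamodb.Table("article_test")
table.put_item(Item={"id": "example-id", "title": "Example", "content": "..."})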