OxbridgeEconomics committed on
Commit 422b41b · 1 Parent(s): cd41775
Files changed (4)
  1. cbirc.py +40 -40
  2. chinatax.py +20 -14
  3. csrc.py +7 -7
  4. utils.py +7 -4
cbirc.py CHANGED
@@ -22,20 +22,20 @@ while i > -1:
                 i = -1
             else:
                 contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
-                article['contentCN'] = repr(contentCN)
+                article['contentCN'] = repr(contentCN)[1:-1].strip()
                 if len(contentCN) < 10:
                     continue
                 CONTENT_ENG = ''
                 for element in article['contentCN'].split("\n"):
                     CONTENT_ENG += translate(element) + '\n'
-                article['content'] = repr(CONTENT_ENG)
+                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                 article['site'] = "National Financial Regulatory Administration of China"
                 article['originSite'] = "国家金融监督管理总局"
                 article['titleCN'] = article['docSubtitle']
                 article['title'] = translate(article['docSubtitle'])
                 article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
                 article['category']= "Policy Interpretation"
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                 article['attachment'] = ''
                 article['author'] = ''
@@ -45,40 +45,40 @@ while i > -1:
             print(error)


-ssl._create_default_https_context = ssl._create_stdlib_context
-i = 0
-while i > -1:
-    CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
-    i = i + 1
-    urllib3.disable_warnings()
-    try:
-        req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
-    except:
-        break
-    content = req.read().decode("utf-8")
-    reportinfo = json.loads(content)
-    for article in reportinfo['searchResultAll']['searchTotal']:
-        try:
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=183)):
-                i = -1
-            else:
-                article['originalContent'] = article['content'].replace('\\u','')
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "State Taxation Administration of China"
-                article['originalSite'] = "国家税务总局"
-                article['originalTitle'] = article['title']
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = article['snapshotUrl']
-                article['category']= "Policy Interpretation"
-                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S"))
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
-        except Exception as error:
-            print(error)
+# ssl._create_default_https_context = ssl._create_stdlib_context
+# i = 0
+# while i > -1:
+#     CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
+#     i = i + 1
+#     urllib3.disable_warnings()
+#     try:
+#         req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
+#     except:
+#         break
+#     content = req.read().decode("utf-8")
+#     reportinfo = json.loads(content)
+#     for article in reportinfo['searchResultAll']['searchTotal']:
+#         try:
+#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
+#                 i = -1
+#             else:
+#                 article['originalContent'] = article['content'].replace('\\u','')
+#                 if len(article['originalContent']) < 10:
+#                     continue
+#                 CONTENT_ENG = ''
+#                 for element in article['originalContent'].split("。"):
+#                     CONTENT_ENG += translate(element) + ' '
+#                 article['content'] = CONTENT_ENG
+#                 article['site'] = "State Taxation Administration of China"
+#                 article['originalSite'] = "国家税务总局"
+#                 article['originalTitle'] = article['title']
+#                 article['title'] = translate(article['originalTitle'])
+#                 article['url'] = article['snapshotUrl']
+#                 article['category']= "Policy Interpretation"
+#                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S"))
+#                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+#                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+#                 upsert_content(article)
+#         except Exception as error:
+#             print(error)
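
Note on the escaping idiom introduced in this file: repr() renders a Python string with its control characters as visible escape sequences, and the [1:-1] slice drops the surrounding quotes that repr() adds, leaving a single-line payload. A minimal standalone sketch (the sample string is invented for illustration):

# Sketch of the repr-based escaping used for contentCN and content above.
raw = "第一段\n第二段\t结尾"           # invented sample text
escaped = repr(raw)[1:-1].strip()      # repr() -> quoted, escaped string; [1:-1] drops the quotes
print(escaped)                         # 第一段\n第二段\t结尾 on one line, escapes kept literal
assert "\n" not in escaped             # real control characters are gone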
chinatax.py CHANGED
@@ -6,7 +6,7 @@ import time
 import urllib.request
 import urllib3
 from lxml import etree
-from utils import encode, translate, sentiment_computation, upsert_content
+from utils import encode, translate, sentiment_computation, upsert_content, encode_content
 
 ssl._create_default_https_context = ssl._create_stdlib_context
 
@@ -25,22 +25,25 @@ while i > -1:
                 print(parsed_datetime)
                 i = -1
             else:
-                article['originalContent'] = article['content'].replace('\\u','')
-                if len(article['originalContent']) < 10:
+                article['category']= "Policy Interpretation"
+                contentCN = article['content'].replace('\\u','')
+                article['contentCN'] = repr(contentCN)[1:-1].strip()
+                if len(contentCN) < 10:
                     continue
                 CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
+                for element in contentCN.split("。"):
                     CONTENT_ENG += translate(element) + ' '
                 article['content'] = CONTENT_ENG
                 article['site'] = "State Taxation Administration of China"
                 article['originalSite'] = "国家税务总局"
-                article['originalTitle'] = article['title']
+                article['titleCN'] = article['title']
                 article['title'] = translate(article['originalTitle'])
                 article['url'] = article['snapshotUrl']
-                article['category']= "Policy Interpretation"
+                article['author'] = ""
+                article['attachment'] = ""
                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'],"%Y-%m-%d %H:%M:%S"))
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n",""))
+                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
                 upsert_content(article)
         except Exception as error:
             print(error)
@@ -73,21 +76,24 @@ while i > -1:
                 text = req.read()
                 html_text = text.decode("utf-8")
                 page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'article')]//p"))
-                if len(article['originalContent']) < 10:
+                contentCN= encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
+                if len(contentCN) < 10:
                     continue
                 CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
+                for element in contentCN.split("。"):
                     CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
+                article['contentCN'] = repr(contentCN)[1:-1].strip()
+                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                 article['site'] = "State Taxation Administration of China"
                 article['originalSite'] = "国家税务总局"
-                article['originalTitle'] = article['title']
+                article['titleCN'] = article['title']
                 article['title'] = translate(article['originalTitle'])
                 article['url'] = article['url']
+                article['attachment'] = ""
+                article['author'] = ""
                 article['category']= "Policy Interpretation"
                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                 upsert_content(article)
         except Exception as error:
csrc.py CHANGED
@@ -52,22 +52,22 @@ while i > -1:
             article['category']= "Financial News"
             article['site'] = "Securities Regulatory Commission of China"
             article['originSite'] = "证监会"
-            article['originTitle'] = article['title']
-            article['title'] = translate(article['originTitle'])
+            article['titleCN'] = article['title']
+            article['title'] = translate(article['titleCN'])
             article['author'] = ''
-            article['originContent'] = repr(article['content'])
-            if len(article['originContent']) < 10:
+            article['contentCN'] = repr(article['content'])[1:-1].strip()
+            if len(article['contentCN']) < 10:
                 continue
             CONTENT_ENG = ''
-            for element in article['originContent'].split("。"):
+            for element in article['contentCN'].split("。"):
                 CONTENT_ENG += translate(element) + ' '
-            article['content'] = repr(CONTENT_ENG)
+            article['content'] = repr(CONTENT_ENG)[1:-1].strip()
             article['subtitle'] = article['memo']
             article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
             article['link'] = article['url']
             article['attachment'] = ""
             article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
             upsert_content(article)
         except Exception as error:
             print(error)
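
For context, the translation loop in this file splits the Chinese text on the full-width full stop and translates sentence by sentence. A runnable sketch with a stand-in translate(), since the real helper lives in utils.py:

# translate() here is a hypothetical stub, only for illustration.
def translate(sentence: str) -> str:
    return f"<en:{sentence}>" if sentence else ""

contentCN = "第一句。第二句。第三句。"     # invented sample input
CONTENT_ENG = ''
for element in contentCN.split("。"):     # "。" is the full-width Chinese full stop
    CONTENT_ENG += translate(element) + ' '
print(CONTENT_ENG.strip())                # <en:第一句> <en:第二句> <en:第三句>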
utils.py CHANGED
@@ -124,9 +124,12 @@ def extract_from_pdf(url):
         text = pdf_reader.pages[page].extract_text()
         if text and text[0].isdigit():
             text = text[1:]
-        first_newline_index = text.find('\n')
-        text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:]
-        extracted_text += text
+        # first_newline_index = text.find('。\n')
+        # text = text[:first_newline_index+1].replace('\n', '') + text[first_newline_index+1:]
+        text = text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n').replace('\n','').replace('?-','?\n').replace('!-','!\n').replace('。-','。\n')
+        print(text)
+        if text != '':
+            extracted_text += text
     try:
         summary = '\n'.join(extracted_text.split('\n')[:2])
     except:
@@ -202,7 +205,7 @@ def crawl(url, article):
 def upsert_content(report):
     """Upsert the content records"""
     dynamodb = get_db_connection()
-    table = dynamodb.Table('article_china')
+    table = dynamodb.Table('article_test')
     # Define the item data
     item = {
         'id': str(report['id']),
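
The chained replace() added to extract_from_pdf keeps newlines only at sentence boundaries: newlines that follow ?, !, or 。 are tagged with a '-' sentinel, every remaining (layout-induced) newline is dropped, and the tagged ones are restored. A standalone trace, assuming the sentinel pairs '?-', '!-' and '。-' never occur naturally in the extracted text:

# Trace of the sentence-boundary newline repair.
text = "这一行被\n排版截断。\n下一句?\n结尾"   # invented sample PDF text
fixed = (text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n')  # tag sentence-final breaks
             .replace('\n', '')                                                      # drop layout breaks
             .replace('?-', '?\n').replace('!-', '!\n').replace('。-', '。\n'))       # restore the tagged ones
print(fixed)   # 这一行被排版截断。\n下一句?\n结尾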