OxbridgeEconomics committed on
Commit
043eca4
·
1 Parent(s): d0ddd7b
Files changed (2) hide show
  1. chinatax.py +10 -9
  2. mof.py +4 -0
chinatax.py CHANGED
@@ -28,12 +28,12 @@ while i > -1:
28
  article['category']= "Policy Interpretation"
29
  contentCN = article['content'].replace('\\u','')
30
  article['contentCN'] = repr(contentCN)[1:-1].strip()
31
- if len(contentCN) < 10:
32
  continue
33
  CONTENT_ENG = ''
34
- for element in contentCN.split(""):
35
- CONTENT_ENG += translate(element) + ' '
36
- article['content'] = CONTENT_ENG
37
  article['site'] = "State Taxation Administration of China"
38
  article['originalSite'] = "国家税务总局"
39
  article['titleCN'] = article['title']
@@ -76,19 +76,20 @@ while i > -1:
76
  text = req.read()
77
  html_text = text.decode("utf-8")
78
  page = etree.HTML(html_text)
79
- contentCN= encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
80
- if len(contentCN) < 10:
 
81
  continue
82
  CONTENT_ENG = ''
83
- for element in contentCN.split(""):
84
- CONTENT_ENG += translate(element) + ' '
85
- article['contentCN'] = repr(contentCN)[1:-1].strip()
86
  article['content'] = repr(CONTENT_ENG)[1:-1].strip()
87
  article['site'] = "State Taxation Administration of China"
88
  article['originalSite'] = "国家税务总局"
89
  article['titleCN'] = article['title']
90
  article['title'] = translate(article['originalTitle'])
91
  article['url'] = article['url']
 
92
  article['attachment'] = ""
93
  article['author'] = ""
94
  article['category']= "Policy Interpretation"
 
28
  article['category']= "Policy Interpretation"
29
  contentCN = article['content'].replace('\\u','')
30
  article['contentCN'] = repr(contentCN)[1:-1].strip()
31
+ if len(article['contentCN']) < 10:
32
  continue
33
  CONTENT_ENG = ''
34
+ for element in contentCN.split("\n"):
35
+ CONTENT_ENG += translate(element) + '\n'
36
+ article['content'] = repr(CONTENT_ENG)[1:-1].strip()
37
  article['site'] = "State Taxation Administration of China"
38
  article['originalSite'] = "国家税务总局"
39
  article['titleCN'] = article['title']
 
76
  text = req.read()
77
  html_text = text.decode("utf-8")
78
  page = etree.HTML(html_text)
79
+ contentCN, summary = encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
80
+ article['contentCN'] = repr(contentCN)[1:-1].strip()
81
+ if len(article['contentCN']) < 10:
82
  continue
83
  CONTENT_ENG = ''
84
+ for element in contentCN.split("\n"):
85
+ CONTENT_ENG += translate(element) + '\n'
 
86
  article['content'] = repr(CONTENT_ENG)[1:-1].strip()
87
  article['site'] = "State Taxation Administration of China"
88
  article['originalSite'] = "国家税务总局"
89
  article['titleCN'] = article['title']
90
  article['title'] = translate(article['originalTitle'])
91
  article['url'] = article['url']
92
+ article['subtitle'] = translate(summary)
93
  article['attachment'] = ""
94
  article['author'] = ""
95
  article['category']= "Policy Interpretation"
mof.py CHANGED
@@ -11,6 +11,7 @@ while i > -1:
11
  else:
12
  CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
13
  i = i + 1
 
14
  req = urllib.request.urlopen(CATEGORY_URL)
15
  text = req.read()
16
  html_text = text.decode("utf-8")
@@ -31,6 +32,7 @@ while i > -1:
31
  article = {}
32
  url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
33
  url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
 
34
  article['category']= "Financial News"
35
  crawl(url, article)
36
  except Exception as error:
@@ -43,6 +45,7 @@ while i > -1:
43
  else:
44
  CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
45
  i = i + 1
 
46
  req = urllib.request.urlopen(CATEGORY_URL)
47
  text = req.read()
48
  html_text = text.decode("utf-8")
@@ -63,6 +66,7 @@ while i > -1:
63
  article = {}
64
  url = url.replace("./", CATEGORY_URL)
65
  article['category']= "Policy Interpretation"
 
66
  crawl(url, article)
67
  except Exception as error:
68
  print(error)
 
11
  else:
12
  CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
13
  i = i + 1
14
+ print(CATEGORY_URL)
15
  req = urllib.request.urlopen(CATEGORY_URL)
16
  text = req.read()
17
  html_text = text.decode("utf-8")
 
32
  article = {}
33
  url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
34
  url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
35
+ print(url)
36
  article['category']= "Financial News"
37
  crawl(url, article)
38
  except Exception as error:
 
45
  else:
46
  CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
47
  i = i + 1
48
+ print(CATEGORY_URL)
49
  req = urllib.request.urlopen(CATEGORY_URL)
50
  text = req.read()
51
  html_text = text.decode("utf-8")
 
66
  article = {}
67
  url = url.replace("./", CATEGORY_URL)
68
  article['category']= "Policy Interpretation"
69
+ print(url)
70
  crawl(url, article)
71
  except Exception as error:
72
  print(error)