OxbridgeEconomics
committed on
Commit
·
043eca4
1
Parent(s):
d0ddd7b
commit
Browse files
- chinatax.py +10 -9
- mof.py +4 -0
chinatax.py
CHANGED
@@ -28,12 +28,12 @@ while i > -1:
|
|
28 |
article['category']= "Policy Interpretation"
|
29 |
contentCN = article['content'].replace('\\u','')
|
30 |
article['contentCN'] = repr(contentCN)[1:-1].strip()
|
31 |
-
if len(contentCN) < 10:
|
32 |
continue
|
33 |
CONTENT_ENG = ''
|
34 |
-
for element in contentCN.split("
|
35 |
-
CONTENT_ENG += translate(element) + '
|
36 |
-
article['content'] = CONTENT_ENG
|
37 |
article['site'] = "State Taxation Administration of China"
|
38 |
article['originalSite'] = "国家税务总局"
|
39 |
article['titleCN'] = article['title']
|
@@ -76,19 +76,20 @@ while i > -1:
|
|
76 |
text = req.read()
|
77 |
html_text = text.decode("utf-8")
|
78 |
page = etree.HTML(html_text)
|
79 |
-
contentCN= encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
|
80 |
-
|
|
|
81 |
continue
|
82 |
CONTENT_ENG = ''
|
83 |
-
for element in contentCN.split("
|
84 |
-
CONTENT_ENG += translate(element) + '
|
85 |
-
article['contentCN'] = repr(contentCN)[1:-1].strip()
|
86 |
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
|
87 |
article['site'] = "State Taxation Administration of China"
|
88 |
article['originalSite'] = "国家税务总局"
|
89 |
article['titleCN'] = article['title']
|
90 |
article['title'] = translate(article['originalTitle'])
|
91 |
article['url'] = article['url']
|
|
|
92 |
article['attachment'] = ""
|
93 |
article['author'] = ""
|
94 |
article['category']= "Policy Interpretation"
|
|
|
28 |
article['category']= "Policy Interpretation"
|
29 |
contentCN = article['content'].replace('\\u','')
|
30 |
article['contentCN'] = repr(contentCN)[1:-1].strip()
|
31 |
+
if len(article['contentCN']) < 10:
|
32 |
continue
|
33 |
CONTENT_ENG = ''
|
34 |
+
for element in contentCN.split("\n"):
|
35 |
+
CONTENT_ENG += translate(element) + '\n'
|
36 |
+
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
|
37 |
article['site'] = "State Taxation Administration of China"
|
38 |
article['originalSite'] = "国家税务总局"
|
39 |
article['titleCN'] = article['title']
|
|
|
76 |
text = req.read()
|
77 |
html_text = text.decode("utf-8")
|
78 |
page = etree.HTML(html_text)
|
79 |
+
contentCN, summary = encode_content(page.xpath("//div[contains(@class, 'article')]//p"))
|
80 |
+
article['contentCN'] = repr(contentCN)[1:-1].strip()
|
81 |
+
if len(article['contentCN']) < 10:
|
82 |
continue
|
83 |
CONTENT_ENG = ''
|
84 |
+
for element in contentCN.split("\n"):
|
85 |
+
CONTENT_ENG += translate(element) + '\n'
|
|
|
86 |
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
|
87 |
article['site'] = "State Taxation Administration of China"
|
88 |
article['originalSite'] = "国家税务总局"
|
89 |
article['titleCN'] = article['title']
|
90 |
article['title'] = translate(article['originalTitle'])
|
91 |
article['url'] = article['url']
|
92 |
+
article['subtitle'] = translate(summary)
|
93 |
article['attachment'] = ""
|
94 |
article['author'] = ""
|
95 |
article['category']= "Policy Interpretation"
|
mof.py
CHANGED
@@ -11,6 +11,7 @@ while i > -1:
|
|
11 |
else:
|
12 |
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
|
13 |
i = i + 1
|
|
|
14 |
req = urllib.request.urlopen(CATEGORY_URL)
|
15 |
text = req.read()
|
16 |
html_text = text.decode("utf-8")
|
@@ -31,6 +32,7 @@ while i > -1:
|
|
31 |
article = {}
|
32 |
url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
|
33 |
url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
|
|
|
34 |
article['category']= "Financial News"
|
35 |
crawl(url, article)
|
36 |
except Exception as error:
|
@@ -43,6 +45,7 @@ while i > -1:
|
|
43 |
else:
|
44 |
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
|
45 |
i = i + 1
|
|
|
46 |
req = urllib.request.urlopen(CATEGORY_URL)
|
47 |
text = req.read()
|
48 |
html_text = text.decode("utf-8")
|
@@ -63,6 +66,7 @@ while i > -1:
|
|
63 |
article = {}
|
64 |
url = url.replace("./", CATEGORY_URL)
|
65 |
article['category']= "Policy Interpretation"
|
|
|
66 |
crawl(url, article)
|
67 |
except Exception as error:
|
68 |
print(error)
|
|
|
11 |
else:
|
12 |
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
|
13 |
i = i + 1
|
14 |
+
print(CATEGORY_URL)
|
15 |
req = urllib.request.urlopen(CATEGORY_URL)
|
16 |
text = req.read()
|
17 |
html_text = text.decode("utf-8")
|
|
|
32 |
article = {}
|
33 |
url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
|
34 |
url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
|
35 |
+
print(url)
|
36 |
article['category']= "Financial News"
|
37 |
crawl(url, article)
|
38 |
except Exception as error:
|
|
|
45 |
else:
|
46 |
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
|
47 |
i = i + 1
|
48 |
+
print(CATEGORY_URL)
|
49 |
req = urllib.request.urlopen(CATEGORY_URL)
|
50 |
text = req.read()
|
51 |
html_text = text.decode("utf-8")
|
|
|
66 |
article = {}
|
67 |
url = url.replace("./", CATEGORY_URL)
|
68 |
article['category']= "Policy Interpretation"
|
69 |
+
print(url)
|
70 |
crawl(url, article)
|
71 |
except Exception as error:
|
72 |
print(error)
|