OxbridgeEconomics committed on
Commit f801221 · unverified · 1 Parent(s): 71e720e

Update mof.py

Files changed (1): mof.py (+119 -119)
mof.py CHANGED
@@ -5,63 +5,113 @@ from lxml import etree
 from datetime import datetime, timedelta
 from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, extract_from_pdf
 
-# i = 0
-# while i > -1:
-#     if i == 0:
-#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
-#     else:
-#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
-#     i = i + 1
-#     req = urllib.request.urlopen(CATEGORY_URL)
-#     text = req.read()
-#     html_text = text.decode("utf-8")
-#     page = etree.HTML(html_text)
-#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-#     for article in articlelist:
-#         if isinstance(article, etree._Element):
-#             subelement = etree.tostring(article).decode()
-#             subpage = etree.HTML(subelement)
-#             date = subpage.xpath("//span/text()")[0]
-#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
-#                 i = -1
-#             else:
-#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-#                 for url in urls:
-#                     try:
-#                         article = {}
-#                         url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
-#                         url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-#                         req = urllib.request.urlopen(url)
-#                         text = req.read()
-#                         html_text = text.decode("utf-8")
-#                         page = etree.HTML(html_text)
-#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-#                         if len(article['originalContent']) < 10:
-#                             continue
-#                         CONTENT_ENG = ''
-#                         for element in article['originalContent'].split("。"):
-#                             CONTENT_ENG += translate(element) + ' '
-#                         article['content'] = CONTENT_ENG
-#                         article['site'] = "Ministry of Finance"
-#                         article['originalSite'] = "财政部"
-#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-#                         article['title'] = translate(article['originalTitle'])
-#                         article['url'] = url
-#                         article['category']= "Financial News"
-#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
-#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-#                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-#                         upsert_content(article)
-#                     except Exception as error:
-#                         print(error)
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+    else:
+        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        if len(article['originalContent']) < 10:
+                            continue
+                        CONTENT_ENG = ''
+                        for element in article['originalContent'].split("。"):
+                            CONTENT_ENG += translate(element) + ' '
+                        article['content'] = CONTENT_ENG
+                        article['site'] = "Ministry of Finance"
+                        article['originalSite'] = "财政部"
+                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                        article['title'] = translate(article['originalTitle'])
+                        article['url'] = url
+                        article['category']= "Financial News"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                        upsert_content(article)
+                    except Exception as error:
+                        print(error)
+
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+    else:
+        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace("./", CATEGORY_URL)
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        if len(article['originalContent']) < 10:
+                            continue
+                        CONTENT_ENG = ''
+                        for element in article['originalContent'].split("。"):
+                            CONTENT_ENG += translate(element) + ' '
+                        article['content'] = CONTENT_ENG
+                        article['site'] = "Ministry of Finance"
+                        article['originalSite'] = "财政部"
+                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                        article['title'] = translate(article['originalTitle'])
+                        article['url'] = url
+                        article['category']= "Policy Interpretation"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                        upsert_content(article)
+                    except Exception as error:
+                        print(error)
 
 # i = 0
 # while i > -1:
 #     if i == 0:
-#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"
 #     else:
-#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/index_{i}.htm"
 #     i = i + 1
 #     req = urllib.request.urlopen(CATEGORY_URL)
 #     text = req.read()
@@ -82,11 +132,25 @@ from utils import encode, translate, datemodifier, sentiment_computation, upsert
 #                     try:
 #                         article = {}
 #                         url = url.replace("./", CATEGORY_URL)
+#                         print(url)
 #                         req = urllib.request.urlopen(url)
 #                         text = req.read()
 #                         html_text = text.decode("utf-8")
 #                         page = etree.HTML(html_text)
-#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+#                         attachments = page.xpath("//span[contains(@id, 'appendix1')]/a/@href")
+#                         print(attachments)
+#                         if len(attachments) > 0:
+#                             for attachment_url in attachments:
+#                                 if '.pdf' in attachment_url:
+#                                     attachment_url = attachment_url.replace("./", "https://zyhj.mof.gov.cn/zcfb/202403/")
+#                                     article['originalContent'] = extract_from_pdf(attachment_url)
+#                                 if '.doc' in attachment_url:
+#                                     continue
+#                                 if '.docx' in attachment_url:
+#                                     continue
+#                         else:
+#                             article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]/p[@align='justify']"))
+#                         print(article['originalContent'])
 #                         if len(article['originalContent']) < 10:
 #                             continue
 #                         CONTENT_ENG = ''
@@ -98,74 +162,10 @@ from utils import encode, translate, datemodifier, sentiment_computation, upsert
 #                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
 #                         article['title'] = translate(article['originalTitle'])
 #                         article['url'] = url
-#                         article['category']= "Policy Interpretation"
+#                         article['category']= "Policy Release"
 #                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
 #                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
 #                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-#                         upsert_content(article)
+#                         # upsert_content(article)
 #                     except Exception as error:
 #                         print(error)
-
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"
-    else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/index_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=183)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace("./", CATEGORY_URL)
-                        print(url)
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        attachments = page.xpath("//span[contains(@id, 'appendix1')]/a/@href")
-                        print(attachments)
-                        if len(attachments) > 0:
-                            for attachment_url in attachments:
-                                if '.pdf' in attachment_url:
-                                    attachment_url = attachment_url.replace("./", "https://zyhj.mof.gov.cn/zcfb/202403/")
-                                    article['originalContent'] = extract_from_pdf(attachment_url)
-                                if '.doc' in attachment_url:
-                                    continue
-                                if '.docx' in attachment_url:
-                                    continue
-                        else:
-                            article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]/p[@align='justify']"))
-                        print(article['originalContent'])
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "Ministry of Finance"
-                        article['originalSite'] = "财政部"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
-                        article['category']= "Policy Release"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
-                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                        # upsert_content(article)
-                    except Exception as error:
-                        print(error)
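
Review note: the two crawlers enabled by this commit are identical except for the listing URL, the category label, and the relative-URL fixups, so the duplication could be factored into one helper. A minimal sketch, assuming the utils helpers (encode, translate, datemodifier, sentiment_computation, upsert_content) behave as they are used in the diff; crawl_category and process_article are illustrative names, not functions in this repo, and the date parse is simplified to a single strptime. The commented-out zhengcefabu variant would additionally need the attachment handling discussed below.

import urllib.request
import uuid
from datetime import datetime, timedelta

from lxml import etree

from utils import encode, translate, datemodifier, sentiment_computation, upsert_content


def crawl_category(base_url, category, extra_fixups=(), cutoff_days=183):
    """Walk a paginated MOF listing until entries fall outside the date window."""
    i = 0
    while i > -1:
        listing_url = base_url if i == 0 else f"{base_url}index_{i}.htm"
        i += 1
        page = etree.HTML(urllib.request.urlopen(listing_url).read().decode("utf-8"))
        for item in page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]"):
            sub = etree.HTML(etree.tostring(item).decode())
            published = datetime.strptime(sub.xpath("//span/text()")[0], "%Y-%m-%d")
            if published < datetime.today() - timedelta(days=cutoff_days):
                i = -1  # listings run newest-first, so stop paginating here
                continue
            for url in sub.xpath("//a[contains(@target, '_blank')]/@href"):
                for old, new in extra_fixups:  # per-category relative-URL rewrites
                    url = url.replace(old, new)
                url = url.replace("./", base_url)
                try:
                    process_article(url, category)
                except Exception as error:
                    print(error)


def process_article(url, category):
    """Fetch one article page and build the same payload the loops above build."""
    page = etree.HTML(urllib.request.urlopen(url).read().decode("utf-8"))
    article = {
        'url': url,
        'category': category,
        'site': "Ministry of Finance",
        'originalSite': "财政部",
        'originalContent': encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p")),
    }
    if len(article['originalContent']) < 10:
        return
    article['content'] = ' '.join(translate(s) for s in article['originalContent'].split("。"))
    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
    article['title'] = translate(article['originalTitle'])
    article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
    upsert_content(article)


crawl_category("https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/", "Financial News",
               extra_fixups=[("../", "https://www.mof.gov.cn/zhengwuxinxi/")])
crawl_category("https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/", "Policy Interpretation")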
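
Review note on the commented-out zhengcefabu attachment logic: the substring test '.doc' in attachment_url is also true for every '.docx' URL, which makes the separate '.docx' branch unreachable (harmless here, since both branches skip, but a trap if they ever diverge). A suffix test on the parsed URL path avoids this; attachment_extension below is a hypothetical helper, not repo code.

from urllib.parse import urlparse

def attachment_extension(url):
    # Test the path's suffix rather than substring membership, so ".docx"
    # is not caught by the ".doc" test and query strings are ignored.
    path = urlparse(url).path.lower()
    for ext in ('.pdf', '.docx', '.doc'):
        if path.endswith(ext):
            return ext
    return None

# Usage matching the intent of the commented block: only hand .pdf
# attachments to extract_from_pdf and skip .doc/.docx, e.g.
# if attachment_extension(attachment_url) == '.pdf': ...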