OxbridgeEconomics committed on
Commit 71e720e · 1 Parent(s): 1580b60
Files changed (3):
  1. .gitignore +2 -1
  2. mof.py +121 -120
  3. utils.py +1 -1
.gitignore CHANGED
@@ -1,2 +1,3 @@
 env
-__pycache__
+__pycache__
+downloaded_file.pdf
mof.py CHANGED
@@ -3,65 +3,115 @@ import time
 import urllib.request
 from lxml import etree
 from datetime import datetime, timedelta
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
+from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, extract_from_pdf
 
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
-    else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=183)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
-                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "Ministry of Finance"
-                        article['originalSite'] = "财政部"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
-                        article['category']= "Financial News"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
-                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                        upsert_content(article)
-                    except Exception as error:
-                        print(error)
+# i = 0
+# while i > -1:
+#     if i == 0:
+#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+#     else:
+#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+#     i = i + 1
+#     req = urllib.request.urlopen(CATEGORY_URL)
+#     text = req.read()
+#     html_text = text.decode("utf-8")
+#     page = etree.HTML(html_text)
+#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+#     for article in articlelist:
+#         if isinstance(article, etree._Element):
+#             subelement = etree.tostring(article).decode()
+#             subpage = etree.HTML(subelement)
+#             date = subpage.xpath("//span/text()")[0]
+#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
+#                 i = -1
+#             else:
+#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+#                 for url in urls:
+#                     try:
+#                         article = {}
+#                         url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+#                         url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+#                         req = urllib.request.urlopen(url)
+#                         text = req.read()
+#                         html_text = text.decode("utf-8")
+#                         page = etree.HTML(html_text)
+#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+#                         if len(article['originalContent']) < 10:
+#                             continue
+#                         CONTENT_ENG = ''
+#                         for element in article['originalContent'].split("。"):
+#                             CONTENT_ENG += translate(element) + ' '
+#                         article['content'] = CONTENT_ENG
+#                         article['site'] = "Ministry of Finance"
+#                         article['originalSite'] = "财政部"
+#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+#                         article['title'] = translate(article['originalTitle'])
+#                         article['url'] = url
+#                         article['category']= "Financial News"
+#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
+#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+#                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+#                         upsert_content(article)
+#                     except Exception as error:
+#                         print(error)
 
+# i = 0
+# while i > -1:
+#     if i == 0:
+#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+#     else:
+#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+#     i = i + 1
+#     req = urllib.request.urlopen(CATEGORY_URL)
+#     text = req.read()
+#     html_text = text.decode("utf-8")
+#     page = etree.HTML(html_text)
+#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+#     for article in articlelist:
+#         if isinstance(article, etree._Element):
+#             subelement = etree.tostring(article).decode()
+#             subpage = etree.HTML(subelement)
+#             date = subpage.xpath("//span/text()")[0]
+#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
+#                 i = -1
+#             else:
+#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+#                 for url in urls:
+#                     try:
+#                         article = {}
+#                         url = url.replace("./", CATEGORY_URL)
+#                         req = urllib.request.urlopen(url)
+#                         text = req.read()
+#                         html_text = text.decode("utf-8")
+#                         page = etree.HTML(html_text)
+#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+#                         if len(article['originalContent']) < 10:
+#                             continue
+#                         CONTENT_ENG = ''
+#                         for element in article['originalContent'].split("。"):
+#                             CONTENT_ENG += translate(element) + ' '
+#                         article['content'] = CONTENT_ENG
+#                         article['site'] = "Ministry of Finance"
+#                         article['originalSite'] = "财政部"
+#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+#                         article['title'] = translate(article['originalTitle'])
+#                         article['url'] = url
+#                         article['category']= "Policy Interpretation"
+#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
+#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+#                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+#                         upsert_content(article)
+#                     except Exception as error:
+#                         print(error)
+
 i = 0
 while i > -1:
     if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"
     else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/index_{i}.htm"
     i = i + 1
     req = urllib.request.urlopen(CATEGORY_URL)
     text = req.read()
@@ -82,11 +132,25 @@ while i > -1:
                     try:
                         article = {}
                         url = url.replace("./", CATEGORY_URL)
+                        print(url)
                         req = urllib.request.urlopen(url)
                         text = req.read()
                         html_text = text.decode("utf-8")
                         page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        attachments = page.xpath("//span[contains(@id, 'appendix1')]/a/@href")
+                        print(attachments)
+                        if len(attachments) > 0:
+                            for attachment_url in attachments:
+                                if '.pdf' in attachment_url:
+                                    attachment_url = attachment_url.replace("./", "https://zyhj.mof.gov.cn/zcfb/202403/")
+                                    article['originalContent'] = extract_from_pdf(attachment_url)
+                                if '.doc' in attachment_url:
+                                    continue
+                                if '.docx' in attachment_url:
+                                    continue
+                        else:
+                            article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]/p[@align='justify']"))
+                        print(article['originalContent'])
                         if len(article['originalContent']) < 10:
                             continue
                         CONTENT_ENG = ''
@@ -98,73 +162,10 @@ while i > -1:
                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
                         article['title'] = translate(article['originalTitle'])
                         article['url'] = url
-                        article['category']= "Policy Interpretation"
+                        article['category']= "Policy Release"
                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                        upsert_content(article)
+                        # upsert_content(article)
                     except Exception as error:
-                        print(error)
-
-
-# categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"]
-# for categoryu_url in categoryu_urls:
-#     req = urllib.request.urlopen(categoryu_url)
-#     text = req.read()
-#     html_text = text.decode("utf-8")
-#     page = etree.HTML(html_text)
-#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-#     for article in articlelist:
-#         if isinstance(article, etree._Element):
-#             subelement = etree.tostring(article).decode()
-#             subpage = etree.HTML(subelement)
-#             date = subpage.xpath("//span/text()")[0]
-#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-#             if parsed_datetime > (datetime.today() - timedelta(days=183)):
-#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-#                 for url in urls:
-#                     try:
-#                         article = {}
-#                         url = url.replace("./", categoryu_url)
-#                         req = urllib.request.urlopen(url)
-#                         text = req.read()
-#                         html_text = text.decode("utf-8")
-#                         page = etree.HTML(html_text)
-#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-#                         content_eng = ''
-#                         for element in article['originalContent'].split("。"):
-#                             content_eng += translator.translate(element, dest='en').text + ' '
-#                         article['content'] = content_eng
-#                         article['site'] = "Ministry of Finance"
-#                         article['originalSite'] = "财政部"
-#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-#                         article['title'] = translator.translate(article['originalTitle'], dest='en').text
-#                         article['url'] = url
-#                         article['category']= "Policy Release"
-#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
-#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-#                         label_dict = {
-#                             "positive": "+",
-#                             "negative": "-",
-#                             "neutral": "0",
-#                         }
-#                         sentiment_score = 0
-#                         maximum_value = 0
-#                         raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
-#                         sentiment_label = None
-#                         for sentiment_dict in raw_sentiment[0]:
-#                             value = sentiment_dict["score"]
-#                             if value > maximum_value:
-#                                 sentiment_label = sentiment_dict["label"]
-#                                 maximum_value = value
-#                             if sentiment_dict["label"] == "positive":
-#                                 sentiment_score = sentiment_score + value
-#                             if sentiment_dict["label"] == "negative":
-#                                 sentiment_score = sentiment_score - value
-#                             else:
-#                                 sentiment_score = sentiment_score + 0
-#                         article['sentimentScore'] = sentiment_score
-#                         article['sentimentLabel'] = label_dict[sentiment_label]
-#                         upsert_content(article)
-#                     except Exception as error:
-#                         print(error)
+                        print(error)
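
Note on the new attachment handling in mof.py: Policy Release articles now prefer a PDF attachment (the appendix1 span) over the page body, while .doc/.docx attachments are skipped. Two details worth flagging: since '.doc' is a substring of '.docx', the first check already skips both and the separate '.docx' branch is redundant; and with upsert_content commented out and print calls added, this commit effectively runs the scraper as a dry run. The helper extract_from_pdf is newly imported from utils, but its implementation is not part of this diff. A minimal sketch of what such a helper could look like, assuming pypdf and the fixed downloaded_file.pdf path this commit adds to .gitignore (the body below is an assumption, not the repo's actual code):

import urllib.request
from pypdf import PdfReader  # assumed dependency; the commit does not show the real one

def extract_from_pdf(url):
    """Fetch a PDF attachment and return the concatenated text of its pages."""
    # The new .gitignore entry suggests the file lands at a fixed local path.
    urllib.request.urlretrieve(url, "downloaded_file.pdf")
    reader = PdfReader("downloaded_file.pdf")
    text = ''
    for page in reader.pages:
        text += page.extract_text() or ''
    return text

Whatever the real implementation, it must return a string: the caller immediately checks len(article['originalContent']) and splits the result on "。".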
utils.py CHANGED
@@ -46,7 +46,7 @@ def translist(infolist):
 def encode(content):
     """Encode Function"""
     text = ''
-    for element in content[:1]:
+    for element in content:
         if isinstance(element, etree._Element):
             subelement = etree.tostring(element).decode()
             subpage = etree.HTML(subelement)
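
The utils.py change makes encode() iterate over every element it is given instead of only the first, so multi-paragraph articles are no longer truncated to their opening paragraph. A self-contained illustration of the effect (a simplified text-extraction loop, not the repo's full encode):

from lxml import etree

html = etree.HTML("<div class='TRS_Editor'><p>第一段。</p><p>第二段。</p></div>")
paragraphs = html.xpath("//p")

text = ''
for element in paragraphs:  # before this commit: paragraphs[:1]
    text += ''.join(element.itertext())

print(text)  # prints both paragraphs; with [:1] only the first survived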