OxbridgeEconomics committed on
Commit 71e720e · 1 Parent(s): 1580b60
Files changed (3):
  1. .gitignore +2 -1
  2. mof.py +121 -120
  3. utils.py +1 -1
.gitignore CHANGED
@@ -1,2 +1,3 @@
 env
-__pycache__
+__pycache__
+downloaded_file.pdf
mof.py CHANGED
@@ -3,65 +3,115 @@ import time
 import urllib.request
 from lxml import etree
 from datetime import datetime, timedelta
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
+from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, extract_from_pdf
 
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
-    else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=183)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
-                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "Ministry of Finance"
-                        article['originalSite'] = "财政部"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
-                        article['category']= "Financial News"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
-                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                        upsert_content(article)
-                    except Exception as error:
-                        print(error)
+# i = 0
+# while i > -1:
+#     if i == 0:
+#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+#     else:
+#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+#     i = i + 1
+#     req = urllib.request.urlopen(CATEGORY_URL)
+#     text = req.read()
+#     html_text = text.decode("utf-8")
+#     page = etree.HTML(html_text)
+#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+#     for article in articlelist:
+#         if isinstance(article, etree._Element):
+#             subelement = etree.tostring(article).decode()
+#             subpage = etree.HTML(subelement)
+#             date = subpage.xpath("//span/text()")[0]
+#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
+#                 i = -1
+#             else:
+#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+#                 for url in urls:
+#                     try:
+#                         article = {}
+#                         url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+#                         url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+#                         req = urllib.request.urlopen(url)
+#                         text = req.read()
+#                         html_text = text.decode("utf-8")
+#                         page = etree.HTML(html_text)
+#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+#                         if len(article['originalContent']) < 10:
+#                             continue
+#                         CONTENT_ENG = ''
+#                         for element in article['originalContent'].split("。"):
+#                             CONTENT_ENG += translate(element) + ' '
+#                         article['content'] = CONTENT_ENG
+#                         article['site'] = "Ministry of Finance"
+#                         article['originalSite'] = "财政部"
+#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+#                         article['title'] = translate(article['originalTitle'])
+#                         article['url'] = url
+#                         article['category']= "Financial News"
+#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
+#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+#                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+#                         upsert_content(article)
+#                     except Exception as error:
+#                         print(error)
 
+# i = 0
+# while i > -1:
+#     if i == 0:
+#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+#     else:
+#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+#     i = i + 1
+#     req = urllib.request.urlopen(CATEGORY_URL)
+#     text = req.read()
+#     html_text = text.decode("utf-8")
+#     page = etree.HTML(html_text)
+#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+#     for article in articlelist:
+#         if isinstance(article, etree._Element):
+#             subelement = etree.tostring(article).decode()
+#             subpage = etree.HTML(subelement)
+#             date = subpage.xpath("//span/text()")[0]
+#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
+#                 i = -1
+#             else:
+#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+#                 for url in urls:
+#                     try:
+#                         article = {}
+#                         url = url.replace("./", CATEGORY_URL)
+#                         req = urllib.request.urlopen(url)
+#                         text = req.read()
+#                         html_text = text.decode("utf-8")
+#                         page = etree.HTML(html_text)
+#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+#                         if len(article['originalContent']) < 10:
+#                             continue
+#                         CONTENT_ENG = ''
+#                         for element in article['originalContent'].split("。"):
+#                             CONTENT_ENG += translate(element) + ' '
+#                         article['content'] = CONTENT_ENG
+#                         article['site'] = "Ministry of Finance"
+#                         article['originalSite'] = "财政部"
+#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+#                         article['title'] = translate(article['originalTitle'])
+#                         article['url'] = url
+#                         article['category']= "Policy Interpretation"
+#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
+#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+#                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+#                         upsert_content(article)
+#                     except Exception as error:
+#                         print(error)
+
 i = 0
 while i > -1:
     if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"
     else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/index_{i}.htm"
     i = i + 1
     req = urllib.request.urlopen(CATEGORY_URL)
     text = req.read()
@@ -82,11 +132,25 @@ while i > -1:
                     try:
                         article = {}
                         url = url.replace("./", CATEGORY_URL)
+                        print(url)
                         req = urllib.request.urlopen(url)
                         text = req.read()
                         html_text = text.decode("utf-8")
                         page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        attachments = page.xpath("//span[contains(@id, 'appendix1')]/a/@href")
+                        print(attachments)
+                        if len(attachments) > 0:
+                            for attachment_url in attachments:
+                                if '.pdf' in attachment_url:
+                                    attachment_url = attachment_url.replace("./", "https://zyhj.mof.gov.cn/zcfb/202403/")
+                                    article['originalContent'] = extract_from_pdf(attachment_url)
+                                if '.doc' in attachment_url:
+                                    continue
+                                if '.docx' in attachment_url:
+                                    continue
+                        else:
+                            article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]/p[@align='justify']"))
+                        print(article['originalContent'])
                         if len(article['originalContent']) < 10:
                             continue
                         CONTENT_ENG = ''
@@ -98,73 +162,10 @@ while i > -1:
                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
                         article['title'] = translate(article['originalTitle'])
                         article['url'] = url
-                        article['category']= "Policy Interpretation"
+                        article['category']= "Policy Release"
                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                        upsert_content(article)
+                        # upsert_content(article)
                     except Exception as error:
-                        print(error)
-
-
-# categoryu_urls = ["https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"]
-# for categoryu_url in categoryu_urls:
-#     req = urllib.request.urlopen(categoryu_url)
-#     text = req.read()
-#     html_text = text.decode("utf-8")
-#     page = etree.HTML(html_text)
-#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-#     for article in articlelist:
-#         if isinstance(article, etree._Element):
-#             subelement = etree.tostring(article).decode()
-#             subpage = etree.HTML(subelement)
-#             date = subpage.xpath("//span/text()")[0]
-#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-#             if parsed_datetime > (datetime.today() - timedelta(days=183)):
-#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-#                 for url in urls:
-#                     try:
-#                         article = {}
-#                         url = url.replace("./", categoryu_url)
-#                         req = urllib.request.urlopen(url)
-#                         text = req.read()
-#                         html_text = text.decode("utf-8")
-#                         page = etree.HTML(html_text)
-#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-#                         content_eng = ''
-#                         for element in article['originalContent'].split("。"):
-#                             content_eng += translator.translate(element, dest='en').text + ' '
-#                         article['content'] = content_eng
-#                         article['site'] = "Ministry of Finance"
-#                         article['originalSite'] = "财政部"
-#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-#                         article['title'] = translator.translate(article['originalTitle'], dest='en').text
-#                         article['url'] = url
-#                         article['category']= "Policy Release"
-#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0])
-#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-#                         label_dict = {
-#                             "positive": "+",
-#                             "negative": "-",
-#                             "neutral": "0",
-#                         }
-#                         sentiment_score = 0
-#                         maximum_value = 0
-#                         raw_sentiment = analyzer(article['content'][:512], return_all_scores=True)
-#                         sentiment_label = None
-#                         for sentiment_dict in raw_sentiment[0]:
-#                             value = sentiment_dict["score"]
-#                             if value > maximum_value:
-#                                 sentiment_label = sentiment_dict["label"]
-#                                 maximum_value = value
-#                             if sentiment_dict["label"] == "positive":
-#                                 sentiment_score = sentiment_score + value
-#                             if sentiment_dict["label"] == "negative":
-#                                 sentiment_score = sentiment_score - value
-#                             else:
-#                                 sentiment_score = sentiment_score + 0
-#                         article['sentimentScore'] = sentiment_score
-#                         article['sentimentLabel'] = label_dict[sentiment_label]
-#                         upsert_content(article)
-#                     except Exception as error:
-#                         print(error)
+                        print(error)
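
Note on the new attachment handling in mof.py: Policy Release articles now prefer a PDF attachment (the appendix1 span) over the page body, while .doc/.docx attachments are skipped. Two details worth flagging: since '.doc' is a substring of '.docx', the first check already skips both and the separate '.docx' branch is redundant; and with upsert_content commented out and print calls added, this commit effectively runs the scraper as a dry run. The helper extract_from_pdf is newly imported from utils, but its implementation is not part of this diff. A minimal sketch of what such a helper could look like, assuming pypdf and the fixed downloaded_file.pdf path this commit adds to .gitignore (the body below is an assumption, not the repo's actual code):

import urllib.request
from pypdf import PdfReader  # assumed dependency; the commit does not show the real one

def extract_from_pdf(url):
    """Fetch a PDF attachment and return the concatenated text of its pages."""
    # The new .gitignore entry suggests the file lands at a fixed local path.
    urllib.request.urlretrieve(url, "downloaded_file.pdf")
    reader = PdfReader("downloaded_file.pdf")
    text = ''
    for page in reader.pages:
        text += page.extract_text() or ''
    return text

Whatever the real implementation, it must return a string: the caller immediately checks len(article['originalContent']) and splits the result on "。".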
utils.py CHANGED
@@ -46,7 +46,7 @@ def translist(infolist):
 def encode(content):
     """Encode Function"""
     text = ''
-    for element in content[:1]:
+    for element in content:
         if isinstance(element, etree._Element):
             subelement = etree.tostring(element).decode()
             subpage = etree.HTML(subelement)
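
The utils.py change makes encode() iterate over every element it is given instead of only the first, so multi-paragraph articles are no longer truncated to their opening paragraph. A self-contained illustration of the effect (a simplified text-extraction loop, not the repo's full encode):

from lxml import etree

html = etree.HTML("<div class='TRS_Editor'><p>第一段。</p><p>第二段。</p></div>")
paragraphs = html.xpath("//p")

text = ''
for element in paragraphs:  # before this commit: paragraphs[:1]
    text += ''.join(element.itertext())

print(text)  # prints both paragraphs; with [:1] only the first survived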