OxbridgeEconomics committed on
Commit f801221 · unverified · 1 Parent(s): 71e720e

Update mof.py

Files changed (1): mof.py (+119 -119)
mof.py CHANGED
@@ -5,63 +5,113 @@ from lxml import etree
 from datetime import datetime, timedelta
 from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, extract_from_pdf
 
-# i = 0
-# while i > -1:
-#     if i == 0:
-#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
-#     else:
-#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
-#     i = i + 1
-#     req = urllib.request.urlopen(CATEGORY_URL)
-#     text = req.read()
-#     html_text = text.decode("utf-8")
-#     page = etree.HTML(html_text)
-#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-#     for article in articlelist:
-#         if isinstance(article, etree._Element):
-#             subelement = etree.tostring(article).decode()
-#             subpage = etree.HTML(subelement)
-#             date = subpage.xpath("//span/text()")[0]
-#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
-#                 i = -1
-#             else:
-#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-#                 for url in urls:
-#                     try:
-#                         article = {}
-#                         url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
-#                         url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-#                         req = urllib.request.urlopen(url)
-#                         text = req.read()
-#                         html_text = text.decode("utf-8")
-#                         page = etree.HTML(html_text)
-#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-#                         if len(article['originalContent']) < 10:
-#                             continue
-#                         CONTENT_ENG = ''
-#                         for element in article['originalContent'].split("。"):
-#                             CONTENT_ENG += translate(element) + ' '
-#                         article['content'] = CONTENT_ENG
-#                         article['site'] = "Ministry of Finance"
-#                         article['originalSite'] = "财政部"
-#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-#                         article['title'] = translate(article['originalTitle'])
-#                         article['url'] = url
-#                         article['category']= "Financial News"
-#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
-#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-#                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-#                         upsert_content(article)
-#                     except Exception as error:
-#                         print(error)
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+    else:
+        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        if len(article['originalContent']) < 10:
+                            continue
+                        CONTENT_ENG = ''
+                        for element in article['originalContent'].split("。"):
+                            CONTENT_ENG += translate(element) + ' '
+                        article['content'] = CONTENT_ENG
+                        article['site'] = "Ministry of Finance"
+                        article['originalSite'] = "财政部"
+                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                        article['title'] = translate(article['originalTitle'])
+                        article['url'] = url
+                        article['category']= "Financial News"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                        upsert_content(article)
+                    except Exception as error:
+                        print(error)
+
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+    else:
+        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace("./", CATEGORY_URL)
+                        req = urllib.request.urlopen(url)
+                        text = req.read()
+                        html_text = text.decode("utf-8")
+                        page = etree.HTML(html_text)
+                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+                        if len(article['originalContent']) < 10:
+                            continue
+                        CONTENT_ENG = ''
+                        for element in article['originalContent'].split("。"):
+                            CONTENT_ENG += translate(element) + ' '
+                        article['content'] = CONTENT_ENG
+                        article['site'] = "Ministry of Finance"
+                        article['originalSite'] = "财政部"
+                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
+                        article['title'] = translate(article['originalTitle'])
+                        article['url'] = url
+                        article['category']= "Policy Interpretation"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                        upsert_content(article)
+                    except Exception as error:
+                        print(error)
 
 # i = 0
 # while i > -1:
 #     if i == 0:
-#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"
 #     else:
-#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/index_{i}.htm"
 #     i = i + 1
 #     req = urllib.request.urlopen(CATEGORY_URL)
 #     text = req.read()
@@ -82,11 +132,25 @@ from utils import encode, translate, datemodifier, sentiment_computation, upsert
 #                     try:
 #                         article = {}
 #                         url = url.replace("./", CATEGORY_URL)
+#                         print(url)
 #                         req = urllib.request.urlopen(url)
 #                         text = req.read()
 #                         html_text = text.decode("utf-8")
 #                         page = etree.HTML(html_text)
-#                         article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
+#                         attachments = page.xpath("//span[contains(@id, 'appendix1')]/a/@href")
+#                         print(attachments)
+#                         if len(attachments) > 0:
+#                             for attachment_url in attachments:
+#                                 if '.pdf' in attachment_url:
+#                                     attachment_url = attachment_url.replace("./", "https://zyhj.mof.gov.cn/zcfb/202403/")
+#                                     article['originalContent'] = extract_from_pdf(attachment_url)
+#                                 if '.doc' in attachment_url:
+#                                     continue
+#                                 if '.docx' in attachment_url:
+#                                     continue
+#                         else:
+#                             article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]/p[@align='justify']"))
+#                         print(article['originalContent'])
 #                         if len(article['originalContent']) < 10:
 #                             continue
 #                         CONTENT_ENG = ''
@@ -98,74 +162,10 @@ from utils import encode, translate, datemodifier, sentiment_computation, upsert
 #                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
 #                         article['title'] = translate(article['originalTitle'])
 #                         article['url'] = url
-#                         article['category']= "Policy Interpretation"
+#                         article['category']= "Policy Release"
 #                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
 #                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
 #                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-#                         upsert_content(article)
+#                         # upsert_content(article)
 #                     except Exception as error:
 #                         print(error)
-
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"
-    else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/index_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=183)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace("./", CATEGORY_URL)
-                        print(url)
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        attachments = page.xpath("//span[contains(@id, 'appendix1')]/a/@href")
-                        print(attachments)
-                        if len(attachments) > 0:
-                            for attachment_url in attachments:
-                                if '.pdf' in attachment_url:
-                                    attachment_url = attachment_url.replace("./", "https://zyhj.mof.gov.cn/zcfb/202403/")
-                                    article['originalContent'] = extract_from_pdf(attachment_url)
-                                if '.doc' in attachment_url:
-                                    continue
-                                if '.docx' in attachment_url:
-                                    continue
-                        else:
-                            article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]/p[@align='justify']"))
-                        print(article['originalContent'])
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "Ministry of Finance"
-                        article['originalSite'] = "财政部"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
-                        article['category']= "Policy Release"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
-                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                        # upsert_content(article)
-                    except Exception as error:
-                        print(error)
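
Review note: the two crawlers enabled by this commit are identical except for the listing URL, the category label, and the relative-URL fixups, so the duplication could be factored into one helper. A minimal sketch, assuming the utils helpers (encode, translate, datemodifier, sentiment_computation, upsert_content) behave as they are used in the diff; crawl_category and process_article are illustrative names, not functions in this repo, and the date parse is simplified to a single strptime. The commented-out zhengcefabu variant would additionally need the attachment handling discussed below.

import urllib.request
import uuid
from datetime import datetime, timedelta

from lxml import etree

from utils import encode, translate, datemodifier, sentiment_computation, upsert_content


def crawl_category(base_url, category, extra_fixups=(), cutoff_days=183):
    """Walk a paginated MOF listing until entries fall outside the date window."""
    i = 0
    while i > -1:
        listing_url = base_url if i == 0 else f"{base_url}index_{i}.htm"
        i += 1
        page = etree.HTML(urllib.request.urlopen(listing_url).read().decode("utf-8"))
        for item in page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]"):
            sub = etree.HTML(etree.tostring(item).decode())
            published = datetime.strptime(sub.xpath("//span/text()")[0], "%Y-%m-%d")
            if published < datetime.today() - timedelta(days=cutoff_days):
                i = -1  # listings run newest-first, so stop paginating here
                continue
            for url in sub.xpath("//a[contains(@target, '_blank')]/@href"):
                for old, new in extra_fixups:  # per-category relative-URL rewrites
                    url = url.replace(old, new)
                url = url.replace("./", base_url)
                try:
                    process_article(url, category)
                except Exception as error:
                    print(error)


def process_article(url, category):
    """Fetch one article page and build the same payload the loops above build."""
    page = etree.HTML(urllib.request.urlopen(url).read().decode("utf-8"))
    article = {
        'url': url,
        'category': category,
        'site': "Ministry of Finance",
        'originalSite': "财政部",
        'originalContent': encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p")),
    }
    if len(article['originalContent']) < 10:
        return
    article['content'] = ' '.join(translate(s) for s in article['originalContent'].split("。"))
    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
    article['title'] = translate(article['originalTitle'])
    article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
    upsert_content(article)


crawl_category("https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/", "Financial News",
               extra_fixups=[("../", "https://www.mof.gov.cn/zhengwuxinxi/")])
crawl_category("https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/", "Policy Interpretation")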
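
Review note on the commented-out zhengcefabu attachment logic: the substring test '.doc' in attachment_url is also true for every '.docx' URL, which makes the separate '.docx' branch unreachable (harmless here, since both branches skip, but a trap if they ever diverge). A suffix test on the parsed URL path avoids this; attachment_extension below is a hypothetical helper, not repo code.

from urllib.parse import urlparse

def attachment_extension(url):
    # Test the path's suffix rather than substring membership, so ".docx"
    # is not caught by the ".doc" test and query strings are ignored.
    path = urlparse(url).path.lower()
    for ext in ('.pdf', '.docx', '.doc'):
        if path.endswith(ext):
            return ext
    return None

# Usage matching the intent of the commented block: only hand .pdf
# attachments to extract_from_pdf and skip .doc/.docx, e.g.
# if attachment_extension(attachment_url) == '.pdf': ...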