OxbridgeEconomics committed
Commit 42ba1cc · 1 Parent(s): 046bb22
Files changed (11)
  1. cbirc.py +19 -11
  2. csrc.py +30 -32
  3. gov.py +0 -2
  4. mof.py +3 -106
  5. mofcom.py +2 -22
  6. ndrc.py +3 -59
  7. pbc.py +38 -30
  8. safe.py +3 -42
  9. stats.py +2 -21
  10. utils.py +28 -18
  11. xpath.json +68 -11
cbirc.py CHANGED
@@ -11,30 +11,35 @@ i = 1
 while i > -1:
     CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
     i = i + 1
+    print(CATEGORY_URL)
     content = fetch_url(CATEGORY_URL)
     reportinfo = json.loads(content)
     for article in reportinfo['data']['rows']:
         try:
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
+            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
             if parsed_datetime < (datetime.today() - timedelta(days=183)):
                 i = -1
             else:
-                article['originContent'] = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
-                if len(article['originContent']) < 10:
+                contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+                article['contentCN'] = repr(contentCN)
+                if len(contentCN) < 10:
                     continue
                 CONTENT_ENG = ''
-                for element in article['originContent'].split("\n"):
+                for element in article['contentCN'].split("\n"):
                     CONTENT_ENG += translate(element) + '\n'
-                article['content'] = CONTENT_ENG
+                article['content'] = repr(CONTENT_ENG)
                 article['site'] = "National Financial Regulatory Administration of China"
                 article['originSite'] = "国家金融监督管理总局"
-                article['originTitle'] = article['docSubtitle']
-                article['title'] = translate(article['originTitle'])
-                article['url'] = "https://www.cbirc.gov.cn" + article['pdfFileUrl']
-                article['category']= "Policy Interpretation"
-                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
+                article['titleCN'] = article['docSubtitle']
+                article['title'] = translate(article['docSubtitle'])
+                article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
+                article['category']= "Policy Interpretation"
                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                article['attachment'] = ''
+                article['author'] = ''
+                article['subtitle'] = translate(summary)
                 upsert_content(article)
         except Exception as error:
             print(error)
@@ -46,7 +51,10 @@ while i > -1:
     CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
     i = i + 1
     urllib3.disable_warnings()
-    req = urllib.request.urlopen(CATEGORY_URL)
+    try:
+        req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
+    except:
+        break
     content = req.read().decode("utf-8")
     reportinfo = json.loads(content)
     for article in reportinfo['searchResultAll']['searchTotal']:
csrc.py CHANGED
@@ -6,38 +6,36 @@ from datetime import datetime, timedelta
 from lxml import etree
 from utils import encode, translate, sentiment_computation, upsert_content, fetch_url, crawl
 
-# i = 1
-# while i > -1:
-#     if i == 1:
-#         CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
-#     else:
-#         CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
-#     i = i + 1
-#     req = urllib.request.urlopen(CATEGORY_URL)
-#     text = req.read()
-#     html_text = text.decode("utf-8")
-#     page = etree.HTML(html_text)
-#     articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
-#     for article in articlelist:
-#         if isinstance(article, etree._Element):
-#             subelement = etree.tostring(article).decode()
-#             subpage = etree.HTML(subelement)
-#             date = encode(subpage.xpath("//span[@class='date']"))
-#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
-#                 i = -1
-#             else:
-#                 urls = subpage.xpath("//a/@href")
-#                 for url in urls:
-#                     try:
-#                         article = {}
-#                         url = "http://www.csrc.gov.cn" + url
-#                         article['site'] = "Securities Regulatory Commission of China"
-#                         article['originSite'] = "证监会"
-#                         article['category']= "Policy Interpretation"
-#                         crawl(url, article)
-#                     except Exception as error:
-#                         print(error)
+i = 1
+while i > -1:
+    if i == 1:
+        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
+    else:
+        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = encode(subpage.xpath("//span[@class='date']"))
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = "http://www.csrc.gov.cn" + url
+                        article['category']= "Policy Interpretation"
+                        crawl(url, article)
+                    except Exception as error:
+                        print(error)
 
 i = 1
 while i > -1:
gov.py CHANGED
@@ -60,8 +60,6 @@ while i > -1:
                 url = url.replace('../', 'https://www.gov.cn/zhengce/')
                 if "https://www.gov.cn" in url:
                     article['category']= "Policy Interpretation"
-                    article['originSite'] = "国务院"
-                    article['site'] = "State Council of China"
                     crawl(url, article)
             except Exception as error:
                 print(error)
mof.py CHANGED
@@ -1,9 +1,8 @@
-import uuid
 import time
 import urllib.request
 from lxml import etree
 from datetime import datetime, timedelta
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, extract_from_pdf
+from utils import crawl
 
 i = 0
 while i > -1:
@@ -32,27 +31,8 @@ while i > -1:
                     article = {}
                     url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                     url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "Ministry of Finance of China"
-                    article['originalSite'] = "财政部"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Financial News"
-                    article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S")
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
 
@@ -82,90 +62,7 @@ while i > -1:
                 try:
                     article = {}
                     url = url.replace("./", CATEGORY_URL)
-                    req = urllib.request.urlopen(url)
-                    text = req.read()
-                    html_text = text.decode("utf-8")
-                    page = etree.HTML(html_text)
-                    article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                    if len(article['originalContent']) < 10:
-                        continue
-                    CONTENT_ENG = ''
-                    for element in article['originalContent'].split("。"):
-                        CONTENT_ENG += translate(element) + ' '
-                    article['content'] = CONTENT_ENG
-                    article['site'] = "Ministry of Finance of China"
-                    article['originalSite'] = "财政部"
-                    article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                    article['title'] = translate(article['originalTitle'])
-                    article['url'] = url
                     article['category']= "Policy Interpretation"
-                    article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
-
-# i = 0
-# while i > -1:
-#     if i == 0:
-#         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/"
-#     else:
-#         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcefabu/index_{i}.htm"
-#     i = i + 1
-#     req = urllib.request.urlopen(CATEGORY_URL)
-#     text = req.read()
-#     html_text = text.decode("utf-8")
-#     page = etree.HTML(html_text)
-#     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-#     for article in articlelist:
-#         if isinstance(article, etree._Element):
-#             subelement = etree.tostring(article).decode()
-#             subpage = etree.HTML(subelement)
-#             date = subpage.xpath("//span/text()")[0]
-#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
-#                 i = -1
-#             else:
-#                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-#                 for url in urls:
-#                     try:
-#                         article = {}
-#                         url = url.replace("./", CATEGORY_URL)
-#                         print(url)
-#                         req = urllib.request.urlopen(url)
-#                         text = req.read()
-#                         html_text = text.decode("utf-8")
-#                         page = etree.HTML(html_text)
-#                         attachments = page.xpath("//span[contains(@id, 'appendix1')]/a/@href")
-#                         print(attachments)
-#                         if len(attachments) > 0:
-#                             for attachment_url in attachments:
-#                                 if '.pdf' in attachment_url:
-#                                     attachment_url = attachment_url.replace("./", "https://zyhj.mof.gov.cn/zcfb/202403/")
-#                                     article['originalContent'] = extract_from_pdf(attachment_url)
-#                                 if '.doc' in attachment_url:
-#                                     continue
-#                                 if '.docx' in attachment_url:
-#                                     continue
-#                         else:
-#                             article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]/p[@align='justify']"))
-#                         print(article['originalContent'])
-#                         if len(article['originalContent']) < 10:
-#                             continue
-#                         CONTENT_ENG = ''
-#                         for element in article['originalContent'].split("。"):
-#                             CONTENT_ENG += translate(element) + ' '
-#                         article['content'] = CONTENT_ENG
-#                         article['site'] = "Ministry of Finance"
-#                         article['originalSite'] = "财政部"
-#                         article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-#                         article['title'] = translate(article['originalTitle'])
-#                         article['url'] = url
-#                         article['category']= "Policy Release"
-#                         article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
-#                         article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-#                         article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-#                         # upsert_content(article)
-#                     except Exception as error:
-#                         print(error)
mofcom.py CHANGED
@@ -1,9 +1,8 @@
-import uuid
 import time
 import urllib.request
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import encode, translate, sentiment_computation, upsert_content
+from utils import crawl
 
 categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
 for category in categories:
@@ -37,25 +36,6 @@ for category in categories:
                     article['category']= "Policy Interpretation"
                 else:
                     article['category']= "Policy Release"
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'art-con art-con-bottonmLine')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "Ministry of Commerce of China"
-                article['originalSite'] = "商务部"
-                article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
-                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(page.xpath("//meta[@name = 'PubDate']/@content")[0],"%Y-%m-%d %H:%M:%S"))
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
             except Exception as error:
                 print(error)
ndrc.py CHANGED
@@ -3,7 +3,7 @@ import uuid
 import time
 import urllib.request
 from lxml import etree
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
+from utils import crawl
 
 i = 0
 while i > -1:
@@ -30,71 +30,15 @@ while i > -1:
             for url in urls:
                 try:
                     article = {}
-                    print(url)
-                    if "https://www.gov.cn" in url:
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@id, 'UCAP-CONTENT')]//p"))
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "State Council of China"
-                        article['originalSite'] = "国务院"
-                        article['originalTitle'] = page.xpath("//title/text()")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
+                    if "www.gov.cn" in url:
                         article['category']= "Policy Release"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'firstpublishedtime']/@content")[0], "%Y-%m-%d-%H:%M:%S")
                     elif "../../zcfb/" in url:
                         url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
-                        print(url)
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor') or contains(@class, 'article_l')]"))
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "National Development and Reform Commission of China"
-                        article['originalSite'] = "国家发展和改革委员会"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
                         article['category']= "Policy Release"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
                     else:
                         url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                         url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                        print(url)
-                        req = urllib.request.urlopen(url)
-                        text = req.read()
-                        html_text = text.decode("utf-8")
-                        page = etree.HTML(html_text)
-                        article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                        if len(article['originalContent']) < 10:
-                            continue
-                        CONTENT_ENG = ''
-                        for element in article['originalContent'].split("。"):
-                            CONTENT_ENG += translate(element) + ' '
-                        article['content'] = CONTENT_ENG
-                        article['site'] = "National Development and Reform Commission of China"
-                        article['originalSite'] = "国家发展和改革委员会"
-                        article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                        article['title'] = translate(article['originalTitle'])
-                        article['url'] = url
                         article['category']= "Policy Interpretation"
-                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d %H:%M:%S")
-                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                    upsert_content(article)
+                    crawl(url, article)
                 except Exception as error:
                     print(error)
pbc.py CHANGED
@@ -13,37 +13,45 @@ while i > -1:
         j = i + 1
         CATEGORY_URL = f"http://www.pbc.gov.cn/rmyh/3963412/3963426/index_{j}.html"
     i = i + 1
-    response = requests.get(CATEGORY_URL, timeout=20)
+    response = requests.get(CATEGORY_URL, timeout=30)
     page = etree.HTML(response.text)
-    urls = page.xpath("//td[contains(@height,'22')]//a[contains(@target, '_blank')]/@href")
-    urls = [item for item in urls if item.startswith("/rmyh/")]
-    for url in urls:
-        try:
-            url = "http://www.pbc.gov.cn" + url
-            article = {}
-            response = requests.get(url, timeout=20)
-            response.encoding = 'utf-8'
-            page = etree.HTML(response.text)
-            article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
-            if len(article['originalContent']) < 10:
+    articlelist = page.xpath("//td[contains(@height, '22')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")
+            try:
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            except:
                 continue
-            CONTENT_ENG = ''
-            for element in article['originalContent'].split("。"):
-                CONTENT_ENG += translate(element) + ' '
-            article['content'] = CONTENT_ENG
-            article['site'] = "The People's Bank of China"
-            article['originalSite'] = "中国人民银行"
-            article['originalTitle'] = page.xpath("//title/text()")[0]
-            article['title'] = translate(article['originalTitle'])
-            article['url'] = url
-            article['category']= "Policy Interpretation"
-            article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=183)):
+            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                 i = -1
             else:
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
-        except Exception as error:
-            print(error)
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = "http://www.pbc.gov.cn" + url
+                        response = requests.get(url, timeout=20)
+                        response.encoding = 'utf-8'
+                        page = etree.HTML(response.text)
+                        article['originalContent'] = encode(page.xpath("//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p"))
+                        if len(article['originalContent']) < 10:
+                            continue
+                        CONTENT_ENG = ''
+                        for element in article['originalContent'].split("。"):
+                            CONTENT_ENG += translate(element) + ' '
+                        article['content'] = CONTENT_ENG
+                        article['site'] = "The People's Bank of China"
+                        article['originalSite'] = "中国人民银行"
+                        article['originalTitle'] = page.xpath("//title/text()")[0]
+                        article['title'] = translate(article['originalTitle'])
+                        article['url'] = url
+                        article['category']= "Policy Interpretation"
+                        article['publishDate'] = datemodifier(page.xpath("//meta[@name = '页面生成时间']/@content")[0], "%Y-%m-%d %H:%M:%S")
+                        article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
+                        article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                        upsert_content(article)
+                    except Exception as error:
+                        print(error)
safe.py CHANGED
@@ -1,9 +1,8 @@
-import uuid
 import time
 import urllib.request
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import encode, translate, datemodifier, sentiment_computation, upsert_content
+from utils import crawl
 
 i = 1
 while i > -1:
@@ -31,27 +30,8 @@ while i > -1:
             try:
                 article = {}
                 url = "https://www.safe.gov.cn" + url
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'detail_content')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "State Administration of Foregin Exchange of China"
-                article['originalSite'] = "外汇管理局"
-                article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
                 article['category']= "Policy Interpretation"
-                article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d")
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
             except Exception as error:
                 print(error)
 
@@ -81,26 +61,7 @@ while i > -1:
             try:
                 article = {}
                 url = "https://www.safe.gov.cn" + url
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'detail_content')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "State Administration of Foregin Exchange of China"
-                article['originalSite'] = "外汇管理局"
-                article['originalTitle'] = page.xpath("//meta[@name = 'ArticleTitle']/@content")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
                 article['category']= "Data Interpretation"
-                article['publishDate'] = datemodifier(page.xpath("//meta[@name = 'PubDate']/@content")[0], "%Y-%m-%d")
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
             except Exception as error:
                 print(error)
stats.py CHANGED
@@ -3,7 +3,7 @@ import time
 import urllib.request
 from datetime import datetime, timedelta
 from lxml import etree
-from utils import encode, translate, sentiment_computation, upsert_content
+from utils import encode, crawl
 
 i = 0
 while i > -1:
@@ -31,26 +31,7 @@ while i > -1:
             try:
                 article = {}
                 url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
-                req = urllib.request.urlopen(url)
-                text = req.read()
-                html_text = text.decode("utf-8")
-                page = etree.HTML(html_text)
-                article['originalContent'] = encode(page.xpath("//div[contains(@class, 'TRS_Editor')]//p"))
-                if len(article['originalContent']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['originalContent'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = CONTENT_ENG
-                article['site'] = "National Bureau of Statistics of China"
-                article['originalSite'] = "国家统计局"
-                article['originalTitle'] = page.xpath("//title/text()")[0]
-                article['title'] = translate(article['originalTitle'])
-                article['url'] = url
                 article['category']= "Data Interpretation"
-                article['publishDate'] = date
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                upsert_content(article)
+                crawl(url, article)
             except Exception as error:
                 print(error)
utils.py CHANGED
@@ -119,16 +119,18 @@ def extract_from_pdf(url):
         pdf_reader = PdfReader(f)
         num_pages = len(pdf_reader.pages)
         extracted_text = ""
-        extracted_text_eng = ""
        for page in range(num_pages):
             text = pdf_reader.pages[page].extract_text()
             if text and text[0].isdigit():
                 text = text[1:]
             first_newline_index = text.find('\n')
-            text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:].replace('\n', '')
-            extracted_text_eng += translator.translate(text, dest='en').text
+            text = text[:first_newline_index+1].replace('\n', ' ') + text[first_newline_index+1:]
             extracted_text += text
-    return extracted_text, extracted_text_eng
+    try:
+        summary = '\n'.join(extracted_text.split('\n')[:2])
+    except:
+        summary = text
+    return extracted_text, summary
 
 def get_db_connection():
     """Get dynamoDB connection"""
@@ -164,27 +166,35 @@ def sentiment_computation(content):
     return sentiment_score, label_dict[sentiment_label]
 
 def crawl(url, article):
-    domain = urlparse(url).netloc
+    domain = '.'.join(urlparse(url).netloc.split('.')[1:])
     req = urllib.request.urlopen(url)
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
-    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
-    article['originTitle'] = encode(page.xpath(xpath_dict[domain]['title']))
-    article['title'] = translate(article['originTitle'])
-    article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
-    article['originContent'] = repr(originContent)
-    if len(article['originContent']) < 10:
+    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
+    article['originSite'] = xpath_dict[domain]['siteCN']
+    article['site'] = xpath_dict[domain]['site']
+    article['titleCN'] = encode(page.xpath(xpath_dict[domain]['title']))
+    article['title'] = translate(article['titleCN'])
+    if 'author' in xpath_dict[domain]:
+        article['author'] = translate(encode(page.xpath(xpath_dict[domain]['author'])))
+    else:
+        article['author'] = ""
+    article['contentCN'] = repr(contentCN)
+    if len(article['contentCN']) < 10:
         return None
     CONTENT_ENG = ''
-    for element in originContent.split("\n"):
+    for element in contentCN.split("\n"):
         CONTENT_ENG += translate(element) + '\n'
     article['content'] = repr(CONTENT_ENG)
-    article['subtitle'] = translate(summary)
-    article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime']['format_string'])
+    if 'subtitle' in xpath_dict[domain]:
+        article['subtitle'] = translate(encode(page.xpath(xpath_dict[domain]['subtitle'])))
+    else:
+        article['subtitle'] = translate(summary)
+    article['publishDate'] = datemodifier(encode(page.xpath(xpath_dict[domain]['publishdate'])), xpath_dict[domain]['datetime_format'])
     article['link'] = url
     article['attachment'] = ""
-    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n",""))
+    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(CONTENT_ENG.replace("\n",""))
     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title']+article['publishDate'])
     upsert_content(article)
 
@@ -197,9 +207,9 @@ def upsert_content(report):
         'id': str(report['id']),
         'site': report['site'],
         'title': report['title'],
-        # 'originSite': report['originSite'],
-        'originTitle': report['originTitle'],
-        'originContent': report['originContent'],
+        'titleCN': report['titleCN'],
+        'site': report['site'],
+        'contentCN': report['contentCN'],
         'category': report['category'],
         'author': report['author'],
         'content': report['content'],
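Note: the reworked crawl() resolves per-domain selectors from a module-level xpath_dict, but the wiring that builds that dictionary is not part of this diff. A minimal sketch of the assumed setup (file name and placement are assumptions), using the same domain-key scheme crawl() applies:

import json
from urllib.parse import urlparse

# Assumed, not shown in this commit: xpath.json is parsed once at import time.
with open("xpath.json", encoding="utf-8") as f:
    xpath_dict = json.load(f)

def lookup_config(url):
    # Drop the leading host label so "www.stats.gov.cn" matches the "stats.gov.cn" key,
    # mirroring crawl()'s '.'.join(urlparse(url).netloc.split('.')[1:]).
    domain = '.'.join(urlparse(url).netloc.split('.')[1:])
    return xpath_dict[domain]

Under that assumption, lookup_config("https://www.stats.gov.cn/sj/sjjd/") would return the "stats.gov.cn" entry below, with its content XPath and datetime_format.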
xpath.json CHANGED
@@ -2,28 +2,85 @@
     "data.eastmoney.com": {
         "attachment": "//a[contains(@class, 'pdf-link')]/@href",
         "content": "//div[contains(@class, 'ctx-content')]//p",
-        "datetime": {
-            "format_string": "%Y-%m-%d %H:%M:%S.%f"
-        }
+        "datetime": "%Y-%m-%d %H:%M:%S.%f"
     },
-    "www.gov.cn": {
+    "gov.cn": {
         "title": "//title/text()",
         "subtitle": "//meta[@name = 'description']/@content",
         "author": "//meta[@name = 'author']/@content",
         "publishdate": "//meta[@name = 'firstpublishedtime']/@content",
         "content": "//div[contains(@id, 'UCAP-CONTENT')]//p",
-        "datetime": {
-            "format_string": "%Y-%m-%d-%H:%M:%S"
-        }
+        "datetime_format": "%Y-%m-%d-%H:%M:%S",
+        "siteCN": "中国国务院",
+        "site": "State Council of China"
     },
-    "www.csrc.gov.cn": {
+    "csrc.gov.cn": {
         "title": "//meta[@name = 'ArticleTitle']/@content",
         "subtitle": "//meta[@name = 'description']/@content",
         "author": "//meta[@name = 'author']/@content",
         "publishdate": "//meta[@name = 'PubDate']/@content",
         "content": "//div[contains(@class, 'detail-news')]//p",
-        "datetime": {
-            "format_string": "%Y-%m-%d %H:%M:%S"
-        }
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国证监会",
+        "site": "Securities Regulatory Commission of China"
+    },
+    "mof.gov.cn": {
+        "title": "//meta[@name = 'ArticleTitle']/@content",
+        "publishdate": "//meta[@name = 'PubDate']/@content",
+        "content": "//div[contains(@class, 'TRS_Editor')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国财政部",
+        "site": "Ministry of Finance of China"
+    },
+    "mofcom.gov.cn": {
+        "title": "//meta[@name = 'ArticleTitle']/@content",
+        "subtitle": "//meta[@name = 'Description']/@content",
+        "publishdate": "//meta[@name = 'PubDate']/@content",
+        "content": "//div[contains(@class, 'art-con art-con-bottonmLine')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国商务部",
+        "site": "Ministry of Commerce of China"
+    },
+    "ndrc.gov.cn": {
+        "title": "//meta[@name = 'ArticleTitle']/@content",
+        "publishdate": "//meta[@name = 'PubDate']/@content",
+        "content": "//div[contains(@class, 'TRS_Editor') or contains(@class, 'article_l')]",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国国家发展和改革委员会",
+        "site": "National Development and Reform Commission of China"
+    },
+    "pbc.gov.cn": {
+        "title": "//title/text()",
+        "subtitle": "//meta[@name = 'description']/@content",
+        "publishdate": "//meta[@name = '页面生成时间']/@content",
+        "content": "//div[@class='mainw950']//td[@class='content']/font[@class='zoom1']//p",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国人民银行",
+        "site": "The People's Bank of China"
+    },
+    "safe.gov.cn": {
+        "title": "//meta[@name = 'ArticleTitle']/@content",
+        "subtitle": "//meta[@name = 'Description']/@content",
+        "publishdate": "//meta[@name = 'PubDate']/@content",
+        "content": "//div[contains(@class, 'detail_content')]//p",
+        "datetime_format": "%Y-%m-%d",
+        "siteCN": "中国外汇管理局",
+        "site": "State Administration of Foreign Exchange of China"
+    },
+    "stats.gov.cn": {
+        "title": "//title/text()",
+        "publishdate": "//div[contains(@class, 'detail-title-des')]//p[1]",
+        "content": "//div[contains(@class, 'TRS_Editor')]//p",
+        "datetime_format": "%Y/%m/%d %H:%M",
+        "siteCN": "中国国家统计局",
+        "site": "National Bureau of Statistics of China"
+    },
+    "chinatax.gov.cn": {
+        "title": "//title/text()",
+        "publishdate": "//div[contains(@class, 'detail-title-des')]//p[1]",
+        "content": "//div[contains(@class, 'article')]//p",
+        "datetime_format": "%Y-%m-%d %H:%M:%S",
+        "siteCN": "中国国家税务总局",
+        "site": "State Taxation Administration of China"
     }
 }
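datemodifier() is called throughout with the datetime_format strings above but its definition is not part of this diff. A plausible minimal sketch, assuming it only normalizes the raw publish-date metadata to the YYYY-MM-DD form the site scripts compare against:

from datetime import datetime

def datemodifier(date_string, format_string):
    # Assumed behaviour: parse the raw value with the per-site format from xpath.json
    # and return it normalized as "%Y-%m-%d".
    return datetime.strptime(date_string, format_string).strftime("%Y-%m-%d")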