Muhammad Abdur Rahman Saad committed
Commit 5068e1f · 1 Parent(s): 9a22ff3

Update daily.py

Files changed (1)
  1. daily.py +411 -418
daily.py CHANGED
@@ -64,468 +64,461 @@ def crawl_eastmoney(url, article):
      extract_reference(article)
      update_content(article)

- def daily():
-     with open('xpath.json', 'r', encoding='UTF-8') as f:
-         xpath_dict = json.load(f)

-     DELTA = int(os.environ.get('DELTA') or '1')
-     print(f"DELTA = {DELTA}")

-     print("cbirc.gov.cn")
-     i = 1
-     while i > -1:
-         CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
-         i = i + 1
-         content = fetch_url(CATEGORY_URL)
-         reportinfo = json.loads(content)
-         for article in reportinfo['data']['rows']:
-             try:
-                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"))
-                 parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
-                     article['contentCN'] = repr(contentCN)[1:-1].strip()
-                     if len(contentCN) < 10:
-                         continue
-                     CONTENT_ENG = ''
-                     for element in article['contentCN'].split("\n"):
-                         CONTENT_ENG += translate(element) + '\n'
-                     article['content'] = repr(CONTENT_ENG)[1:-1].strip()
-                     article['site'] = "National Financial Regulatory Administration of China"
-                     article['originSite'] = "国家金融监督管理总局"
-                     article['titleCN'] = article['docSubtitle']
-                     article['title'] = translate(article['docSubtitle'])
-                     article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
-                     article['category'] = "Policy Interpretation"
-                     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
-                     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                     article['attachment'] = ''
-                     article['author'] = ''
-                     article['subtitle'] = translate(summary)
-                     update_content(article)
-             except Exception as error:
-                 print(error)
-
-     print("csrc.gov.cn")
-     i = 1
-     while i > -1:
          try:
-             if i == 1:
-                 CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
              else:
-                 CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
-             i = i + 1
-             req = urllib.request.urlopen(CATEGORY_URL)
-             text = req.read()
-             html_text = text.decode("utf-8")
-             page = etree.HTML(html_text)
-             articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
-             for article in articlelist:
-                 if isinstance(article, etree._Element):
-                     subelement = etree.tostring(article).decode()
-                     subpage = etree.HTML(subelement)
-                     date = encode(subpage.xpath("//span[@class='date']"))
-                     parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                     if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                         i = -1
-                     else:
-                         urls = subpage.xpath("//a/@href")
-                         for url in urls:
-                             try:
-                                 article = {}
-                                 url = "http://www.csrc.gov.cn" + url
-                                 article['category'] = "Policy Interpretation"
-                                 crawl(url, article)
-                             except Exception as error:
-                                 print(error)
-         except Exception as error:
-             i = -1
-             print(error)
-
-     i = 1
-     while i > -1:
-         CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
-         i = i + 1
-         try:
-             content = fetch_url(CATEGORY_URL)
-             reportinfo = json.loads(content)
-             for article in reportinfo['data']['results']:
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     article['category'] = "Financial News"
-                     article['site'] = "Securities Regulatory Commission of China"
-                     article['originSite'] = "证监会"
-                     article['titleCN'] = article['title']
-                     article['title'] = translate(article['titleCN'])
-                     article['author'] = ''
-                     article['contentCN'] = repr(article['content'])[1:-1].strip()
-                     if len(article['contentCN']) < 10:
-                         continue
-                     CONTENT_ENG = ''
-                     for element in article['contentCN'].split("。"):
-                         CONTENT_ENG += translate(element) + ' '
-                     article['content'] = repr(CONTENT_ENG)[1:-1].strip()
-                     article['subtitle'] = article['memo']
-                     article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
-                     article['link'] = article['url']
-                     article['attachment'] = ""
-                     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
-                     update_content(article)
          except Exception as error:
              print(error)

-     print("data.eastmoney.com")
-
-     today = datetime.today().strftime('%Y-%m-%d')
-     beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
-     i = 0
-     while i > -1:
-         URL = "https://reportapi.eastmoney.com/report/jg"
-         params = {
-             "cb": "datatable8544623",
-             "pageSize": "100",
-             "beginTime": beginDate,
-             "endTime": today,
-             "pageNo": i,
-             "qType": "3",
-         }
-         URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
-         content = fetch_url(URL)
-         if content:
-             start_index = content.find("(")
-             if start_index != -1:
-                 result = content[start_index + 1: -1]
-             else:
-                 result = content
-             reportinfo = json.loads(result)
-             if reportinfo["size"] > 0:
-                 i = i + 1
-                 for article in reportinfo['data']:
-                     try:
-                         url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
-                         crawl_eastmoney(url, article)
-                     except Exception as error:
-                         print(error)
-             else:
-                 i = -1
-         else:
-             print("Failed to fetch URL:", URL)
-
-     print("gov.cn")
-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
          else:
-             CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
          i = i + 1
          req = urllib.request.urlopen(CATEGORY_URL)
          text = req.read()
          html_text = text.decode("utf-8")
          page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
          for article in articlelist:
              if isinstance(article, etree._Element):
                  subelement = etree.tostring(article).decode()
                  subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
                  parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
                  if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                      i = -1
                  else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
                      for url in urls:
                          try:
                              article = {}
-                             url = url.replace('../', 'https://www.gov.cn/zhengce/')
-                             if "https://www.gov.cn" in url:
-                                 article['category'] = "Policy Interpretation"
-                                 crawl(url, article)
                          except Exception as error:
                              print(error)

-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
-         else:
-             CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = url.replace('../', 'https://www.gov.cn/zhengce/')
-                             if "https://www.gov.cn" in url:
-                                 article['site'] = "State Council of China"
-                                 crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     print("mof.gov.cn")
-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
-         else:
-             CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
-                             url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                             article['category'] = "Financial News"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
          else:
-             CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = url.replace("./", CATEGORY_URL)
                              article['category'] = "Policy Interpretation"
-                             print(url)
                              crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     print("mofcom.gov.cn")
-     categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
-     for category in categories:
-         i = 1
-         while i > -1:
-             if i == 1:
-                 URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
-             else:
-                 URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
-             i = i + 1
-             try:
-                 req = urllib.request.urlopen(URL)
-                 text = req.read()
-                 html_text = text.decode("utf-8")
-                 page = etree.HTML(html_text)
-                 articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
-                 for article in articlelist:
-                     if isinstance(article, etree._Element):
-                         subelement = etree.tostring(article).decode()
-                         subpage = etree.HTML(subelement)
-                         date = subpage.xpath("//span/text()")[0]
-                         parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-                         if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                             i = -1
-                         else:
-                             urls = subpage.xpath("//a/@href")
-                             for url in urls:
-                                 try:
-                                     article = {}
-                                     if '/article/zcjd' in url:
-                                         url = "http://www.mofcom.gov.cn" + url
-                                         article['category'] = "Policy Interpretation"
-                                     else:
-                                         article['category'] = "Policy Release"
-                                     crawl(url, article)
-                                 except Exception as error:
-                                     print(error)
-             except Exception as error:
                  i = -1
-                 print(error)

-     print("ndrc.gov.cn")
-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
-         else:
-             CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//span/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y/%m/%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             if "www.gov.cn" in url:
-                                 article['category'] = "Policy Release"
-                             elif "../../zcfb/" in url:
-                                 url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
-                                 article['category'] = "Policy Release"
-                             else:
-                                 url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                                 url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                                 article['category'] = "Policy Interpretation"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     print("safe.gov.cn")
      i = 1
      while i > -1:
          if i == 1:
-             CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
          else:
-             CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
          i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//dd/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = "https://www.safe.gov.cn" + url
-                             article['category'] = "Policy Interpretation"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     i = 1
-     while i > -1:
-         if i == 1:
-             CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
-         else:
-             CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = subpage.xpath("//dd/text()")[0]
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = "https://www.safe.gov.cn" + url
-                             article['category'] = "Data Interpretation"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

-     print("stats.gov.cn")
-     i = 0
-     while i > -1:
-         if i == 0:
-             CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
-         else:
-             CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
-         i = i + 1
-         req = urllib.request.urlopen(CATEGORY_URL)
-         text = req.read()
-         html_text = text.decode("utf-8")
-         page = etree.HTML(html_text)
-         articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
-         for article in articlelist:
-             if isinstance(article, etree._Element):
-                 subelement = etree.tostring(article).decode()
-                 subpage = etree.HTML(subelement)
-                 date = encode(subpage.xpath("//span"))
-                 parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
-                 if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                     i = -1
-                 else:
-                     urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
-                     for url in urls:
-                         try:
-                             article = {}
-                             url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
-                             article['category'] = "Data Interpretation"
-                             crawl(url, article)
-                         except Exception as error:
-                             print(error)

- def data_collection():
-     daily()
-     glue_job_run()

- if __name__ == '__main__':
-     data_collection()
      extract_reference(article)
      update_content(article)

+ with open('xpath.json', 'r', encoding='UTF-8') as f:
+     xpath_dict = json.load(f)

+ DELTA = int(os.environ.get('DELTA') or '1')
+ print(f"DELTA = {DELTA}")

+ print("cbirc.gov.cn")
+ i = 1
+ while i > -1:
+     CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
+     i = i + 1
+     content = fetch_url(CATEGORY_URL)
+     reportinfo = json.loads(content)
+     for article in reportinfo['data']['rows']:
          try:
+             article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"))
+             parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
              else:
+                 contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+                 article['contentCN'] = repr(contentCN)[1:-1].strip()
+                 if len(contentCN) < 10:
+                     continue
+                 CONTENT_ENG = ''
+                 for element in article['contentCN'].split("\n"):
+                     CONTENT_ENG += translate(element) + '\n'
+                 article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                 article['site'] = "National Financial Regulatory Administration of China"
+                 article['originSite'] = "国家金融监督管理总局"
+                 article['titleCN'] = article['docSubtitle']
+                 article['title'] = translate(article['docSubtitle'])
+                 article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
+                 article['category'] = "Policy Interpretation"
+                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
+                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                 article['attachment'] = ''
+                 article['author'] = ''
+                 article['subtitle'] = translate(summary)
+                 update_content(article)
          except Exception as error:
              print(error)

+ print("csrc.gov.cn")
+ i = 1
+ while i > -1:
+     try:
+         if i == 1:
+             CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
          else:
+             CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
          i = i + 1
          req = urllib.request.urlopen(CATEGORY_URL)
          text = req.read()
          html_text = text.decode("utf-8")
          page = etree.HTML(html_text)
+         articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
          for article in articlelist:
              if isinstance(article, etree._Element):
                  subelement = etree.tostring(article).decode()
                  subpage = etree.HTML(subelement)
+                 date = encode(subpage.xpath("//span[@class='date']"))
                  parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
                  if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                      i = -1
                  else:
+                     urls = subpage.xpath("//a/@href")
                      for url in urls:
                          try:
                              article = {}
+                             url = "http://www.csrc.gov.cn" + url
+                             article['category'] = "Policy Interpretation"
+                             crawl(url, article)
                          except Exception as error:
                              print(error)
+     except Exception as error:
+         i = -1
+         print(error)

+ i = 1
+ while i > -1:
+     CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
+     i = i + 1
+     try:
+         content = fetch_url(CATEGORY_URL)
+         reportinfo = json.loads(content)
+         for article in reportinfo['data']['results']:
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 article['category'] = "Financial News"
+                 article['site'] = "Securities Regulatory Commission of China"
+                 article['originSite'] = "证监会"
+                 article['titleCN'] = article['title']
+                 article['title'] = translate(article['titleCN'])
+                 article['author'] = ''
+                 article['contentCN'] = repr(article['content'])[1:-1].strip()
+                 if len(article['contentCN']) < 10:
+                     continue
+                 CONTENT_ENG = ''
+                 for element in article['contentCN'].split("。"):
+                     CONTENT_ENG += translate(element) + ' '
+                 article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                 article['subtitle'] = article['memo']
+                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'], "%Y-%m-%d %H:%M:%S"))
+                 article['link'] = article['url']
+                 article['attachment'] = ""
+                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
+                 update_content(article)
+     except Exception as error:
+         print(error)

+ print("data.eastmoney.com")

+ today = datetime.today().strftime('%Y-%m-%d')
+ beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
+ i = 0
+ while i > -1:
+     URL = "https://reportapi.eastmoney.com/report/jg"
+     params = {
+         "cb": "datatable8544623",
+         "pageSize": "100",
+         "beginTime": beginDate,
+         "endTime": today,
+         "pageNo": i,
+         "qType": "3",
+     }
+     URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
+     content = fetch_url(URL)
+     if content:
+         start_index = content.find("(")
+         if start_index != -1:
+             result = content[start_index + 1: -1]
          else:
+             result = content
+         reportinfo = json.loads(result)
+         if reportinfo["size"] > 0:
+             i = i + 1
+             for article in reportinfo['data']:
+                 try:
+                     url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
+                     crawl_eastmoney(url, article)
+                 except Exception as error:
+                     print(error)
+         else:
+             i = -1
+     else:
+         print("Failed to fetch URL:", URL)
+
+ print("gov.cn")
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
+     else:
+         CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                         if "https://www.gov.cn" in url:
                              article['category'] = "Policy Interpretation"
                              crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
+     else:
+         CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                  i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                         if "https://www.gov.cn" in url:
+                             article['site'] = "State Council of China"
+                             crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ print("mof.gov.cn")
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+     else:
+         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+                         url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+                         article['category'] = "Financial News"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+     else:
+         CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace("./", CATEGORY_URL)
+                         article['category'] = "Policy Interpretation"
+                         print(url)
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ print("mofcom.gov.cn")
+ categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
+ for category in categories:
      i = 1
      while i > -1:
          if i == 1:
+             URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
          else:
+             URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
          i = i + 1
+         try:
+             req = urllib.request.urlopen(URL)
+             text = req.read()
+             html_text = text.decode("utf-8")
+             page = etree.HTML(html_text)
+             articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
+             for article in articlelist:
+                 if isinstance(article, etree._Element):
+                     subelement = etree.tostring(article).decode()
+                     subpage = etree.HTML(subelement)
+                     date = subpage.xpath("//span/text()")[0]
+                     parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+                     if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                         i = -1
+                     else:
+                         urls = subpage.xpath("//a/@href")
+                         for url in urls:
+                             try:
+                                 article = {}
+                                 if '/article/zcjd' in url:
+                                     url = "http://www.mofcom.gov.cn" + url
+                                     article['category'] = "Policy Interpretation"
+                                 else:
+                                     article['category'] = "Policy Release"
+                                 crawl(url, article)
+                             except Exception as error:
+                                 print(error)
+         except Exception as error:
+             i = -1
+             print(error)

+ print("ndrc.gov.cn")
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
+     else:
+         CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//span/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y/%m/%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         if "www.gov.cn" in url:
+                             article['category'] = "Policy Release"
+                         elif "../../zcfb/" in url:
+                             url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
+                             article['category'] = "Policy Release"
+                         else:
+                             url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                             url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                             article['category'] = "Policy Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)
+
+ print("safe.gov.cn")
+ i = 1
+ while i > -1:
+     if i == 1:
+         CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
+     else:
+         CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//dd/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = "https://www.safe.gov.cn" + url
+                         article['category'] = "Policy Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ i = 1
+ while i > -1:
+     if i == 1:
+         CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
+     else:
+         CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = subpage.xpath("//dd/text()")[0]
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = "https://www.safe.gov.cn" + url
+                         article['category'] = "Data Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ print("stats.gov.cn")
+ i = 0
+ while i > -1:
+     if i == 0:
+         CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
+     else:
+         CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
+     i = i + 1
+     req = urllib.request.urlopen(CATEGORY_URL)
+     text = req.read()
+     html_text = text.decode("utf-8")
+     page = etree.HTML(html_text)
+     articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+     for article in articlelist:
+         if isinstance(article, etree._Element):
+             subelement = etree.tostring(article).decode()
+             subpage = etree.HTML(subelement)
+             date = encode(subpage.xpath("//span"))
+             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date, "%Y-%m-%d")), "%Y-%m-%d")
+             if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                 i = -1
+             else:
+                 urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
+                 for url in urls:
+                     try:
+                         article = {}
+                         url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
+                         article['category'] = "Data Interpretation"
+                         crawl(url, article)
+                     except Exception as error:
+                         print(error)

+ glue_job_run()
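
Every crawler in the new module-level script follows the same shape: walk a site's listing pages newest-first, stop as soon as an item falls outside the DELTA-day window, and route each fresh link through crawl()/update_content() with per-article error handling. A minimal standalone sketch of that pattern, under the assumption that build_page_url, fetch_listing, and handle_article are illustrative stand-ins for the per-site logic (they are not functions from daily.py):

from datetime import datetime, timedelta

def crawl_until_cutoff(build_page_url, fetch_listing, handle_article, delta_days=1):
    # Walk listing pages newest-first; stop once a page yields a stale item.
    cutoff = datetime.today() - timedelta(days=delta_days)
    i = 1
    while i > -1:
        page_url = build_page_url(i)  # page 1 often has a different URL shape
        i = i + 1
        for published, article_url in fetch_listing(page_url):
            if published < cutoff:
                i = -1  # the same sentinel daily.py uses to end the outer loop
            else:
                try:
                    handle_article(article_url)
                except Exception as error:
                    print(error)  # log and keep going, as daily.py does

Like daily.py, this finishes the current page even after the cutoff triggers; the i = -1 sentinel only prevents the outer while from fetching another page.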
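The eastmoney request sends a cb=datatable8544623 parameter, so the endpoint answers with JSONP (datatable8544623({...})) rather than bare JSON; the find("(") slicing in the loop above exists only to peel off that wrapper before json.loads. A slightly more defensive sketch of the same unwrapping, with a hypothetical helper name that does not exist in daily.py:

import json

def strip_jsonp(payload):
    # Peel 'callback({...})' down to '{...}'; fall back to the raw payload
    # when no wrapper is present.
    start = payload.find("(")
    end = payload.rfind(")")
    if start != -1 and end > start:
        payload = payload[start + 1:end]
    return json.loads(payload)

# strip_jsonp('datatable8544623({"size": 1, "data": []})') -> {'size': 1, 'data': []}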
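Both halves of the diff store text through repr(...)[1:-1].strip(). repr() escapes control characters (a real newline becomes the two characters backslash and n) and wraps the result in quotes; the [1:-1] slice then drops those quotes, leaving a single-line, escape-encoded string. A quick illustration of that behavior (note the slice assumes the text contains no quote characters that would change repr's delimiters):

text = "第一行\n第二行"       # two lines of Chinese text
escaped = repr(text)[1:-1]    # newline is now a literal backslash-n
print(escaped)                # prints on one line: 第一行\n第二行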