Muhammad Abdur Rahman Saad committed
Commit cc37e8c · 1 Parent(s): b69e69a

add prefect flow
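
This commit moves the previously module-level crawling script into a single Prefect flow: the imports gain `from prefect import flow, task`, the per-site crawl loops are re-homed inside a `main()` function decorated with `@flow(...)`, and a `__main__` guard is added so the script can still be run directly. Condensed to its skeleton (function bodies elided; the names and decorator arguments are taken from the diff below), the pattern is:

    from prefect import flow, task

    @flow(name = "Data Collection China - Daily", log_prints = True)
    def main():
        # DELTA controls the crawl window in days; unchanged from the old script.
        DELTA = int(os.environ.get('DELTA') or '1')
        # ... per-site crawl loops follow (cbirc, csrc, eastmoney, gov.cn,
        # mof, mofcom, ndrc, safe, stats), exactly as in the diff below.

    if __name__ == '__main__':
        main()

Note that `task` is imported but not yet applied to any function in this commit; only the `@flow` decorator is used.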

Files changed (2):
  1. daily.py +426 -418
  2. requirements.txt +0 -0
daily.py CHANGED
@@ -10,6 +10,7 @@ import urllib.request
 import uuid
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
+from prefect import flow, task
 
 from lxml import etree
 
@@ -17,121 +18,7 @@ from utils import (crawl, datemodifier, encode, encode_content,
                    extract_from_pdf, extract_reference, fetch_url,
                    sentiment_computation, translate, update_content)
 
-with open('xpath.json', 'r', encoding='UTF-8') as f:
-    xpath_dict = json.load(f)
 
-DELTA = int(os.environ.get('DELTA') or '1')
-print(f"DELTA = {DELTA}")
-
-print("cbirc.gov.cn")
-i = 1
-while i > -1:
-    CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
-    i = i + 1
-    content = fetch_url(CATEGORY_URL)
-    reportinfo = json.loads(content)
-    for article in reportinfo['data']['rows']:
-        try:
-            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
-            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
-                article['contentCN'] = repr(contentCN)[1:-1].strip()
-                if len(contentCN) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['contentCN'].split("\n"):
-                    CONTENT_ENG += translate(element) + '\n'
-                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
-                article['site'] = "National Financial Regulatory Administration of China"
-                article['originSite'] = "国家金融监督管理总局"
-                article['titleCN'] = article['docSubtitle']
-                article['title'] = translate(article['docSubtitle'])
-                article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
-                article['category']= "Policy Interpretation"
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                article['attachment'] = ''
-                article['author'] = ''
-                article['subtitle'] = translate(summary)
-                update_content(article)
-        except Exception as error:
-            print(error)
-
-print("csrc.gov.cn")
-i = 1
-while i > -1:
-    try:
-        if i == 1:
-            CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
-        else:
-            CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
-        i = i + 1
-        req = urllib.request.urlopen(CATEGORY_URL)
-        text = req.read()
-        html_text = text.decode("utf-8")
-        page = etree.HTML(html_text)
-        articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
-        for article in articlelist:
-            if isinstance(article, etree._Element):
-                subelement = etree.tostring(article).decode()
-                subpage = etree.HTML(subelement)
-                date = encode(subpage.xpath("//span[@class='date']"))
-                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                    i = -1
-                else:
-                    urls = subpage.xpath("//a/@href")
-                    for url in urls:
-                        try:
-                            article = {}
-                            url = "http://www.csrc.gov.cn" + url
-                            article['category']= "Policy Interpretation"
-                            crawl(url, article)
-                        except Exception as error:
-                            print(error)
-    except Exception as error:
-        i = -1
-        print(error)
-
-i = 1
-while i > -1:
-    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
-    i = i + 1
-    try:
-        content = fetch_url(CATEGORY_URL)
-        reportinfo = json.loads(content)
-        for article in reportinfo['data']['results']:
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                article['category']= "Financial News"
-                article['site'] = "Securities Regulatory Commission of China"
-                article['originSite'] = "证监会"
-                article['titleCN'] = article['title']
-                article['title'] = translate(article['titleCN'])
-                article['author'] = ''
-                article['contentCN'] = repr(article['content'])[1:-1].strip()
-                if len(article['contentCN']) < 10:
-                    continue
-                CONTENT_ENG = ''
-                for element in article['contentCN'].split("。"):
-                    CONTENT_ENG += translate(element) + ' '
-                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
-                article['subtitle'] = article['memo']
-                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
-                article['link'] = article['url']
-                article['attachment'] = ""
-                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
-                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
-                update_content(article)
-    except Exception as error:
-        print(error)
-
-print("data.eastmoney.com")
 def crawl_eastmoney(url, article):
     """
     Crawls the given URL and extracts information from the webpage.
@@ -179,193 +66,71 @@ def crawl_eastmoney(url, article):
     extract_reference(article)
     update_content(article)
 
-today = datetime.today().strftime('%Y-%m-%d')
-beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
-i = 0
-while i > -1:
-    URL = "https://reportapi.eastmoney.com/report/jg"
-    params = {
-        "cb": "datatable8544623",
-        "pageSize": "100",
-        "beginTime": beginDate,
-        "endTime": today,
-        "pageNo": i,
-        "qType": "3",
-    }
-    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
-    content = fetch_url(URL)
-    if content:
-        start_index = content.find("(")
-        if start_index != -1:
-            result = content[start_index + 1: -1]
-        else:
-            result = content
-        reportinfo = json.loads(result)
-        if reportinfo["size"] > 0:
-            i = i + 1
-            for article in reportinfo['data']:
-                try:
-                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
-                    crawl_eastmoney(url,article)
-                except Exception as error:
-                    print(error)
-        else:
-            i = -1
-    else:
-        print("Failed to fetch URL:", url)
-
-print("gov.cn")
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
-    else:
-        CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
-                        if "https://www.gov.cn" in url:
-                            article['category']= "Policy Interpretation"
-                            crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
-    else:
-        CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
-                        if "https://www.gov.cn" in url:
-                            article['site'] = "State Council of China"
-                            crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-print("mof.gov.cn")
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
-    else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
-                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
-                        article['category']= "Financial News"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
-    else:
-        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace("./", CATEGORY_URL)
-                        article['category']= "Policy Interpretation"
-                        print(url)
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-print("mofcom.gov.cn")
-categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
-for category in categories:
-    i = 1
-    while i > -1:
-        if i == 1:
-            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
-        else:
-            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
-        i = i + 1
-        try:
-            req = urllib.request.urlopen(URL)
-            text = req.read()
-            html_text = text.decode("utf-8")
-            page = etree.HTML(html_text)
-            articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
-            for article in articlelist:
-                if isinstance(article, etree._Element):
-                    subelement = etree.tostring(article).decode()
-                    subpage = etree.HTML(subelement)
-                    date = subpage.xpath("//span/text()")[0]
-                    parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
-                    if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                        i = -1
-                    else:
+@flow(name = "Data Collection China - Daily", log_prints = True)
+def main():
+    with open('xpath.json', 'r', encoding='UTF-8') as f:
+        xpath_dict = json.load(f)
+
+    DELTA = int(os.environ.get('DELTA') or '1')
+    print(f"DELTA = {DELTA}")
+
+    print("cbirc.gov.cn")
+    i = 1
+    while i > -1:
+        CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
+        i = i + 1
+        content = fetch_url(CATEGORY_URL)
+        reportinfo = json.loads(content)
+        for article in reportinfo['data']['rows']:
+            try:
+                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
+                parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+                    article['contentCN'] = repr(contentCN)[1:-1].strip()
+                    if len(contentCN) < 10:
+                        continue
+                    CONTENT_ENG = ''
+                    for element in article['contentCN'].split("\n"):
+                        CONTENT_ENG += translate(element) + '\n'
+                    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                    article['site'] = "National Financial Regulatory Administration of China"
+                    article['originSite'] = "国家金融监督管理总局"
+                    article['titleCN'] = article['docSubtitle']
+                    article['title'] = translate(article['docSubtitle'])
+                    article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
+                    article['category']= "Policy Interpretation"
+                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
+                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                    article['attachment'] = ''
+                    article['author'] = ''
+                    article['subtitle'] = translate(summary)
+                    update_content(article)
+            except Exception as error:
+                print(error)
+
+    print("csrc.gov.cn")
+    i = 1
+    while i > -1:
+        try:
+            if i == 1:
+                CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
+            else:
+                CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
+            i = i + 1
+            req = urllib.request.urlopen(CATEGORY_URL)
+            text = req.read()
+            html_text = text.decode("utf-8")
+            page = etree.HTML(html_text)
+            articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
+            for article in articlelist:
+                if isinstance(article, etree._Element):
+                    subelement = etree.tostring(article).decode()
+                    subpage = etree.HTML(subelement)
+                    date = encode(subpage.xpath("//span[@class='date']"))
+                    parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                    if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                        i = -1
+                    else:
@@ -373,11 +138,8 @@ for category in categories:
                         for url in urls:
                             try:
                                 article = {}
-                                if '/article/zcjd' in url:
-                                    url = "http://www.mofcom.gov.cn" + url
-                                    article['category']= "Policy Interpretation"
-                                else:
-                                    article['category']= "Policy Release"
+                                url = "http://www.csrc.gov.cn" + url
+                                article['category']= "Policy Interpretation"
                                 crawl(url, article)
                             except Exception as error:
                                 print(error)
@@ -385,137 +147,383 @@ for category in categories:
             i = -1
             print(error)
 
-
-print("ndrc.gov.cn")
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
-    else:
-        CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//span/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        if "www.gov.cn" in url:
-                            article['category']= "Policy Release"
-                        elif "../../zcfb/" in url:
-                            url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
-                            article['category']= "Policy Release"
-                        else:
-                            url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                            url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
-                            article['category']= "Policy Interpretation"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-print("safe.gov.cn")
-i = 1
-while i > -1:
-    if i == 1:
-        CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
-    else:
-        CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//dd/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = "https://www.safe.gov.cn" + url
-                        article['category']= "Policy Interpretation"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-i = 1
-while i > -1:
-    if i == 1:
-        CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
-    else:
-        CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = subpage.xpath("//dd/text()")[0]
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = "https://www.safe.gov.cn" + url
-                        article['category']= "Data Interpretation"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
-
-print("stats.gov.hk")
-i = 0
-while i > -1:
-    if i == 0:
-        CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
-    else:
-        CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
-    i = i + 1
-    req = urllib.request.urlopen(CATEGORY_URL)
-    text = req.read()
-    html_text = text.decode("utf-8")
-    page = etree.HTML(html_text)
-    articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
-    for article in articlelist:
-        if isinstance(article, etree._Element):
-            subelement = etree.tostring(article).decode()
-            subpage = etree.HTML(subelement)
-            date = encode(subpage.xpath("//span"))
-            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
-            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
-                i = -1
-            else:
-                urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
-                for url in urls:
-                    try:
-                        article = {}
-                        url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
-                        article['category']= "Data Interpretation"
-                        crawl(url, article)
-                    except Exception as error:
-                        print(error)
+    i = 1
+    while i > -1:
+        CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
+        i = i + 1
+        try:
+            content = fetch_url(CATEGORY_URL)
+            reportinfo = json.loads(content)
+            for article in reportinfo['data']['results']:
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    article['category']= "Financial News"
+                    article['site'] = "Securities Regulatory Commission of China"
+                    article['originSite'] = "证监会"
+                    article['titleCN'] = article['title']
+                    article['title'] = translate(article['titleCN'])
+                    article['author'] = ''
+                    article['contentCN'] = repr(article['content'])[1:-1].strip()
+                    if len(article['contentCN']) < 10:
+                        continue
+                    CONTENT_ENG = ''
+                    for element in article['contentCN'].split("。"):
+                        CONTENT_ENG += translate(element) + ' '
+                    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                    article['subtitle'] = article['memo']
+                    article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
+                    article['link'] = article['url']
+                    article['attachment'] = ""
+                    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
+                    update_content(article)
+        except Exception as error:
+            print(error)
+
+    print("data.eastmoney.com")
+
+    today = datetime.today().strftime('%Y-%m-%d')
+    beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
+    i = 0
+    while i > -1:
+        URL = "https://reportapi.eastmoney.com/report/jg"
+        params = {
+            "cb": "datatable8544623",
+            "pageSize": "100",
+            "beginTime": beginDate,
+            "endTime": today,
+            "pageNo": i,
+            "qType": "3",
+        }
+        URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
+        content = fetch_url(URL)
+        if content:
+            start_index = content.find("(")
+            if start_index != -1:
+                result = content[start_index + 1: -1]
+            else:
+                result = content
+            reportinfo = json.loads(result)
+            if reportinfo["size"] > 0:
+                i = i + 1
+                for article in reportinfo['data']:
+                    try:
+                        url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
+                        crawl_eastmoney(url,article)
+                    except Exception as error:
+                        print(error)
+            else:
+                i = -1
+        else:
+            print("Failed to fetch URL:", url)
+
+    print("gov.cn")
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
+        else:
+            CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                            if "https://www.gov.cn" in url:
+                                article['category']= "Policy Interpretation"
+                                crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
+        else:
+            CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                            if "https://www.gov.cn" in url:
+                                article['site'] = "State Council of China"
+                                crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    print("mof.gov.cn")
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+        else:
+            CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+                            url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+                            article['category']= "Financial News"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+        else:
+            CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace("./", CATEGORY_URL)
+                            article['category']= "Policy Interpretation"
+                            print(url)
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    print("mofcom.gov.cn")
+    categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
+    for category in categories:
+        i = 1
+        while i > -1:
+            if i == 1:
+                URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
+            else:
+                URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
+            i = i + 1
+            try:
+                req = urllib.request.urlopen(URL)
+                text = req.read()
+                html_text = text.decode("utf-8")
+                page = etree.HTML(html_text)
+                articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
+                for article in articlelist:
+                    if isinstance(article, etree._Element):
+                        subelement = etree.tostring(article).decode()
+                        subpage = etree.HTML(subelement)
+                        date = subpage.xpath("//span/text()")[0]
+                        parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+                        if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                            i = -1
+                        else:
+                            urls = subpage.xpath("//a/@href")
+                            for url in urls:
+                                try:
+                                    article = {}
+                                    if '/article/zcjd' in url:
+                                        url = "http://www.mofcom.gov.cn" + url
+                                        article['category']= "Policy Interpretation"
+                                    else:
+                                        article['category']= "Policy Release"
+                                    crawl(url, article)
+                                except Exception as error:
+                                    print(error)
+            except Exception as error:
+                i = -1
+                print(error)
+
+
+    print("ndrc.gov.cn")
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
+        else:
+            CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            if "www.gov.cn" in url:
+                                article['category']= "Policy Release"
+                            elif "../../zcfb/" in url:
+                                url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
+                                article['category']= "Policy Release"
+                            else:
+                                url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                                url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                                article['category']= "Policy Interpretation"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    print("safe.gov.cn")
+    i = 1
+    while i > -1:
+        if i == 1:
+            CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
+        else:
+            CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//dd/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = "https://www.safe.gov.cn" + url
+                            article['category']= "Policy Interpretation"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    i = 1
+    while i > -1:
+        if i == 1:
+            CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
+        else:
+            CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//dd/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = "https://www.safe.gov.cn" + url
+                            article['category']= "Data Interpretation"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+    print("stats.gov.hk")
+    i = 0
+    while i > -1:
+        if i == 0:
+            CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
+        else:
+            CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
+        i = i + 1
+        req = urllib.request.urlopen(CATEGORY_URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = encode(subpage.xpath("//span"))
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+                if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
+                            article['category']= "Data Interpretation"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+
+if __name__ == '__main__':
+    main()
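
With the flow wrapper in place, `python daily.py` still performs a single ad-hoc run via the `__main__` guard, and `log_prints = True` routes the script's print output into Prefect's logs. The commit itself does not add a schedule; to run the flow daily, one option is to serve it with a deployment. A minimal sketch, assuming Prefect 2.x (the deployment name and cron string below are illustrative, not part of this commit):

    from daily import main

    if __name__ == '__main__':
        # Serve the flow with a daily schedule. The flow itself is the
        # @flow-decorated main() from daily.py; name and cron are assumptions.
        main.serve(name="data-collection-china-daily", cron="0 6 * * *")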
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
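
(The requirements.txt change is shown as binary in this view; presumably it adds the prefect dependency that daily.py now imports.)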