Muhammad Abdur Rahman Saad committed
Commit 4259f95 · Parent: 91fadcf

fix logging issue
Changed files:
- source/cbirc.py     +4 -4
- source/csrc.py      +11 -11
- source/eastmoney.py +8 -8
- source/gov.py       +5 -5
- source/mof.py       +5 -5
- source/mofcom.py    +5 -5
- source/ndrc.py      +4 -4
- source/safe.py      +5 -5
- source/stats.py     +4 -4
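The change is the same across all nine crawlers: drop the stdlib `logging` module and emit through Prefect's run logger, so records are attached to the task run and surface in the Prefect UI. A minimal sketch of the pattern, assuming Prefect 2.x; the `demo` flow and the simulated failure are illustrative, not part of this repo:

from prefect import flow, task, get_run_logger

@task
def crawl(delta: int) -> None:
    # get_run_logger() must be called inside a flow/task run; outside a run
    # context it raises an error (MissingContextError in Prefect 2.x). That is
    # why every crawl task in this commit creates its logger at the top of
    # the function body rather than at module level.
    logger = get_run_logger()
    logger.info("crawling articles from the last %s day(s)", delta)
    try:
        raise ValueError("simulated fetch failure")  # stand-in for crawl work
    except Exception as error:
        logger.error(error)

@flow
def demo() -> None:
    crawl(7)

if __name__ == "__main__":
    demo()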
source/cbirc.py CHANGED
@@ -1,10 +1,9 @@
 """Module to crawl the website 'https://www.cbirc.gov.cn' to fetch and process articles."""
 import json
-import logging
 import time
 import uuid
 from datetime import datetime, timedelta
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.summarizer import summarize
 from controllers.utils import (
@@ -29,7 +28,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("cbirc.gov.cn")
+    logger = get_run_logger()
+    logger.info("cbirc.gov.cn")
     i = 1
     while i > -1:
         category_url= f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
@@ -73,4 +73,4 @@ def crawl(delta):
                 article['subtitle'] = summarize(article['content'])
                 update_content(article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/csrc.py CHANGED
@@ -1,12 +1,11 @@
 """Module to crawl the website 'https://www.csrc.gov.cn' to fetch and process articles."""
 import json
-import logging
 import time
 import urllib.request
 import uuid
 from datetime import datetime, timedelta
 
-from prefect import task
+from prefect import task, get_run_logger
 from lxml import etree
 
 from controllers.summarizer import summarize
@@ -33,7 +32,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("csrc.gov.cn")
+    logger = get_run_logger()
+    logger.info("csrc.gov.cn")
     i = 1
     while i > -1:
         try:
@@ -42,7 +42,7 @@ def crawl(delta):
             else:
                 category_url = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
             i = i + 1
-            logging.info(category_url)
+            logger.info(category_url)
             req = urllib.request.urlopen(category_url)
             text = req.read()
             html_text = text.decode("utf-8")
@@ -66,20 +66,20 @@ def crawl(delta):
                     article = {}
                     url = "http://www.csrc.gov.cn" + url
                     article['category'] = "Policy Interpretation"
-                    logging.info(url)
+                    logger.info(url)
                     crawl_by_url(url, article)
                 except Exception as error:
-                    logging.error(error)
+                    logger.error(error)
         except Exception as error:
             i = -1
-            logging.error(error)
+            logger.error(error)
 
     i = 1
     while i > -1:
         category_url = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
         i = i + 1
         try:
-            logging.info(category_url)
+            logger.info(category_url)
             content = fetch_url(category_url)
             if content is None:
                 i = -1
@@ -87,7 +87,7 @@ def crawl(delta):
             reportinfo = json.loads(content)
             if len(reportinfo['data']['results']) == 0:
                 i = -1
-            logging.info(len(reportinfo['data']['results']))
+            logger.info(len(reportinfo['data']['results']))
             for article in reportinfo['data']['results']:
                 parsed_datetime = datetime.strptime(
                     time.strftime(
@@ -122,8 +122,8 @@ def crawl(delta):
                 article['id'] = uuid.uuid5(
                     uuid.NAMESPACE_OID,
                     article['titleCN'] + article['publishDate'])
-                logging.info(article)
+                logger.info(article)
                 # update_content(article)
         except Exception as error:
             i = -1
-            logging.error(error)
+            logger.error(error)
source/eastmoney.py CHANGED
@@ -1,12 +1,11 @@
 """Module to crawl the website 'eastmoney.com' to fetch and process articles."""
 import json
-import logging
 import urllib.request
 import uuid
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
 
-from prefect import task
+from prefect import task, get_run_logger
 from lxml import etree
 
 from controllers.summarizer import summarize
@@ -55,7 +54,7 @@ def _crawl(url, article):
         article['site'] = translate(article['orgSName'])
     else:
         article['site'] = translate(article['orgName'])
-    logging.info(article)
+    print(f'INFO - {article}')
     article['titleCN'] = article['title']
     article['title'] = translate(article['title'])
     article['author'] = translate(article['researcher'])
@@ -68,7 +67,7 @@ def _crawl(url, article):
     for element in contentcn.split("\n"):
         contenteng += translate(element) + '\n'
     article['content'] = repr(contenteng)[1:-1].strip()
-    logging.info(article)
+    print(f'INFO - {article}')
     article['subtitle'] = summarize(article['content'])
     article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
     article['publishDate'] = datemodifier(
@@ -77,7 +76,7 @@ def _crawl(url, article):
         article['titleCN'] + article['publishDate'])
     article['sentimentScore'], article[
         'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
-    logging.info(article)
+    print(f'INFO - {article}')
     extract_reference(article)
     update_content(article)
 
@@ -95,7 +94,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("data.eastmoney.com")
+    logger = get_run_logger()
+    logger.info("data.eastmoney.com")
     today = datetime.today().strftime('%Y-%m-%d')
     i = 0
     while i > -1:
@@ -123,8 +123,8 @@ def crawl(delta):
                     url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                     _crawl(url, article)
                 except Exception as error:
-                    logging.error(error)
+                    logger.error(error)
             else:
                 i = -1
         else:
-            logging.error("Failed to fetch URL: %s", category_url)
+            logger.error("Failed to fetch URL: %s", category_url)
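Note the outlier above: `_crawl` is a plain helper rather than a `@task`, and the commit falls back to `print(f'INFO - ...')` there instead of the run logger. A hedged alternative sketch, assuming `_crawl` is only ever invoked from inside the running `crawl` task, in which case the run context is still active and `get_run_logger()` resolves; the names and URL below are illustrative, not from this repo:

from prefect import flow, task, get_run_logger

def _crawl(url: str, article: dict) -> None:
    # Plain helper, but called from within the crawl() task run, so the
    # Prefect run context is still active and get_run_logger() works here
    # without resorting to print().
    logger = get_run_logger()
    logger.info("processing %s (%s)", url, article.get("category"))

@task
def crawl() -> None:
    logger = get_run_logger()
    logger.info("data.eastmoney.com")
    _crawl("https://data.eastmoney.com/report/example", {"category": "Macro"})

@flow
def demo() -> None:
    crawl()

if __name__ == "__main__":
    demo()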
source/gov.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -20,7 +19,8 @@ def crawl(delta):
     Returns:
         None
     """
-    logging.info("gov.cn")
+    logger = get_run_logger()
+    logger.info("gov.cn")
     i = 0
     while i > -1:
         if i == 0:
@@ -53,7 +53,7 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
     i = 0
     while i > -1:
         if i == 0:
@@ -86,4 +86,4 @@ def crawl(delta):
                 article['site'] = "State Council of China"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/mof.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.mof.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -20,7 +19,8 @@ def crawl(delta):
     Returns:
         None
     """
-    logging.info("mof.gov.cn")
+    logger = get_run_logger()
+    logger.info("mof.gov.cn")
     i = 0
     while i > -1:
         if i == 0:
@@ -56,7 +56,7 @@ def crawl(delta):
                 article['category'] = "Financial News"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
 
     i = 0
     while i > -1:
@@ -91,4 +91,4 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/mofcom.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.mofcom.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -20,7 +19,8 @@ def crawl(delta):
     Returns:
         None
     """
-    logging.info("mofcom.gov.cn")
+    logger = get_run_logger()
+    logger.info("mofcom.gov.cn")
     categories = ['jdzhsw', 'jdgnmy', 'jddwmy', 'jdtzhz']
     for category in categories:
         i = 1
@@ -60,7 +60,7 @@ def crawl(delta):
                     article['category'] = "Policy Release"
                     crawl_by_url(url, article)
                 except Exception as error:
-                    logging.error(error)
+                    logger.error(error)
             except Exception as error:
                 i = -1
-                logging.error(error)
+                logger.error(error)
source/ndrc.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.ndrc.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -23,7 +22,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("ndrc.gov.cn")
+    logger = get_run_logger()
+    logger.info("ndrc.gov.cn")
     i = 0
     while i > -1:
         if i == 0:
@@ -65,4 +65,4 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/safe.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.safe.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url
 
@@ -20,7 +19,8 @@ def crawl(delta):
     Returns:
         None
     """
-    logging.info("safe.gov.cn")
+    logger = get_run_logger()
+    logger.info("safe.gov.cn")
     i = 1
     while i > -1:
         if i == 1:
@@ -52,7 +52,7 @@ def crawl(delta):
                 article['category'] = "Policy Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
 
     i = 1
     while i > -1:
@@ -85,4 +85,4 @@ def crawl(delta):
                 article['category'] = "Data Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)
source/stats.py CHANGED
@@ -1,11 +1,10 @@
 """Module to crawl the website 'https://www.stats.gov.cn' to fetch and process articles."""
-import logging
 import time
 import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
-from prefect import task
+from prefect import task, get_run_logger
 
 from controllers.utils import crawl_by_url, encode
 
@@ -23,7 +22,8 @@ def crawl(delta):
     Raises:
         None
     """
-    logging.info("stats.gov.cn")
+    logger = get_run_logger()
+    logger.info("stats.gov.cn")
     i = 0
     while i > -1:
         if i == 0:
@@ -55,4 +55,4 @@ def crawl(delta):
                 article['category'] = "Data Interpretation"
                 crawl_by_url(url, article)
         except Exception as error:
-            logging.error(error)
+            logger.error(error)