OxbridgeEconomics committed
Commit 01677a0 · 1 Parent(s): 270ad28
controllers/utils.py CHANGED
@@ -534,6 +534,7 @@ def extract_reference(row):
         update_reference(row)
     except Exception as error:
         logging.error(error)
+    return None
 
 
 def translist(infolist):
@@ -651,11 +652,6 @@ def crawl_by_url(url, article):
         contenteng += translate(element) + '. '
     article['content'] = repr(contenteng)[1:-1].strip()
     article['subtitle'] = summarize(article['content'])
-    # if 'subtitle' in xpath_dict[domain]:
-    #     article['subtitle'] = translate(
-    #         encode(page.xpath(xpath_dict[domain]['subtitle'])))
-    # else:
-    #     article['subtitle'] = translate(summary)
     article['publishDate'] = datemodifier(
         encode(page.xpath(xpath_dict[domain]['publishdate'])),
         xpath_dict[domain]['datetime_format'])
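Note: the net effect of the first hunk is that extract_reference now always ends with an explicit return. A sketch of the resulting tail (body above the try block elided; indentation approximate):

    def extract_reference(row):
        ...                           # earlier body, unchanged by this diff
        try:
            update_reference(row)
        except Exception as error:
            logging.error(error)
        return None                   # new: explicit fall-through return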
main.py CHANGED
@@ -9,6 +9,7 @@ import os
 from dotenv import load_dotenv
 
 from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof
+from glue import glue_job_run
 
 load_dotenv()
 
@@ -29,3 +30,4 @@ if __name__ == '__main__':
     mofcom.crawl(delta)
     ndrc.crawl(delta)
     mof.crawl(delta)
+    glue_job_run()
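The glue module itself is not part of this diff. A minimal sketch of what glue_job_run might look like, assuming it wraps boto3's AWS Glue client; the job name is hypothetical:

    # glue.py -- sketch only; the real module is not shown in this commit
    import boto3

    def glue_job_run():
        """Start the downstream AWS Glue ETL job once crawling finishes."""
        client = boto3.client('glue')  # region/credentials from the environment
        # start_job_run is the standard boto3 Glue API; the JobName is assumed
        response = client.start_job_run(JobName='data-collection-etl')
        return response['JobRunId']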
requirements.txt CHANGED
@@ -192,3 +192,4 @@ websocket-client==1.8.0
 Werkzeug==3.0.3
 wrapt==1.16.0
 yarl==1.9.4
+prefect==2.20.2
source/cbirc.py CHANGED
@@ -4,6 +4,7 @@ import logging
 import time
 import uuid
 from datetime import datetime, timedelta
+from prefect import flow
 
 from controllers.summarizer import summarize
 from controllers.utils import (
@@ -14,7 +15,7 @@ from controllers.utils import (
     update_content,
 )
 
-
+@flow(name = "Data Collection - cbirc")
 def crawl(delta):
     """
     Crawls the website 'https://www.cbirc.gov.cn' to fetch and process articles.
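With the decorator applied, each call to crawl() is now tracked as a Prefect flow run, so the call sites in main.py need no other changes. A quick usage sketch (the delta value is illustrative):

    from source import cbirc

    # Runs as the "Data Collection - cbirc" flow: Prefect records state,
    # logs, and timing for the run.
    cbirc.crawl(1)  # look-back window; main.py supplies the real delta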
source/csrc.py CHANGED
@@ -6,6 +6,7 @@ import urllib.request
 import uuid
 from datetime import datetime, timedelta
 
+from prefect import flow
 from lxml import etree
 
 from controllers.summarizer import summarize
@@ -18,7 +19,7 @@ from controllers.utils import (
     update_content,
 )
 
-
+@flow(name = "Data Collection - csrc")
 def crawl(delta):
     """
     Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta.
source/eastmoney.py CHANGED
@@ -6,6 +6,7 @@ import uuid
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
 
+from prefect import flow
 from lxml import etree
 
 from controllers.summarizer import summarize
@@ -45,7 +46,7 @@ def _crawl(url, article):
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
-    contentcn, summary = encode_content(
+    contentcn, _ = encode_content(
         page.xpath(xpath_dict[domain]['content']))
     article['attachment'] = encode(page.xpath(
         xpath_dict[domain]['attachment']))
@@ -54,6 +55,7 @@ def _crawl(url, article):
         article['site'] = translate(article['orgSName'])
     else:
         article['site'] = translate(article['orgName'])
+    logging.info(article)
     article['titleCN'] = article['title']
     article['title'] = translate(article['title'])
     article['author'] = translate(article['researcher'])
@@ -66,6 +68,7 @@ def _crawl(url, article):
     for element in contentcn.split("\n"):
         contenteng += translate(element) + '\n'
     article['content'] = repr(contenteng)[1:-1].strip()
+    logging.info(article)
     article['subtitle'] = summarize(article['content'])
     article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
     article['publishDate'] = datemodifier(
@@ -74,11 +77,11 @@ def _crawl(url, article):
         article['titleCN'] + article['publishDate'])
     article['sentimentScore'], article[
         'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
+    logging.info(article)
     extract_reference(article)
     update_content(article)
-    logging.info(article)
-
 
+@flow(name = "Data Collection - eastmoney")
 def crawl(delta):
     """
     Crawls the website data.eastmoney.com and retrieves reports within a specified time range.
source/gov.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - gov")
 def crawl(delta):
     """
     Crawls the government website for policy interpretation and latest news articles.
source/mof.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - mof")
 def crawl(delta):
     """
     Crawls the website to retrieve articles based on the specified delta.
source/mofcom.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - mofcom")
 def crawl(delta):
     """
     Crawls the website http://www.mofcom.gov.cn to retrieve articles based on the specified delta.
source/ndrc.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - ndrc")
 def crawl(delta):
     """
     Crawls the website "https://www.ndrc.gov.cn/xxgk/jd/jd/" and retrieves articles based on the specified time delta.
source/safe.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - safe")
 def crawl(delta):
     """
     Crawls the website "https://www.safe.gov.cn" to retrieve policy interpretation and data interpretation articles.
source/stats.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url, encode
 
-
+@flow(name = "Data Collection - stats")
 def crawl(delta):
     """
     Crawls the website "https://www.stats.gov.cn/sj/sjjd/" and retrieves articles within a specified time range.
utils.py CHANGED
@@ -14,7 +14,7 @@ import requests
 import boto3
 from dotenv import load_dotenv
 from lxml import etree
-from googletrans import Translator
+# from googletrans import Translator
 from transformers import pipeline
 from PyPDF2 import PdfReader
 from langdetect import detect
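Commenting the import out assumes nothing in utils.py still constructs a Translator. If the dependency were meant to stay optional rather than removed, a guarded import is the usual alternative; a sketch, not what this commit does:

    try:
        from googletrans import Translator  # optional translation backend
    except ImportError:
        Translator = None  # callers must check for None before use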