OxbridgeEconomics committed
Commit 01677a0 · 1 Parent(s): 270ad28
controllers/utils.py CHANGED
@@ -534,6 +534,7 @@ def extract_reference(row):
         update_reference(row)
     except Exception as error:
         logging.error(error)
+    return None
 
 
 def translist(infolist):
@@ -651,11 +652,6 @@ def crawl_by_url(url, article):
         contenteng += translate(element) + '. '
     article['content'] = repr(contenteng)[1:-1].strip()
     article['subtitle'] = summarize(article['content'])
-    # if 'subtitle' in xpath_dict[domain]:
-    #     article['subtitle'] = translate(
-    #         encode(page.xpath(xpath_dict[domain]['subtitle'])))
-    # else:
-    #     article['subtitle'] = translate(summary)
     article['publishDate'] = datemodifier(
         encode(page.xpath(xpath_dict[domain]['publishdate'])),
         xpath_dict[domain]['datetime_format'])
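Note: the net effect of the first hunk is that extract_reference now always ends with an explicit return. A sketch of the resulting tail (body above the try block elided; indentation approximate):

    def extract_reference(row):
        ...                           # earlier body, unchanged by this diff
        try:
            update_reference(row)
        except Exception as error:
            logging.error(error)
        return None                   # new: explicit fall-through return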
main.py CHANGED
@@ -9,6 +9,7 @@ import os
 from dotenv import load_dotenv
 
 from source import cbirc, csrc, eastmoney, gov, mofcom, ndrc, safe, stats, mof
+from glue import glue_job_run
 
 load_dotenv()
 
@@ -29,3 +30,4 @@ if __name__ == '__main__':
     mofcom.crawl(delta)
     ndrc.crawl(delta)
     mof.crawl(delta)
+    glue_job_run()
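The glue module itself is not part of this diff. A minimal sketch of what glue_job_run might look like, assuming it wraps boto3's AWS Glue client; the job name is hypothetical:

    # glue.py -- sketch only; the real module is not shown in this commit
    import boto3

    def glue_job_run():
        """Start the downstream AWS Glue ETL job once crawling finishes."""
        client = boto3.client('glue')  # region/credentials from the environment
        # start_job_run is the standard boto3 Glue API; the JobName is assumed
        response = client.start_job_run(JobName='data-collection-etl')
        return response['JobRunId']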
requirements.txt CHANGED
@@ -192,3 +192,4 @@ websocket-client==1.8.0
 Werkzeug==3.0.3
 wrapt==1.16.0
 yarl==1.9.4
+prefect==2.20.2
source/cbirc.py CHANGED
@@ -4,6 +4,7 @@ import logging
 import time
 import uuid
 from datetime import datetime, timedelta
+from prefect import flow
 
 from controllers.summarizer import summarize
 from controllers.utils import (
@@ -14,7 +15,7 @@ from controllers.utils import (
     update_content,
 )
 
-
+@flow(name = "Data Collection - cbirc")
 def crawl(delta):
     """
     Crawls the website 'https://www.cbirc.gov.cn' to fetch and process articles.
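With the decorator applied, each call to crawl() is now tracked as a Prefect flow run, so the call sites in main.py need no other changes. A quick usage sketch (the delta value is illustrative):

    from source import cbirc

    # Runs as the "Data Collection - cbirc" flow: Prefect records state,
    # logs, and timing for the run.
    cbirc.crawl(1)  # look-back window; main.py supplies the real delta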
source/csrc.py CHANGED
@@ -6,6 +6,7 @@ import urllib.request
 import uuid
 from datetime import datetime, timedelta
 
+from prefect import flow
 from lxml import etree
 
 from controllers.summarizer import summarize
@@ -18,7 +19,7 @@ from controllers.utils import (
     update_content,
 )
 
-
+@flow(name = "Data Collection - csrc")
 def crawl(delta):
     """
     Crawls the website http://www.csrc.gov.cn to retrieve articles based on the specified delta.
source/eastmoney.py CHANGED
@@ -6,6 +6,7 @@ import uuid
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
 
+from prefect import flow
 from lxml import etree
 
 from controllers.summarizer import summarize
@@ -45,7 +46,7 @@ def _crawl(url, article):
     text = req.read()
     html_text = text.decode("utf-8")
     page = etree.HTML(html_text)
-    contentcn, summary = encode_content(
+    contentcn, _ = encode_content(
         page.xpath(xpath_dict[domain]['content']))
     article['attachment'] = encode(page.xpath(
         xpath_dict[domain]['attachment']))
@@ -54,6 +55,7 @@ def _crawl(url, article):
         article['site'] = translate(article['orgSName'])
     else:
         article['site'] = translate(article['orgName'])
+    logging.info(article)
     article['titleCN'] = article['title']
     article['title'] = translate(article['title'])
     article['author'] = translate(article['researcher'])
@@ -66,6 +68,7 @@ def _crawl(url, article):
     for element in contentcn.split("\n"):
         contenteng += translate(element) + '\n'
     article['content'] = repr(contenteng)[1:-1].strip()
+    logging.info(article)
     article['subtitle'] = summarize(article['content'])
     article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
     article['publishDate'] = datemodifier(
@@ -74,11 +77,11 @@ def _crawl(url, article):
         article['titleCN'] + article['publishDate'])
     article['sentimentScore'], article[
         'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
+    logging.info(article)
     extract_reference(article)
     update_content(article)
-    logging.info(article)
-
 
+@flow(name = "Data Collection - eastmoney")
 def crawl(delta):
     """
     Crawls the website data.eastmoney.com and retrieves reports within a specified time range.
source/gov.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - gov")
 def crawl(delta):
     """
     Crawls the government website for policy interpretation and latest news articles.
source/mof.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - mof")
 def crawl(delta):
     """
     Crawls the website to retrieve articles based on the specified delta.
source/mofcom.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - mofcom")
 def crawl(delta):
     """
     Crawls the website http://www.mofcom.gov.cn to retrieve articles based on the specified delta.
source/ndrc.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - ndrc")
 def crawl(delta):
     """
     Crawls the website "https://www.ndrc.gov.cn/xxgk/jd/jd/" and retrieves articles based on the specified time delta.
source/safe.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url
 
-
+@flow(name = "Data Collection - safe")
 def crawl(delta):
     """
     Crawls the website "https://www.safe.gov.cn" to retrieve policy interpretation and data interpretation articles.
source/stats.py CHANGED
@@ -5,10 +5,11 @@ import urllib.request
 from datetime import datetime, timedelta
 
 from lxml import etree
+from prefect import flow
 
 from controllers.utils import crawl_by_url, encode
 
-
+@flow(name = "Data Collection - stats")
 def crawl(delta):
     """
     Crawls the website "https://www.stats.gov.cn/sj/sjjd/" and retrieves articles within a specified time range.
utils.py CHANGED
@@ -14,7 +14,7 @@ import requests
 import boto3
 from dotenv import load_dotenv
 from lxml import etree
-from googletrans import Translator
+# from googletrans import Translator
 from transformers import pipeline
 from PyPDF2 import PdfReader
 from langdetect import detect
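Commenting the import out assumes nothing in utils.py still constructs a Translator. If the dependency were meant to stay optional rather than removed, a guarded import is the usual alternative; a sketch, not what this commit does:

    try:
        from googletrans import Translator  # optional translation backend
    except ImportError:
        Translator = None  # callers must check for None before use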