Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

App Files Files Community

OxbridgeEconomics committed on Apr 21, 2024

Commit

eaaafcb

1 Parent(s): 043eca4

commit

Browse files

Files changed (2) hide show

.github/workflows/daily.yml +42 -0
daily.py +501 -0

.github/workflows/daily.yml ADDED Viewed

	@@ -0,0 +1,42 @@

+# This workflow installs the Python dependencies and runs the daily data-collection script.
+# It runs every day at 16:00 UTC and can also be started manually with a custom `delta`.
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+name: Data Collection - Daily
+on:
+  schedule:
+    - cron: '0 16 * * *'
+  workflow_dispatch:
+    inputs:
+      # Look-back window in days, read by the script through the DELTA environment variable.
+      delta:
+        description: 'delta'
+        required: true
+        default: '1'
+permissions:
+  contents: read
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    # NOTE(review): GitHub-hosted runners cap job execution time well below this value; confirm the intended limit.
+    timeout-minutes: 14400
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        pip install transformers
+        pip install tensorflow
+        pip install tf-keras
+    - name: Data Collection
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        # Scheduled runs carry no inputs, so fall back to the same default as the manual trigger.
+        DELTA: ${{ github.event.inputs.delta || '1' }}
+      run: |
+        # Run the script added together with this workflow (was cbirc.py, which covers only one site).
+        python daily.py

daily.py ADDED Viewed

	@@ -0,0 +1,501 @@

+import os
+import json
+import uuid
+import time
+import urllib.request
+from lxml import etree
+from datetime import datetime, timedelta
+from urllib.parse import urlparse
+from utils import (encode,
+                   translate,
+                   sentiment_computation,
+                   upsert_content,
+                   fetch_url,
+                   extract_from_pdf,
+                   crawl,
+                   datemodifier,
+                   encode_content)
+with open('xpath.json', 'r', encoding='UTF-8') as f:
+    xpath_dict = json.load(f)
+DELTA = int(os.environ.get('DELTA', '1'))
+print(f"DELTA = {DELTA}")
+# cbirc.gov.cn
+i = 1
+while i > -1:
+    CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
+    i = i + 1
+    content = fetch_url(CATEGORY_URL)
+    reportinfo = json.loads(content)
+    for article in reportinfo['data']['rows']:
+        try:
+            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
+            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
+                article['contentCN'] = repr(contentCN)[1:-1].strip()
+                if len(contentCN) < 10:
+                    continue
+                CONTENT_ENG = ''
+                for element in article['contentCN'].split("\n"):
+                    CONTENT_ENG += translate(element) + '\n'
+                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                article['site'] = "National Financial Regulatory Administration of China"
+                article['originSite'] = "国家金融监督管理总局"
+                article['titleCN'] = article['docSubtitle']
+                article['title'] = translate(article['docSubtitle'])
+                article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
+                article['category']= "Policy Interpretation"
+                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
+                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                article['attachment'] = ''
+                article['author'] = ''
+                article['subtitle'] = translate(summary)
+                upsert_content(article)
+        except Exception as error:
+            print(error)
+# csrc.gov.cn
+i = 1
+while i > -1:
+    if i == 1:
+        CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
+    else:
+        CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = encode(subpage.xpath("//span[@class='date']"))
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = "http://www.csrc.gov.cn" + url
+                        article['category']= "Policy Interpretation"
+                        crawl(url, article)
+                    except Exception as error:
+                       print(error)
+i = 1
+while i > -1:
+    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
+    i = i + 1
+    content = fetch_url(CATEGORY_URL)
+    reportinfo = json.loads(content)
+    for article in reportinfo['data']['results']:
+        try:
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                article['category']= "Financial News"
+                article['site'] = "Securities Regulatory Commission of China"
+                article['originSite'] = "证监会"
+                article['titleCN'] = article['title']
+                article['title'] = translate(article['titleCN'])
+                article['author'] = ''
+                article['contentCN'] = repr(article['content'])[1:-1].strip()
+                if len(article['contentCN']) < 10:
+                    continue
+                CONTENT_ENG = ''
+                for element in article['contentCN'].split("。"):
+                    CONTENT_ENG += translate(element) + ' '
+                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+                article['subtitle'] = article['memo']
+                article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
+                article['link'] = article['url']
+                article['attachment'] = ""
+                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
+                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
+                upsert_content(article)
+        except Exception as error:
+            print(error)
+# data.eastmoney.com
+def crawl_eastmoney(url, article):
+    domain = urlparse(url).netloc
+    req = urllib.request.urlopen(url)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
+    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
+    article['link'] = url
+    if article['orgSName'] == "''":
+        article['site'] = translate(article['orgSName'])
+    else:
+        article['site'] = translate(article['orgName'])
+    article['titleCN'] = article['title']
+    article['title'] = translate(article['title'])
+    article['author'] = translate(article['researcher'])
+    article['originAuthor'] = article['researcher']
+    article['contentCN'] = repr(contentCN)[1:-1].strip()
+    article['subtitle'] = translate(summary)
+    article['category'] = "Macroeconomic Research"
+    if len(article['contentCN']) < 10:
+        return None
+    CONTENT_ENG = ''
+    for element in contentCN.split("\n"):
+        CONTENT_ENG += translate(element) + '\n'
+    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
+    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
+    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
+    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
+    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
+    upsert_content(article)
+today = datetime.today().strftime('%Y-%m-%d')
+beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
+i = 0
+while i > -1:
+    URL = "https://reportapi.eastmoney.com/report/jg"
+    params = {
+        "cb": "datatable8544623",
+        "pageSize": "100",
+        "beginTime": beginDate,
+        "endTime": today,
+        "pageNo": i,
+        "qType": "3",
+    }
+    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
+    print(URL)
+    content = fetch_url(URL)
+    if content:
+        start_index = content.find("(")
+        if start_index != -1:
+            result = content[start_index + 1: -1]
+        else:
+            result = content
+        reportinfo = json.loads(result)
+        if reportinfo["size"] > 0:
+            i = i + 1
+            for article in reportinfo['data']:
+                try:
+                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
+                    crawl_eastmoney(url,article)
+                except Exception as error:
+                    print(error)
+        else:
+            print(reportinfo)
+            i = -1
+    else:
+        print("Failed to fetch URL:", url)
+# gov.cn
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
+    else:
+        CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                        if "https://www.gov.cn" in url:
+                            article['category']= "Policy Interpretation"
+                            crawl(url, article)
+                    except Exception as error:
+                        print(error)
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
+    else:
+        CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace('../', 'https://www.gov.cn/zhengce/')
+                        if "https://www.gov.cn" in url:
+                            article['site'] = "State Council of China"
+                            crawl(url, article)
+                    except Exception as error:
+                        print(error)
+# mof.gov.cn
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
+    else:
+        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
+    i = i + 1
+    print(CATEGORY_URL)
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
+                        url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
+                        print(url)
+                        article['category']= "Financial News"
+                        crawl(url, article)
+                    except Exception as error:
+                        print(error)
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
+    else:
+        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
+    i = i + 1
+    print(CATEGORY_URL)
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace("./", CATEGORY_URL)
+                        article['category']= "Policy Interpretation"
+                        print(url)
+                        crawl(url, article)
+                    except Exception as error:
+                        print(error)
+# mofcom.gov.cn
+categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
+for category in categories:
+    i = 1
+    while i > -1:
+        if i == 1:
+            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
+        else:
+            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
+        i = i + 1
+        req = urllib.request.urlopen(URL)
+        text = req.read()
+        html_text = text.decode("utf-8")
+        page = etree.HTML(html_text)
+        articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
+        for article in articlelist:
+            if isinstance(article, etree._Element):
+                subelement = etree.tostring(article).decode()
+                subpage = etree.HTML(subelement)
+                date = subpage.xpath("//span/text()")[0]
+                parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
+                if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                    i = -1
+                else:
+                    urls = subpage.xpath("//a/@href")
+                    for url in urls:
+                        try:
+                            article = {}
+                            if '/article/zcjd' in url:
+                                url = "http://www.mofcom.gov.cn" + url
+                                article['category']= "Policy Interpretation"
+                            else:
+                                article['category']= "Policy Release"
+                            crawl(url, article)
+                        except Exception as error:
+                            print(error)
+# ndrc.gov.cn
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
+    else:
+        CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//span/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        if "www.gov.cn" in url:
+                            article['category']= "Policy Release"
+                        elif "../../zcfb/" in url:
+                            url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
+                            article['category']= "Policy Release"
+                        else:
+                            url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                            url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
+                            article['category']= "Policy Interpretation"
+                        crawl(url, article)
+                    except Exception as error:
+                        print(error)
+# safe.gov.cn
+i = 1
+while i > -1:
+    if i == 1:
+        CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
+    else:
+        CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//dd/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = "https://www.safe.gov.cn" + url
+                        article['category']= "Policy Interpretation"
+                        crawl(url, article)
+                    except Exception as error:
+                        print(error)
+i = 1
+while i > -1:
+    if i == 1:
+        CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
+    else:
+        CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = subpage.xpath("//dd/text()")[0]
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = "https://www.safe.gov.cn" + url
+                        article['category']= "Data Interpretation"
+                        crawl(url, article)
+                    except Exception as error:
+                        print(error)
+# stats.gov.hk
+i = 0
+while i > -1:
+    if i == 0:
+        CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
+    else:
+        CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
+    i = i + 1
+    req = urllib.request.urlopen(CATEGORY_URL)
+    text = req.read()
+    html_text = text.decode("utf-8")
+    page = etree.HTML(html_text)
+    articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
+    for article in articlelist:
+        if isinstance(article, etree._Element):
+            subelement = etree.tostring(article).decode()
+            subpage = etree.HTML(subelement)
+            date = encode(subpage.xpath("//span"))
+            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
+            if  parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
+                i = -1
+            else:
+                urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
+                for url in urls:
+                    try:
+                        article = {}
+                        url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
+                        article['category']= "Data Interpretation"
+                        crawl(url, article)
+                    except Exception as error:
+                      print(error)