# Spaces: Oxbridge-Economics / Data-Collection-China (Running)
#
# File size: 22,778 Bytes

"""
This script is responsible for collecting data from various websites related to financial and policy information in China.
It fetches data from different sources, extracts relevant information, translates it, and updates the content accordingly.
The collected data includes policy interpretations, financial news, macroeconomic research, and more.
"""
import json
import os
import time
import urllib.request
import uuid
from datetime import datetime, timedelta
from urllib.parse import urlparse

from lxml import etree

from utils import (crawl, datemodifier, encode, encode_content,
                   extract_from_pdf, extract_reference, fetch_url,
                   sentiment_computation, translate, update_content)

# Per-domain scraping rules; looked up by the URL's network location
# (e.g. xpath_dict[domain]['content']).
with open('xpath.json', mode='r', encoding='UTF-8') as f:
    xpath_dict = json.loads(f.read())

# Look-back window in days, taken from the DELTA environment variable;
# falls back to 1 when the variable is unset or empty.
_delta_setting = os.getenv('DELTA')
DELTA = int(_delta_setting if _delta_setting else '1')
print(f"DELTA = {DELTA}")

print("cbirc.gov.cn")
# Page through the JSON document listing (18 rows per page) and stop once a
# row older than DELTA days has been seen.
i = 1
while i > -1:
    CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
    i = i + 1
    try:
        content = fetch_url(CATEGORY_URL)
        reportinfo = json.loads(content)
        rows = reportinfo['data']['rows']
    except Exception as error:
        # A failed fetch or malformed payload ends this site's crawl instead
        # of aborting the whole script (the remaining sites still run).
        print(error)
        i = -1
        break
    if not rows:
        # An empty page never triggers the date cut-off below, so without
        # this guard the loop would keep requesting further pages.
        i = -1
        break
    for article in rows:
        try:
            # Normalise "YYYY-mm-dd HH:MM:SS" to the date-only string that is stored.
            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                # Too old: finish this page, then stop paging.
                i = -1
                continue
            contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
            article['contentCN'] = repr(contentCN)[1:-1].strip()
            if len(contentCN) < 10:
                continue
            # NOTE(review): article['contentCN'] has been through repr(), so
            # real line breaks are escaped and this split most likely yields a
            # single element; crawl_eastmoney splits the raw text instead.
            # Confirm which behaviour is intended before changing it.
            CONTENT_ENG = ''
            for element in article['contentCN'].split("\n"):
                CONTENT_ENG += translate(element) + '\n'
            article['content'] = repr(CONTENT_ENG)[1:-1].strip()
            article['site'] = "National Financial Regulatory Administration of China"
            article['originSite'] = "国家金融监督管理总局"
            article['titleCN'] = article['docSubtitle']
            article['title'] = translate(article['docSubtitle'])
            article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
            article['category']= "Policy Interpretation"
            # Deterministic id derived from the Chinese title and publish date.
            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
            article['attachment'] = ''
            article['author'] = ''
            article['subtitle'] = translate(summary)
            update_content(article)
        except Exception as error:
            # Best effort per article: log and move on to the next row.
            print(error)

print("csrc.gov.cn")
# Crawl the paginated HTML listing; every linked article is handed to crawl()
# with the "Policy Interpretation" category. Paging stops at the first entry
# older than DELTA days or on any listing-level error.
i = 1
while i > -1:
    try:
        if i == 1:
            CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
        else:
            CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
        i = i + 1
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(CATEGORY_URL) as req:
            text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
        if not articlelist:
            # Nothing listed on this page: stop instead of probing further pages.
            i = -1
            break
        cutoff = datetime.today() - timedelta(days=DELTA)
        for item in articlelist:
            if not isinstance(item, etree._Element):
                continue
            # Re-parse the <li> on its own so the "//" XPaths below only see this entry.
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            date = encode(subpage.xpath("//span[@class='date']"))
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
            if parsed_datetime < cutoff:
                # Too old: finish this page, then stop paging.
                i = -1
                continue
            urls = subpage.xpath("//a/@href")
            for url in urls:
                try:
                    article = {}
                    url = "http://www.csrc.gov.cn" + url
                    article['category']= "Policy Interpretation"
                    crawl(url, article)
                except Exception as error:
                    # Best effort per article: log and continue with the next link.
                    print(error)
    except Exception as error:
        # Any listing-level failure (network, decoding, date parsing) ends this crawl.
        i = -1
        print(error)

# Crawl the CSRC search API (JSON, 18 results per page) for financial news.
i = 1
while i > -1:
    CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
    i = i + 1
    try:
        content = fetch_url(CATEGORY_URL)
        reportinfo = json.loads(content)
        results = reportinfo['data']['results']
    except Exception as error:
        # Previously a persistent fetch/parse failure only printed the error
        # and moved on to the next page number, so the loop never ended.
        print(error)
        i = -1
        break
    if not results:
        # An empty page never reaches the date cut-off below; stop explicitly.
        i = -1
        break
    for article in results:
        # Errors are handled per article so one bad record does not drop the
        # rest of the page.
        try:
            parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
                # Too old: finish this page, then stop paging.
                i = -1
                continue
            article['category']= "Financial News"
            article['site'] = "Securities Regulatory Commission of China"
            article['originSite'] = "证监会"
            article['titleCN'] = article['title']
            article['title'] = translate(article['titleCN'])
            article['author'] = ''
            article['contentCN'] = repr(article['content'])[1:-1].strip()
            if len(article['contentCN']) < 10:
                continue
            # Translate sentence by sentence, splitting on the Chinese full stop.
            CONTENT_ENG = ''
            for element in article['contentCN'].split("。"):
                CONTENT_ENG += translate(element) + ' '
            article['content'] = repr(CONTENT_ENG)[1:-1].strip()
            article['subtitle'] = article['memo']
            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
            article['link'] = article['url']
            article['attachment'] = ""
            article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
            # Deterministic id derived from the Chinese title and publish date.
            article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
            update_content(article)
        except Exception as error:
            print(error)

print("data.eastmoney.com")
def crawl_eastmoney(url, article):
    """
    Fetch one Eastmoney report page, enrich ``article`` in place and store it.

    The page at ``url`` is downloaded and parsed with the XPath rules that
    ``xpath_dict`` holds for the URL's domain. The Chinese content is
    translated line by line, sentiment is computed on the Chinese text, and
    the finished record is passed to ``extract_reference`` and
    ``update_content``.

    Args:
        url (str): The URL of the report page to crawl.
        article (dict): Listing record for the report. The keys ``orgSName``,
            ``orgName``, ``title``, ``researcher`` and ``publishDate`` are
            read; the extracted and translated fields are written back into
            the same dict.

    Returns:
        None. The function returns early, without storing anything, when the
        extracted content is shorter than 10 characters.

    Raises:
        KeyError: If ``article`` lacks one of the keys listed above or
            ``xpath_dict`` has no entry for the URL's domain.
        Exception: Errors from the download, decoding or the ``utils``
            helpers are not caught here; the caller is expected to handle them.
    """
    domain = urlparse(url).netloc
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as req:
        text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['contentCN'] = repr(contentCN)[1:-1].strip()
    # Bail out before any translation call when there is nothing worth storing.
    if len(article['contentCN']) < 10:
        return None
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url
    # NOTE(review): this compares against the two-character string "''" and
    # then translates that same value; the branches look swapped or the
    # sentinel looks wrong. Behaviour is kept as-is pending confirmation.
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgSName'])
    else:
        article['site'] = translate(article['orgName'])
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"
    # Translate the raw text line by line and join the results.
    CONTENT_ENG = ''.join(translate(element) + '\n' for element in contentCN.split("\n"))
    article['content'] = repr(CONTENT_ENG)[1:-1].strip()
    # Deterministic ids: author id from the translated author name, article
    # id from the Chinese title plus the normalised publish date.
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
    # Sentiment is computed on the Chinese text with line breaks removed.
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
    extract_reference(article)
    update_content(article)

# Query the Eastmoney report API for the last DELTA days, 100 records per
# page, and crawl every report returned.
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
i = 0
while i > -1:
    URL = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    content = fetch_url(URL)
    if not content:
        # The old message printed `url`, which is undefined (or stale) at this
        # point, and the loop then retried the same page forever.
        print("Failed to fetch URL:", URL)
        i = -1
        break
    # The response is wrapped as callback(...); keep only what sits between
    # the first "(" and the final character.
    start_index = content.find("(")
    if start_index != -1:
        result = content[start_index + 1: -1]
    else:
        result = content
    try:
        reportinfo = json.loads(result)
        has_reports = reportinfo["size"] > 0
    except (ValueError, KeyError, TypeError) as error:
        # Unparseable or unexpected payload: stop instead of crashing the script.
        print(error)
        i = -1
        break
    if not has_reports:
        # No more reports in the requested window.
        i = -1
        break
    i = i + 1
    for article in reportinfo['data']:
        try:
            url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
            crawl_eastmoney(url,article)
        except Exception as error:
            # Best effort per report: log and continue with the next one.
            print(error)

print("gov.cn")
# Crawl the policy-interpretation listing page by page until an entry older
# than DELTA days is found.
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
    i = i + 1
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(CATEGORY_URL) as req:
            text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    except Exception as error:
        # A missing page or network error ends this listing instead of
        # aborting the whole script (the remaining sites still run).
        print(error)
        i = -1
        break
    if not articlelist:
        # Nothing listed on this page: stop instead of probing further pages.
        i = -1
        break
    cutoff = datetime.today() - timedelta(days=DELTA)
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        # Re-parse the entry on its own so the "//" XPaths below only see it.
        subelement = etree.tostring(item).decode()
        subpage = etree.HTML(subelement)
        try:
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
        except (IndexError, ValueError) as error:
            # Entry without a parseable date: skip it rather than crash.
            print(error)
            continue
        if parsed_datetime < cutoff:
            # Too old: finish this page, then stop paging.
            i = -1
            continue
        urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
        for url in urls:
            try:
                article = {}
                url = url.replace('../', 'https://www.gov.cn/zhengce/')
                # Only links that resolve to www.gov.cn are crawled.
                if "https://www.gov.cn" in url:
                    article['category']= "Policy Interpretation"
                    crawl(url, article)
            except Exception as error:
                # Best effort per article: log and continue with the next link.
                print(error)

# Crawl the gov.cn "zuixin" policy listing page by page until an entry older
# than DELTA days is found.
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
    else:
        CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
    i = i + 1
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(CATEGORY_URL) as req:
            text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
    except Exception as error:
        # A missing page or network error ends this listing instead of
        # aborting the whole script (the remaining sites still run).
        print(error)
        i = -1
        break
    if not articlelist:
        # Nothing listed on this page: stop instead of probing further pages.
        i = -1
        break
    cutoff = datetime.today() - timedelta(days=DELTA)
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        # Re-parse the entry on its own so the "//" XPaths below only see it.
        subelement = etree.tostring(item).decode()
        subpage = etree.HTML(subelement)
        try:
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
        except (IndexError, ValueError) as error:
            # Entry without a parseable date: skip it rather than crash.
            print(error)
            continue
        if parsed_datetime < cutoff:
            # Too old: finish this page, then stop paging.
            i = -1
            continue
        urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
        for url in urls:
            try:
                article = {}
                url = url.replace('../', 'https://www.gov.cn/zhengce/')
                # Only links that resolve to www.gov.cn are crawled.
                if "https://www.gov.cn" in url:
                    # Unlike the other listings, only the site name is preset
                    # here; no category is assigned before crawl().
                    article['site'] = "State Council of China"
                    crawl(url, article)
            except Exception as error:
                # Best effort per article: log and continue with the next link.
                print(error)

print("mof.gov.cn")
# Crawl the Ministry of Finance news listing page by page until an entry
# older than DELTA days is found.
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
    else:
        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
    i = i + 1
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(CATEGORY_URL) as req:
            text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    except Exception as error:
        # A missing page or network error ends this listing instead of
        # aborting the whole script (the remaining sites still run).
        print(error)
        i = -1
        break
    if not articlelist:
        # Nothing listed on this page: stop instead of probing further pages.
        i = -1
        break
    cutoff = datetime.today() - timedelta(days=DELTA)
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        # Re-parse the entry on its own so the "//" XPaths below only see it.
        subelement = etree.tostring(item).decode()
        subpage = etree.HTML(subelement)
        try:
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
        except (IndexError, ValueError) as error:
            # Entry without a parseable date: skip it rather than crash.
            print(error)
            continue
        if parsed_datetime < cutoff:
            # Too old: finish this page, then stop paging.
            i = -1
            continue
        urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
        for url in urls:
            try:
                article = {}
                # Resolve parent-relative links first, then same-directory ones.
                url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
                article['category']= "Financial News"
                crawl(url, article)
            except Exception as error:
                # Best effort per article: log and continue with the next link.
                print(error)

# Crawl the Ministry of Finance policy-interpretation listing page by page
# until an entry older than DELTA days is found.
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
    else:
        CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
    i = i + 1
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(CATEGORY_URL) as req:
            text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
    except Exception as error:
        # A missing page or network error ends this listing instead of
        # aborting the whole script (the remaining sites still run).
        print(error)
        i = -1
        break
    if not articlelist:
        # Nothing listed on this page: stop instead of probing further pages.
        i = -1
        break
    cutoff = datetime.today() - timedelta(days=DELTA)
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        # Re-parse the entry on its own so the "//" XPaths below only see it.
        subelement = etree.tostring(item).decode()
        subpage = etree.HTML(subelement)
        try:
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
        except (IndexError, ValueError) as error:
            # Entry without a parseable date: skip it rather than crash.
            print(error)
            continue
        if parsed_datetime < cutoff:
            # Too old: finish this page, then stop paging.
            i = -1
            continue
        urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
        for url in urls:
            try:
                article = {}
                # Relative links are resolved against the listing directory.
                # CATEGORY_URL cannot be the base: from the second page on it
                # ends in "index_<n>.htm" and would yield broken URLs.
                url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
                url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/")
                article['category']= "Policy Interpretation"
                print(url)
                crawl(url, article)
            except Exception as error:
                # Best effort per article: log and continue with the next link.
                print(error)

print("mofcom.gov.cn")
# Crawl each Ministry of Commerce interpretation category page by page until
# an entry older than DELTA days is found.
categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
for category in categories:
    i = 1
    while i > -1:
        if i == 1:
            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
        else:
            URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
        i = i + 1
        try:
            # The context manager closes the HTTP response even if read() fails.
            with urllib.request.urlopen(URL) as req:
                text = req.read()
            html_text = text.decode("utf-8")
            page = etree.HTML(html_text)
            articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
        except Exception as error:
            # A missing page or network error ends this category instead of
            # aborting the whole script; the next category still runs.
            print(error)
            i = -1
            break
        if not articlelist:
            # Nothing listed on this page: stop instead of probing further pages.
            i = -1
            break
        cutoff = datetime.today() - timedelta(days=DELTA)
        for item in articlelist:
            if not isinstance(item, etree._Element):
                continue
            # Re-parse the entry on its own so the "//" XPaths below only see it.
            subelement = etree.tostring(item).decode()
            subpage = etree.HTML(subelement)
            try:
                date = subpage.xpath("//span/text()")[0]
                # Listing dates carry a time of day; compare on the date only.
                date_only = time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S"))
                parsed_datetime = datetime.strptime(date_only, "%Y-%m-%d")
            except (IndexError, ValueError) as error:
                # Entry without a parseable date: skip it rather than crash.
                print(error)
                continue
            if parsed_datetime < cutoff:
                # Too old: finish this page, then stop paging.
                i = -1
                continue
            urls = subpage.xpath("//a/@href")
            for url in urls:
                try:
                    article = {}
                    if '/article/zcjd' in url:
                        # Only prefix the host for relative links; an already
                        # absolute link would otherwise get the host twice.
                        if not url.startswith("http"):
                            url = "http://www.mofcom.gov.cn" + url
                        article['category']= "Policy Interpretation"
                    else:
                        article['category']= "Policy Release"
                    crawl(url, article)
                except Exception as error:
                    # Best effort per article: log and continue with the next link.
                    print(error)

print("ndrc.gov.cn")
# Crawl the NDRC interpretation listing page by page until an entry older
# than DELTA days is found.
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
    else:
        CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
    i = i + 1
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(CATEGORY_URL) as req:
            text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
    except Exception as error:
        # A missing page or network error ends this listing instead of
        # aborting the whole script (the remaining sites still run).
        print(error)
        i = -1
        break
    if not articlelist:
        # Nothing listed on this page: stop instead of probing further pages.
        i = -1
        break
    cutoff = datetime.today() - timedelta(days=DELTA)
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        # Re-parse the entry on its own so the "//" XPaths below only see it.
        subelement = etree.tostring(item).decode()
        subpage = etree.HTML(subelement)
        try:
            date = subpage.xpath("//span/text()")[0]
            # This listing writes dates with slashes.
            parsed_datetime = datetime.strptime(date, "%Y/%m/%d")
        except (IndexError, ValueError) as error:
            # Entry without a parseable date: skip it rather than crash.
            print(error)
            continue
        if parsed_datetime < cutoff:
            # Too old: finish this page, then stop paging.
            i = -1
            continue
        urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
        for url in urls:
            try:
                article = {}
                if "www.gov.cn" in url:
                    article['category']= "Policy Release"
                elif "../../zcfb/" in url:
                    url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
                    article['category']= "Policy Release"
                else:
                    # NOTE(review): "../../" is mapped onto the listing's own
                    # directory rather than two levels up; presumably this
                    # matches how the site writes its links. Confirm.
                    url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                    url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                    article['category']= "Policy Interpretation"
                crawl(url, article)
            except Exception as error:
                # Best effort per article: log and continue with the next link.
                print(error)

print("safe.gov.cn")
# Crawl the SAFE policy-interpretation listing page by page until an entry
# older than DELTA days is found.
i = 1
while i > -1:
    if i == 1:
        CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
    else:
        CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
    i = i + 1
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(CATEGORY_URL) as req:
            text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
    except Exception as error:
        # A missing page or network error ends this listing instead of
        # aborting the whole script (the remaining sites still run).
        print(error)
        i = -1
        break
    if not articlelist:
        # Nothing listed on this page: stop instead of probing further pages.
        i = -1
        break
    cutoff = datetime.today() - timedelta(days=DELTA)
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        # Re-parse the entry on its own so the "//" XPaths below only see it.
        subelement = etree.tostring(item).decode()
        subpage = etree.HTML(subelement)
        try:
            # On this site the date is read from the entry's <dd> element.
            date = subpage.xpath("//dd/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
        except (IndexError, ValueError) as error:
            # Entry without a parseable date: skip it rather than crash.
            print(error)
            continue
        if parsed_datetime < cutoff:
            # Too old: finish this page, then stop paging.
            i = -1
            continue
        urls = subpage.xpath("//a/@href")
        for url in urls:
            try:
                article = {}
                url = "https://www.safe.gov.cn" + url
                article['category']= "Policy Interpretation"
                crawl(url, article)
            except Exception as error:
                # Best effort per article: log and continue with the next link.
                print(error)

# Crawl the SAFE data-interpretation listing page by page until an entry
# older than DELTA days is found.
i = 1
while i > -1:
    if i == 1:
        CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
    else:
        CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
    i = i + 1
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(CATEGORY_URL) as req:
            text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
    except Exception as error:
        # A missing page or network error ends this listing instead of
        # aborting the whole script (the remaining sites still run).
        print(error)
        i = -1
        break
    if not articlelist:
        # Nothing listed on this page: stop instead of probing further pages.
        i = -1
        break
    cutoff = datetime.today() - timedelta(days=DELTA)
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        # Re-parse the entry on its own so the "//" XPaths below only see it.
        subelement = etree.tostring(item).decode()
        subpage = etree.HTML(subelement)
        try:
            # On this site the date is read from the entry's <dd> element.
            date = subpage.xpath("//dd/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
        except (IndexError, ValueError) as error:
            # Entry without a parseable date: skip it rather than crash.
            print(error)
            continue
        if parsed_datetime < cutoff:
            # Too old: finish this page, then stop paging.
            i = -1
            continue
        urls = subpage.xpath("//a/@href")
        for url in urls:
            try:
                article = {}
                url = "https://www.safe.gov.cn" + url
                article['category']= "Data Interpretation"
                crawl(url, article)
            except Exception as error:
                # Best effort per article: log and continue with the next link.
                print(error)

# The banner used to say "stats.gov.hk"; the site crawled below is stats.gov.cn.
print("stats.gov.cn")
# Crawl the statistics bureau's data-interpretation listing page by page
# until an entry older than DELTA days is found.
i = 0
while i > -1:
    if i == 0:
        CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
    else:
        CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
    i = i + 1
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(CATEGORY_URL) as req:
            text = req.read()
        html_text = text.decode("utf-8")
        page = etree.HTML(html_text)
        articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
    except Exception as error:
        # A missing page or network error ends this listing instead of
        # aborting the whole script.
        print(error)
        i = -1
        break
    if not articlelist:
        # Nothing listed on this page: stop instead of probing further pages.
        i = -1
        break
    cutoff = datetime.today() - timedelta(days=DELTA)
    for item in articlelist:
        if not isinstance(item, etree._Element):
            continue
        # Re-parse the entry on its own so the "//" XPaths below only see it.
        subelement = etree.tostring(item).decode()
        subpage = etree.HTML(subelement)
        try:
            date = encode(subpage.xpath("//span"))
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
        except Exception as error:
            # Entry without a parseable date: skip it rather than crash.
            print(error)
            continue
        if parsed_datetime < cutoff:
            # Too old: finish this page, then stop paging.
            i = -1
            continue
        urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
        for url in urls:
            try:
                article = {}
                url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
                article['category']= "Data Interpretation"
                crawl(url, article)
            except Exception as error:
                # Best effort per article: log and continue with the next link.
                print(error)