"""
This script fetches data from the China Banking and Insurance Regulatory Commission (CBIRC) website and extracts relevant information from the fetched data.
The extracted information is then processed and stored in a database.

The script performs the following steps:
1. Fetches data from the CBIRC website by making HTTP requests.
2. Parses the fetched data and extracts relevant information.
3. Translates the extracted information to English.
4. Computes sentiment scores for the translated content.
5. Stores the processed information in a database.

Note: The script also includes commented code for fetching data from the State Taxation Administration of China website, but it is currently disabled.
"""
import json
import uuid
import time
from datetime import datetime, timedelta
from utils import translate, sentiment_computation, upsert_content, fetch_url, extract_from_pdf
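# The helpers imported from `utils` live outside this file. The signatures
# below are assumptions inferred from the call sites in this script, noted
# here for reference only:
#   fetch_url(url: str) -> str                             # raw response body
#   extract_from_pdf(url: str) -> tuple[str, str]          # (full text, summary)
#   translate(text: str) -> str                            # Chinese -> English
#   sentiment_computation(text: str) -> tuple[float, str]  # (score, label)
#   upsert_content(record: dict) -> None                   # insert/update in DB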

i = 1
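# The listing endpoint returns JSON. The shape assumed by the loop below is
# inferred from the fields it accesses, not from official documentation:
# {
#     "data": {
#         "rows": [
#             {
#                 "publishDate": "YYYY-MM-DD HH:MM:SS",
#                 "pdfFileUrl": "/<path to PDF>",
#                 "docSubtitle": "<Chinese title>",
#                 ...
#             }
#         ]
#     }
# }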
while i > -1:
    # Page through the "Policy Interpretation" listing (itemId=917); `i` is
    # set to -1 below once an article older than six months is seen, which
    # ends the loop after the current page finishes.
    CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
    i = i + 1
    print(CATEGORY_URL)
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
    for article in reportinfo['data']['rows']:
        try:
            # Normalize the publish date from "YYYY-MM-DD HH:MM:SS" to "YYYY-MM-DD".
            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"))
            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Older than six months: stop paging.
                i = -1
            else:
                contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
                # Skip articles whose PDF yielded no usable text.
                if len(contentCN) < 10:
                    continue
                # Store an escaped copy (repr() without the surrounding quotes)
                # so newlines survive as literal "\n" sequences.
                article['contentCN'] = repr(contentCN)[1:-1].strip()
                # Translate line by line. Split the raw text, not the escaped
                # copy above, which no longer contains literal newlines.
                CONTENT_ENG = ''
                for element in contentCN.split("\n"):
                    CONTENT_ENG += translate(element) + '\n'
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['site'] = "National Financial Regulatory Administration of China"
                article['originSite'] = "国家金融监督管理总局"
                article['titleCN'] = article['docSubtitle']
                article['title'] = translate(article['docSubtitle'])
                article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
                article['category'] = "Policy Interpretation"
                # Deterministic ID: the same title + date always maps to the
                # same UUID, so re-runs update rather than duplicate rows.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                article['attachment'] = ''
                article['author'] = ''
                article['subtitle'] = translate(summary)
                upsert_content(article)
        except Exception as error:
            print(error)


# Disabled: fetcher for the State Taxation Administration of China website.
# Re-enabling this block additionally requires `import ssl`,
# `import urllib.request`, and `import urllib3`.
# ssl._create_default_https_context = ssl._create_stdlib_context
# i = 0
# while i > -1:
#     CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
#     i = i + 1
#     urllib3.disable_warnings()
#     try:
#         req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
#     except Exception:
#         break
#     content = req.read().decode("utf-8")
#     reportinfo = json.loads(content)
#     for article in reportinfo['searchResultAll']['searchTotal']:
#         try:
#             # `cwrq` is the publish timestamp; stop paging once articles
#             # are older than six months.
#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
#                 i = -1
#             else:
#                 # Strip stray unicode-escape markers left in the snippet text.
#                 article['originalContent'] = article['content'].replace('\\u', '')
#                 if len(article['originalContent']) < 10:
#                     continue
#                 # Translate sentence by sentence, splitting on the Chinese full stop.
#                 CONTENT_ENG = ''
#                 for element in article['originalContent'].split("。"):
#                     CONTENT_ENG += translate(element) + ' '
#                 article['content'] = CONTENT_ENG
#                 article['site'] = "State Taxation Administration of China"
#                 article['originalSite'] = "国家税务总局"
#                 article['originalTitle'] = article['title']
#                 article['title'] = translate(article['originalTitle'])
#                 article['url'] = article['snapshotUrl']
#                 article['category'] = "Policy Interpretation"
#                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S"))
#                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
#                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
#                 upsert_content(article)
#         except Exception as error:
#             print(error)