"""
This script fetches data from the China Banking and Insurance Regulatory Commission (CBIRC) website and extracts relevant information from the fetched data.
The extracted information is then processed and stored in a database.
The script performs the following steps:
1. Fetches data from the CBIRC website by making HTTP requests.
2. Parses the fetched data and extracts relevant information.
3. Translates the extracted information to English.
4. Computes sentiment scores for the translated content.
5. Stores the processed information in a database.
Note: The script also includes commented code for fetching data from the State Taxation Administration of China website, but it is currently disabled.
"""
import json
import uuid
import time
from datetime import datetime, timedelta
from utils import translate, sentiment_computation, upsert_content, fetch_url, extract_from_pdf
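# Expected behaviour of the utils helpers, inferred from their call sites
# below (the actual implementations live in utils.py):
#   fetch_url(url) -> str                       raw response body
#   extract_from_pdf(url) -> (text, summary)    full text plus a short summary
#   translate(text_cn) -> str                   Chinese-to-English translation
#   sentiment_computation(text) -> (score, label)
#   upsert_content(article: dict) -> None       insert-or-update in the database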
i = 1
while i > -1:
    # Paginated JSON listing of policy-interpretation documents (item 917).
    CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
    i += 1
    print(CATEGORY_URL)
    content = fetch_url(CATEGORY_URL)
    reportinfo = json.loads(content)
    for article in reportinfo['data']['rows']:
        try:
            # Normalise the publish date to YYYY-MM-DD.
            article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'], "%Y-%m-%d %H:%M:%S"))
            parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
            if parsed_datetime < (datetime.today() - timedelta(days=183)):
                # Older than ~6 months: flag the outer loop to stop paginating.
                i = -1
            else:
                contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
                # repr()[1:-1] escapes control characters so the text stores cleanly.
                article['contentCN'] = repr(contentCN)[1:-1].strip()
                if len(contentCN) < 10:
                    continue  # skip near-empty PDF extractions
                CONTENT_ENG = ''
                # Split the raw (unescaped) text so "\n" actually matches line breaks.
                for element in contentCN.split("\n"):
                    CONTENT_ENG += translate(element) + '\n'
                article['content'] = repr(CONTENT_ENG)[1:-1].strip()
                article['site'] = "National Financial Regulatory Administration of China"
                article['originSite'] = "国家金融监督管理总局"
                article['titleCN'] = article['docSubtitle']
                article['title'] = translate(article['docSubtitle'])
                article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
                article['category'] = "Policy Interpretation"
                # Deterministic ID: the same title + date always maps to the
                # same uuid5, so re-runs update rather than duplicate rows.
                article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
                article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
                article['attachment'] = ''
                article['author'] = ''
                article['subtitle'] = translate(summary)
                upsert_content(article)
        except Exception as error:
            print(error)
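
# The loop above relies on deterministic IDs for idempotent writes: an
# unchanged article regenerates the same uuid5 on every run, so upsert_content
# can update in place instead of inserting a duplicate row. A minimal
# illustrative sketch of that pattern (not the real upsert, which lives in
# utils):
#
#     store = {}
#     def upsert_sketch(article):
#         key = str(uuid.uuid5(uuid.NAMESPACE_OID,
#                              article['titleCN'] + article['publishDate']))
#         store[key] = article  # same key on a re-run -> overwrite, not append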
# ssl._create_default_https_context = ssl._create_stdlib_context
# i = 0
# while i > -1:
#     CATEGORY_URL = f"https://www.chinatax.gov.cn/search5/search/s?siteCode=bm29000002&searchWord=&type=1&xxgkResolveType=%E6%96%87%E5%AD%97&pageNum={i}&pageSize=10&cwrqStart=&cwrqEnd=&column=%E6%94%BF%E7%AD%96%E8%A7%A3%E8%AF%BB&likeDoc=0&wordPlace=0&videoreSolveType="
#     i = i + 1
#     urllib3.disable_warnings()
#     try:
#         req = urllib.request.urlopen(CATEGORY_URL, timeout=30)
#     except:
#         break
#     content = req.read().decode("utf-8")
#     reportinfo = json.loads(content)
#     for article in reportinfo['searchResultAll']['searchTotal']:
#         try:
#             parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
#             if parsed_datetime < (datetime.today() - timedelta(days=183)):
#                 i = -1
#             else:
#                 article['originalContent'] = article['content'].replace('\\u', '')
#                 if len(article['originalContent']) < 10:
#                     continue
#                 CONTENT_ENG = ''
#                 for element in article['originalContent'].split("。"):
#                     CONTENT_ENG += translate(element) + ' '
#                 article['content'] = CONTENT_ENG
#                 article['site'] = "State Taxation Administration of China"
#                 article['originalSite'] = "国家税务总局"
#                 article['originalTitle'] = article['title']
#                 article['title'] = translate(article['originalTitle'])
#                 article['url'] = article['snapshotUrl']
#                 article['category'] = "Policy Interpretation"
#                 article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['cwrq'], "%Y-%m-%d %H:%M:%S"))
#                 article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
#                 article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
#                 upsert_content(article)
#         except Exception as error:
#             print(error)
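
# To re-enable the disabled State Taxation Administration block above, add the
# imports it depends on (ssl, urllib.request, urllib3) at the top of the file.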