gavinzli's picture
chore: Add script descriptions and improve code readability
39fe3d1
raw
history blame
22.8 kB
"""
This script is responsible for collecting data from various websites related to financial and policy information in China.
It fetches data from different sources, extracts relevant information, translates it, and updates the content accordingly.
The collected data includes policy interpretations, financial news, macroeconomic research, and more.
"""
import json
import os
import time
import urllib.request
import uuid
from datetime import datetime, timedelta
from urllib.parse import urlparse
from lxml import etree
from utils import (crawl, datemodifier, encode, encode_content,
extract_from_pdf, extract_reference, fetch_url,
sentiment_computation, translate, update_content)
with open('xpath.json', 'r', encoding='UTF-8') as f:
xpath_dict = json.load(f)
DELTA = int(os.environ.get('DELTA') or '1')
print(f"DELTA = {DELTA}")
print("cbirc.gov.cn")
i = 1
while i > -1:
CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
i = i + 1
content = fetch_url(CATEGORY_URL)
reportinfo = json.loads(content)
for article in reportinfo['data']['rows']:
try:
article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
article['contentCN'] = repr(contentCN)[1:-1].strip()
if len(contentCN) < 10:
continue
CONTENT_ENG = ''
for element in article['contentCN'].split("\n"):
CONTENT_ENG += translate(element) + '\n'
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
article['site'] = "National Financial Regulatory Administration of China"
article['originSite'] = "国家金融监督管理总局"
article['titleCN'] = article['docSubtitle']
article['title'] = translate(article['docSubtitle'])
article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
article['category']= "Policy Interpretation"
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
article['attachment'] = ''
article['author'] = ''
article['subtitle'] = translate(summary)
update_content(article)
except Exception as error:
print(error)
print("csrc.gov.cn")
i = 1
while i > -1:
try:
if i == 1:
CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
else:
CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = encode(subpage.xpath("//span[@class='date']"))
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a/@href")
for url in urls:
try:
article = {}
url = "http://www.csrc.gov.cn" + url
article['category']= "Policy Interpretation"
crawl(url, article)
except Exception as error:
print(error)
except Exception as error:
i = -1
print(error)
i = 1
while i > -1:
CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
i = i + 1
try:
content = fetch_url(CATEGORY_URL)
reportinfo = json.loads(content)
for article in reportinfo['data']['results']:
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
article['category']= "Financial News"
article['site'] = "Securities Regulatory Commission of China"
article['originSite'] = "证监会"
article['titleCN'] = article['title']
article['title'] = translate(article['titleCN'])
article['author'] = ''
article['contentCN'] = repr(article['content'])[1:-1].strip()
if len(article['contentCN']) < 10:
continue
CONTENT_ENG = ''
for element in article['contentCN'].split("。"):
CONTENT_ENG += translate(element) + ' '
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
article['subtitle'] = article['memo']
article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
article['link'] = article['url']
article['attachment'] = ""
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
update_content(article)
except Exception as error:
print(error)
print("data.eastmoney.com")
def crawl_eastmoney(url, article):
"""
Crawls the given URL and extracts information from the webpage.
Args:
url (str): The URL of the webpage to crawl.
article (dict): A dictionary to store the extracted information.
Returns:
None: If the length of the extracted content is less than 10 characters.
Raises:
None.
"""
domain = urlparse(url).netloc
req = urllib.request.urlopen(url)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
article['link'] = url
if article['orgSName'] == "''":
article['site'] = translate(article['orgSName'])
else:
article['site'] = translate(article['orgName'])
article['titleCN'] = article['title']
article['title'] = translate(article['title'])
article['author'] = translate(article['researcher'])
article['originAuthor'] = article['researcher']
article['contentCN'] = repr(contentCN)[1:-1].strip()
article['subtitle'] = translate(summary)
article['category'] = "Macroeconomic Research"
if len(article['contentCN']) < 10:
return None
CONTENT_ENG = ''
for element in contentCN.split("\n"):
CONTENT_ENG += translate(element) + '\n'
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
extract_reference(article)
update_content(article)
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
i = 0
while i > -1:
URL = "https://reportapi.eastmoney.com/report/jg"
params = {
"cb": "datatable8544623",
"pageSize": "100",
"beginTime": beginDate,
"endTime": today,
"pageNo": i,
"qType": "3",
}
URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
content = fetch_url(URL)
if content:
start_index = content.find("(")
if start_index != -1:
result = content[start_index + 1: -1]
else:
result = content
reportinfo = json.loads(result)
if reportinfo["size"] > 0:
i = i + 1
for article in reportinfo['data']:
try:
url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
crawl_eastmoney(url,article)
except Exception as error:
print(error)
else:
i = -1
else:
print("Failed to fetch URL:", url)
print("gov.cn")
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
else:
CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = subpage.xpath("//span/text()")[0]
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
url = url.replace('../', 'https://www.gov.cn/zhengce/')
if "https://www.gov.cn" in url:
article['category']= "Policy Interpretation"
crawl(url, article)
except Exception as error:
print(error)
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
else:
CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = subpage.xpath("//span/text()")[0]
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
url = url.replace('../', 'https://www.gov.cn/zhengce/')
if "https://www.gov.cn" in url:
article['site'] = "State Council of China"
crawl(url, article)
except Exception as error:
print(error)
print("mof.gov.cn")
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
else:
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = subpage.xpath("//span/text()")[0]
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
article['category']= "Financial News"
crawl(url, article)
except Exception as error:
print(error)
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
else:
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = subpage.xpath("//span/text()")[0]
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
url = url.replace("./", CATEGORY_URL)
article['category']= "Policy Interpretation"
print(url)
crawl(url, article)
except Exception as error:
print(error)
print("mofcom.gov.cn")
categories = ['jdzhsw','jdgnmy','jddwmy','jdtzhz']
for category in categories:
i = 1
while i > -1:
if i == 1:
URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
else:
URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
i = i + 1
req = urllib.request.urlopen(URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = subpage.xpath("//span/text()")[0]
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d %H:%M:%S")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a/@href")
for url in urls:
try:
article = {}
if '/article/zcjd' in url:
url = "http://www.mofcom.gov.cn" + url
article['category']= "Policy Interpretation"
else:
article['category']= "Policy Release"
crawl(url, article)
except Exception as error:
print(error)
print("ndrc.gov.cn")
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
else:
CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = subpage.xpath("//span/text()")[0]
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y/%m/%d")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
if "www.gov.cn" in url:
article['category']= "Policy Release"
elif "../../zcfb/" in url:
url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
article['category']= "Policy Release"
else:
url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
article['category']= "Policy Interpretation"
crawl(url, article)
except Exception as error:
print(error)
print("safe.gov.cn")
i = 1
while i > -1:
if i == 1:
CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
else:
CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = subpage.xpath("//dd/text()")[0]
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a/@href")
for url in urls:
try:
article = {}
url = "https://www.safe.gov.cn" + url
article['category']= "Policy Interpretation"
crawl(url, article)
except Exception as error:
print(error)
i = 1
while i > -1:
if i == 1:
CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
else:
CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = subpage.xpath("//dd/text()")[0]
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a/@href")
for url in urls:
try:
article = {}
url = "https://www.safe.gov.cn" + url
article['category']= "Data Interpretation"
crawl(url, article)
except Exception as error:
print(error)
print("stats.gov.hk")
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
else:
CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
date = encode(subpage.xpath("//span"))
parsed_datetime = datetime.strptime(time.strftime("%Y-%m-%d", time.strptime(date,"%Y-%m-%d")), "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
for url in urls:
try:
article = {}
url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
article['category']= "Data Interpretation"
crawl(url, article)
except Exception as error:
print(error)