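"""Daily crawlers for Chinese regulator and ministry websites.

Each section below pages through one site's article list, stops paging once it
reaches articles older than DELTA days (the loops set i = -1 as a stop
sentinel), and hands every in-window article to the shared crawl /
update_content pipeline imported from utils.
"""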
import os
import json
import uuid
import time
import urllib.request
from lxml import etree
from datetime import datetime, timedelta
from urllib.parse import urlparse
from utils import (encode,
translate,
sentiment_computation,
fetch_url,
extract_from_pdf,
crawl,
datemodifier,
encode_content,
update_content,
extract_reference)
# Per-domain XPath selectors used by the crawlers below.
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
# Look-back window in days; pagination stops at articles older than this.
DELTA = int(os.environ.get('DELTA') or '1')
print(f"DELTA = {DELTA}")
print("cbirc.gov.cn")
i = 1
while i > -1:
CATEGORY_URL = f"https://www.cbirc.gov.cn/cn/static/data/DocInfo/SelectDocByItemIdAndChild/data_itemId=917,pageIndex={i},pageSize=18.json"
i = i + 1
content = fetch_url(CATEGORY_URL)
reportinfo = json.loads(content)
for article in reportinfo['data']['rows']:
try:
article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishDate'],"%Y-%m-%d %H:%M:%S"))
parsed_datetime = datetime.strptime(article['publishDate'], "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
contentCN, summary = extract_from_pdf("https://www.cbirc.gov.cn" + article['pdfFileUrl'])
article['contentCN'] = repr(contentCN)[1:-1].strip()
if len(contentCN) < 10:
continue
                CONTENT_ENG = ''
                # Split the raw text, not the repr-escaped copy, so "\n" still
                # matches real newlines.
                for element in contentCN.split("\n"):
                    CONTENT_ENG += translate(element) + '\n'
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
article['site'] = "National Financial Regulatory Administration of China"
article['originSite'] = "国家金融监督管理总局"
article['titleCN'] = article['docSubtitle']
article['title'] = translate(article['docSubtitle'])
article['link'] = "https://www.cbirc.gov.cn" + str(article['pdfFileUrl'])
                article['category'] = "Policy Interpretation"
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
article['attachment'] = ''
article['author'] = ''
article['subtitle'] = translate(summary)
update_content(article)
except Exception as error:
print(error)
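# --- csrc.gov.cn ---
# Policy interpretations from the CSRC's HTML list pages.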
print("csrc.gov.cn")
i = 1
while i > -1:
try:
if i == 1:
CATEGORY_URL = "http://www.csrc.gov.cn/csrc/c100039/common_list.shtml"
else:
CATEGORY_URL = f"http://www.csrc.gov.cn/csrc/c100039/common_list_{i}.shtml"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'main-right fr common-list')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
                date = encode(subpage.xpath("//span[@class='date']"))
                parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a/@href")
for url in urls:
try:
article = {}
url = "http://www.csrc.gov.cn" + url
                            article['category'] = "Policy Interpretation"
crawl(url, article)
except Exception as error:
print(error)
except Exception as error:
i = -1
print(error)
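# Financial news from the CSRC's JSON search endpoint (18 results per page).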
i = 1
while i > -1:
CATEGORY_URL = f"http://www.csrc.gov.cn/searchList/a1a078ee0bc54721ab6b148884c784a8?_isAgg=true&_isJson=true&_pageSize=18&_template=index&_rangeTimeGte=&_channelName=&page={i}"
i = i + 1
content = fetch_url(CATEGORY_URL)
reportinfo = json.loads(content)
for article in reportinfo['data']['results']:
try:
            parsed_datetime = datetime.strptime(article['publishedTimeStr'][:10], "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
                article['category'] = "Financial News"
article['site'] = "Securities Regulatory Commission of China"
article['originSite'] = "证监会"
article['titleCN'] = article['title']
article['title'] = translate(article['titleCN'])
article['author'] = ''
article['contentCN'] = repr(article['content'])[1:-1].strip()
if len(article['contentCN']) < 10:
continue
                CONTENT_ENG = ''
                # Translate sentence by sentence, splitting on the Chinese full stop.
                for element in article['contentCN'].split("。"):
                    CONTENT_ENG += translate(element) + ' '
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
article['subtitle'] = article['memo']
article['publishDate'] = time.strftime("%Y-%m-%d", time.strptime(article['publishedTimeStr'],"%Y-%m-%d %H:%M:%S"))
article['link'] = article['url']
article['attachment'] = ""
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(article['content'])
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
update_content(article)
except Exception as error:
print(error)
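# --- data.eastmoney.com ---
# Macroeconomic research reports from the Eastmoney report API (JSONP),
# with full text scraped from each report page.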
print("data.eastmoney.com")
def crawl_eastmoney(url, article):
    """Scrape one Eastmoney research-report page and publish it via update_content."""
    domain = urlparse(url).netloc
req = urllib.request.urlopen(url)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
article['link'] = url
    # orgSName comes through as the literal string "''" when the short name is
    # missing; fall back to the full organisation name in that case.
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
article['titleCN'] = article['title']
article['title'] = translate(article['title'])
article['author'] = translate(article['researcher'])
article['originAuthor'] = article['researcher']
article['contentCN'] = repr(contentCN)[1:-1].strip()
article['subtitle'] = translate(summary)
article['category'] = "Macroeconomic Research"
if len(article['contentCN']) < 10:
return None
CONTENT_ENG = ''
for element in contentCN.split("\n"):
CONTENT_ENG += translate(element) + '\n'
article['content'] = repr(CONTENT_ENG)[1:-1].strip()
article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
extract_reference(article)
update_content(article)
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=DELTA)).strftime('%Y-%m-%d')
i = 0
while i > -1:
URL = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",  # JSONP callback the API wraps its payload in
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",  # report category served by this feed
    }
URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
content = fetch_url(URL)
    if content:
        # The response is JSONP: callback(...payload...). Strip the wrapper
        # before parsing the JSON body.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: -1]
        else:
            result = content
        reportinfo = json.loads(result)
if reportinfo["size"] > 0:
i = i + 1
for article in reportinfo['data']:
try:
url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
crawl_eastmoney(url,article)
except Exception as error:
print(error)
else:
i = -1
else:
print("Failed to fetch URL:", url)
print("gov.cn")
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.gov.cn/zhengce/jiedu/home.htm"
else:
CATEGORY_URL = f"https://www.gov.cn/zhengce/jiedu/home_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
url = url.replace('../', 'https://www.gov.cn/zhengce/')
if "https://www.gov.cn" in url:
                            article['category'] = "Policy Interpretation"
crawl(url, article)
except Exception as error:
print(error)
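# Latest State Council policy releases (zhengce/zuixin), same paging pattern.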
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.gov.cn/zhengce/zuixin/home.htm"
else:
CATEGORY_URL = f"https://www.gov.cn/zhengce/zuixin/home_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'news_box')]//h4")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
url = url.replace('../', 'https://www.gov.cn/zhengce/')
if "https://www.gov.cn" in url:
article['site'] = "State Council of China"
crawl(url, article)
except Exception as error:
print(error)
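# --- mof.gov.cn ---
# Ministry of Finance financial news (caizhengxinwen).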
print("mof.gov.cn")
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/"
else:
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/index_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
url = url.replace("../", "https://www.mof.gov.cn/zhengwuxinxi/")
url = url.replace("./", "https://www.mof.gov.cn/zhengwuxinxi/caizhengxinwen/")
article['category']= "Financial News"
crawl(url, article)
except Exception as error:
print(error)
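# Ministry of Finance policy interpretations (zhengcejiedu).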
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/"
else:
CATEGORY_URL = f"https://www.mof.gov.cn/zhengwuxinxi/zhengcejiedu/index_{i}.htm"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'xwfb_listerji')]/ul/li[not(@class = 'clear')]")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
article = {}
url = url.replace("./", CATEGORY_URL)
article['category']= "Policy Interpretation"
print(url)
crawl(url, article)
except Exception as error:
print(error)
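# --- mofcom.gov.cn ---
# Ministry of Commerce policy interpretations, one listing per sub-category.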
print("mofcom.gov.cn")
# Policy-interpretation sub-sections under /article/zcjd/.
categories = ['jdzhsw', 'jdgnmy', 'jddwmy', 'jdtzhz']
for category in categories:
i = 1
while i > -1:
if i == 1:
URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/"
else:
URL = f"http://www.mofcom.gov.cn/article/zcjd/{category}/?{i}"
i = i + 1
req = urllib.request.urlopen(URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//section[contains(@class, 'listCon iListCon f-mt30')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
                date = subpage.xpath("//span/text()")[0]
                parsed_datetime = datetime.strptime(date[:10], "%Y-%m-%d")  # keep only the date part
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a/@href")
for url in urls:
try:
                            article = {}
                            if '/article/zcjd' in url:
                                url = "http://www.mofcom.gov.cn" + url
                                article['category'] = "Policy Interpretation"
                            else:
                                article['category'] = "Policy Release"
crawl(url, article)
except Exception as error:
print(error)
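# --- ndrc.gov.cn ---
# NDRC policy interpretations; list items may link out to gov.cn or to the
# NDRC policy-release section, and are categorised accordingly.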
print("ndrc.gov.cn")
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.ndrc.gov.cn/xxgk/jd/jd/index.html"
else:
CATEGORY_URL = f"https://www.ndrc.gov.cn/xxgk/jd/jd/index_{i}.html"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'list')]/ul/li[not(@class = 'empty')]")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
            date = subpage.xpath("//span/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y/%m/%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[contains(@target, '_blank')]/@href")
for url in urls:
try:
                        article = {}
                        if "www.gov.cn" in url:
                            article['category'] = "Policy Release"
                        elif "../../zcfb/" in url:
                            url = url.replace("../../zcfb/", "https://www.ndrc.gov.cn/xxgk/zcfb/")
                            article['category'] = "Policy Release"
                        else:
                            url = url.replace("../../", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                            url = url.replace("./", "https://www.ndrc.gov.cn/xxgk/jd/jd/")
                            article['category'] = "Policy Interpretation"
crawl(url, article)
except Exception as error:
print(error)
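# --- safe.gov.cn ---
# SAFE policy and regulation interpretations (zcfgjd).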
print("safe.gov.cn")
i = 1
while i > -1:
if i == 1:
CATEGORY_URL = "https://www.safe.gov.cn/safe/zcfgjd/index.html"
else:
CATEGORY_URL = f"https://www.safe.gov.cn/safe/zcfgjd/index_{i}.html"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
            date = subpage.xpath("//dd/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a/@href")
for url in urls:
try:
                        article = {}
                        url = "https://www.safe.gov.cn" + url
                        article['category'] = "Policy Interpretation"
crawl(url, article)
except Exception as error:
print(error)
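# SAFE data interpretations (sjjd), same paging pattern.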
i = 1
while i > -1:
if i == 1:
CATEGORY_URL = "https://www.safe.gov.cn/safe/sjjd/index.html"
else:
CATEGORY_URL = f"https://www.safe.gov.cn/safe/sjjd/index_{i}.html"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'list_conr')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
            date = subpage.xpath("//dd/text()")[0]
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a/@href")
for url in urls:
try:
                        article = {}
                        url = "https://www.safe.gov.cn" + url
                        article['category'] = "Data Interpretation"
crawl(url, article)
except Exception as error:
print(error)
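# --- stats.gov.cn ---
# National Bureau of Statistics data interpretations (sj/sjjd).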
print("stats.gov.hk")
i = 0
while i > -1:
if i == 0:
CATEGORY_URL = "https://www.stats.gov.cn/sj/sjjd/"
else:
CATEGORY_URL = f"https://www.stats.gov.cn/sj/sjjd/index_{i}.html"
i = i + 1
req = urllib.request.urlopen(CATEGORY_URL)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
articlelist = page.xpath("//div[contains(@class, 'list-content')]/ul/li")
for article in articlelist:
if isinstance(article, etree._Element):
subelement = etree.tostring(article).decode()
subpage = etree.HTML(subelement)
            date = encode(subpage.xpath("//span"))
            parsed_datetime = datetime.strptime(date, "%Y-%m-%d")
if parsed_datetime < (datetime.today() - timedelta(days=DELTA)):
i = -1
else:
urls = subpage.xpath("//a[@class='fl pc_1600']/@href")
for url in urls:
try:
                        article = {}
                        url = url.replace('./', "https://www.stats.gov.cn/sj/sjjd/")
                        article['category'] = "Data Interpretation"
crawl(url, article)
except Exception as error:
print(error)