Data-Collection-China / eastmoney.py
"""
This script is used to crawl a webpage and extract relevant information from it. It defines a function `crawl` that takes a URL and a dictionary to store the extracted information. The function crawls the webpage, extracts the content, translates it to English, and stores it in the dictionary.
The script also includes a main loop that fetches data from a specific URL and calls the `crawl` function for each article in the fetched data.
"""
import uuid
import json
import urllib.request
from urllib.parse import urlparse
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content
# Load XPath dictionary from a JSON file
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
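
# xpath.json is expected to map each page domain to the XPaths used for that
# site, keyed as in the lookups below (values here are illustrative only):
# {"data.eastmoney.com": {"content": "...", "attachment": "...", "datetime_format": "..."}}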
def crawl(url, article):
"""
Crawls the given URL and extracts relevant information from the webpage.
Args:
url (str): The URL of the webpage to crawl.
article (dict): A dictionary to store the extracted information.
Returns:
None: If the length of the extracted content is less than 10 characters.
str: The extracted content in English if successful.
Raises:
None
"""
    domain = urlparse(url).netloc
    req = urllib.request.urlopen(url)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    contentCN, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url
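    # Use the organization's short name as the site; fall back to the full
    # name when the short-name field holds the literal string "''".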
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['contentCN'] = repr(contentCN)[1:-1].strip()
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"
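    # Treat fewer than 10 characters of extracted Chinese content as a failed extraction.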
    if len(article['contentCN']) < 10:
        return None
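    # Translate the Chinese content to English one line at a time.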
    content_eng = ''
    for element in contentCN.split("\n"):
        content_eng += translate(element) + '\n'
    article['content'] = repr(content_eng)[1:-1].strip()
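    # Deterministic UUIDv5 identifiers (from the author, and from title +
    # publish date) make repeated crawls upsert the same records.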
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n", ""))
    upsert_content(article)
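
# Page through EastMoney's research-report listing API over the past six months
# (183 days) and crawl every article returned, stopping on an empty page.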
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
i = 0
while i > -1:
URL = "https://reportapi.eastmoney.com/report/jg"
params = {
"cb": "datatable8544623",
"pageSize": "100",
"beginTime": beginDate,
"endTime": today,
"pageNo": i,
"qType": "3",
}
URL = URL + "?" + "&".join(f"{key}={value}" for key, value in params.items())
print(URL)
    content = fetch_url(URL)
    if content:
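        # The API returns JSONP, e.g. datatable8544623({...}); strip the
        # callback wrapper to get bare JSON.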
        start_index = content.find("(")
        end_index = content.rfind(")")
        if start_index != -1 and end_index != -1:
            result = content[start_index + 1:end_index]
        else:
            result = content
        reportinfo = json.loads(result)
if reportinfo["size"] > 0:
i = i + 1
for article in reportinfo['data']:
try:
url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
crawl(url,article)
except Exception as error:
print(error)
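        # An empty result set means all pages have been consumed; stop paging.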
        else:
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", URL)
        i = -1