"""Module to crawl the website 'eastmoney.com' to fetch and process articles."""
import json
import urllib.error
import urllib.request
import uuid
from datetime import datetime, timedelta
from urllib.parse import urlparse
from prefect import task, get_run_logger
from lxml import etree
from controllers.summarizer import summarize
from controllers.utils import (
    datemodifier,
    encode,
    encode_content,
    extract_reference,
    fetch_url,
    sentiment_computation,
    translate,
    update_content
)

with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
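
# Illustrative sketch only: the structure _crawl expects from xpath.json.
# The domain key and the three field names come from the lookups below;
# the XPath expressions themselves are placeholders, not the real selectors.
#
# {
#     "data.eastmoney.com": {
#         "content": "//div[@id='ContentBody']//p/text()",
#         "attachment": "//a[@class='pdf-link']/@href",
#         "datetime_format": "%Y-%m-%d %H:%M:%S"
#     }
# }
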
def _crawl(url, article):
"""
Crawls the given URL and extracts information from the webpage.
Args:
url (str): The URL of the webpage to crawl.
article (dict): A dictionary to store the extracted information.
Returns:
None: If the length of the extracted content is less than 10 characters.
Raises:
None.
"""
domain = urlparse(url).netloc
req = urllib.request.urlopen(url)
text = req.read()
html_text = text.decode("utf-8")
page = etree.HTML(html_text)
contentcn, summary = encode_content(
page.xpath(xpath_dict[domain]['content']))
article['attachment'] = encode(page.xpath(
xpath_dict[domain]['attachment']))
article['link'] = url
    # Prefer the short institution name; fall back to the full name when
    # orgSName only holds an empty quoted string.
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    print(f'INFO - {article}')
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['contentCN'] = repr(contentcn)[1:-1].strip()
    article['category'] = "Macroeconomic Research"
    if len(article['contentCN']) < 10:
        return None
    contenteng = ''
    for element in contentcn.split("\n"):
        contenteng += translate(element) + '\n'
    article['content'] = repr(contenteng)[1:-1].strip()
    print(f'INFO - {article}')
    try:
        article['subtitle'] = summarize(article['content'])
    except (RuntimeError, ValueError):
        article['subtitle'] = translate(summary)
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(
        article['publishDate'], xpath_dict[domain]['datetime_format'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID,
                               article['titleCN'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(
        contentcn.replace("\n", ""))
    print(f'INFO - {article}')
    extract_reference(article)
    update_content(article)
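
# Illustrative sketch only: the fields the eastmoney report API is expected to
# supply on each `article` dict, based on the keys read above and in crawl();
# the values are placeholders.
# {
#     "title": "...",        # Chinese title, preserved as titleCN then translated
#     "orgSName": "...",     # short institution name
#     "orgName": "...",      # full institution name
#     "researcher": "...",   # author name(s)
#     "publishDate": "...",  # raw date string, normalised via datemodifier
#     "encodeUrl": "..."     # used by crawl() to build the report URL
# }
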
@task(name="Data Collection - eastmoney", log_prints=True)
def crawl(delta):
"""
Crawls the website data.eastmoney.com and retrieves reports within a specified time range.
Args:
delta (int): The number of days to go back from the current date.
Returns:
None
Raises:
None
"""
logger = get_run_logger()
logger.info("data.eastmoney.com")
today = datetime.today().strftime('%Y-%m-%d')
i = 0
while i > -1:
category_url = "https://reportapi.eastmoney.com/report/jg"
params = {
"cb": "datatable8544623",
"pageSize": "100",
"beginTime": (datetime.today() - timedelta(days=delta)).strftime('%Y-%m-%d'),
"endTime": today,
"pageNo": i,
"qType": "3",
}
category_url = category_url + "?" + "&".join(f"{key}={value}"
for key, value in params.items())
content = fetch_url(category_url)
print(content)
print(category_url)
if content:
start_index = content.find("(")
result = content[start_index +
1:-1] if start_index != -1 else content
reportinfo = json.loads(result)
if reportinfo["size"] > 0:
i = i + 1
for article in reportinfo['data']:
try:
url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
_crawl(url, article)
except (urllib.error.URLError, json.JSONDecodeError, KeyError) as error:
logger.error(error)
else:
i = -1
else:
logger.error("Failed to fetch URL: %s", category_url)