"""Module to crawl the website 'eastmoney.com' to fetch and process articles."""
import json
import logging
import time
import urllib.error
import urllib.request
import uuid
from datetime import datetime, timedelta
from http.client import IncompleteRead
from urllib.parse import urlparse

from lxml import etree
from prefect import task, get_run_logger

from controllers.summarizer import summarize
from controllers.utils import (
    datemodifier,
    encode,
    encode_content,
    extract_reference,
    fetch_url,
    sentiment_computation,
    translate,
    update_content,
)
from controllers.vectorizer import vectorize

with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
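
# Illustrative only: the lookups below imply xpath.json maps each domain to
# its extraction rules. A hypothetical entry (the XPath strings are made up;
# only the key names "content", "attachment" and "datetime_format" come from
# this module):
#
#     {
#         "data.eastmoney.com": {
#             "content": "//div[@class='newsContent']//p/text()",
#             "attachment": "//a[@class='pdf-link']/@href",
#             "datetime_format": "%Y-%m-%d %H:%M:%S"
#         }
#     }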


def _crawl(url, article, retries=3):
    """
    Crawl the given URL and populate the article dict with the extracted
    information.

    Args:
        url (str): The URL of the webpage to crawl.
        article (dict): A dictionary to store the extracted information,
            pre-filled with metadata from the report listing API.
        retries (int): Number of fetch attempts before giving up.

    Returns:
        None: If the fetch fails or the extracted content is shorter than
        10 characters; otherwise the article is persisted and vectorized
        as a side effect.
    """
    domain = urlparse(url).netloc
    for attempt in range(retries):
        try:
            req = urllib.request.urlopen(url, timeout=60)
            text = req.read()
            break
        except (IncompleteRead, TimeoutError) as e:
            # Retry on transient read/timeout errors; give up after the
            # last attempt.
            if attempt < retries - 1:
                time.sleep(1)  # Wait before retrying
                continue
            logging.error(e)
            return None
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    contentcn, summary = encode_content(
        page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(
        xpath_dict[domain]['attachment']))
    article['link'] = url
    # The listing API returns the literal string "''" when the short org
    # name is missing; fall back to the full name in that case.
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['contentCN'] = repr(contentcn)[1:-1].strip()
    article['category'] = "Macroeconomic Research"
    if len(article['contentCN']) < 10:
        return None
    # Translate the Chinese content line by line.
    contenteng = ''.join(translate(element) + '\n'
                         for element in contentcn.split("\n"))
    logging.info(contenteng)
    article['content'] = repr(contenteng)[1:-1].strip()
    try:
        article['subtitle'] = summarize(article['content'])
    except (RuntimeError, ValueError):
        # Fall back to translating the extracted summary when the
        # summarizer fails or rejects the input.
        article['subtitle'] = translate(summary)
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(
        article['publishDate'], xpath_dict[domain]['datetime_format'])
    # uuid5 is deterministic, so re-crawling the same article yields the
    # same id.
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID,
                               article['titleCN'] + article['publishDate'])
    article['sentimentScore'], article[
        'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
    article['referenceid'] = None
    reference_id = extract_reference(article)
    if reference_id:
        article['referenceid'] = reference_id
    update_content(article)
    vectorize(article)
    # openai_vectorize(article)
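

# Illustrative only: crawl() below assumes the eastmoney report API wraps its
# JSON payload in a JSONP callback. A hypothetical response (field values are
# made up; the key names are the ones the code reads):
#
#     datatable8544623({
#         "size": 100,
#         "data": [
#             {"title": "...", "orgName": "...", "orgSName": "...",
#              "researcher": "...", "publishDate": "...", "encodeUrl": "..."}
#         ]
#     })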
@task(name="Data Collection - eastmoney", log_prints=True)
def crawl(delta):
    """
    Crawl data.eastmoney.com and process reports published within the last
    delta days.

    Args:
        delta (int): The number of days to go back from the current date.

    Returns:
        None
    """
    logger = get_run_logger()
    logger.info("data.eastmoney.com")
    today = datetime.today().strftime('%Y-%m-%d')
    i = 0
    while i > -1:  # set to -1 once the API returns an empty page
        category_url = "https://reportapi.eastmoney.com/report/jg"
        params = {
            "cb": "datatable8544623",
            "pageSize": "100",
            "beginTime": (datetime.today() - timedelta(days=delta)).strftime('%Y-%m-%d'),
            "endTime": today,
            "pageNo": i,
            "qType": "3",
        }
        category_url = category_url + "?" + "&".join(f"{key}={value}"
                                                     for key, value in params.items())
        content = fetch_url(category_url)
        logger.info(category_url)
        logger.info(content)
        if content:
            # Strip the JSONP wrapper: "datatable8544623({...})" -> "{...}".
            start_index = content.find("(")
            result = content[start_index +
                             1:-1] if start_index != -1 else content
            reportinfo = json.loads(result)
            if reportinfo["size"] > 0:
                i = i + 1
                for article in reportinfo['data']:
                    try:
                        link = "https://data.eastmoney.com/report/zw_macresearch.jshtml"
                        url = f"{link}?encodeUrl={article['encodeUrl']}"
                        _crawl(url, article)
                    except (urllib.error.URLError, json.JSONDecodeError, KeyError) as error:
                        logger.error(error)
            else:
                i = -1
        else:
            logger.error("Failed to fetch URL: %s", category_url)
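

# A minimal sketch of running the task locally, assuming a Prefect 2.x
# environment; the wrapper flow name and the 7-day window are illustrative,
# not part of the original pipeline.
if __name__ == "__main__":
    from prefect import flow

    @flow
    def eastmoney_backfill():
        crawl(7)  # fetch reports published in the last 7 days

    eastmoney_backfill()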