"""Module to crawl the website 'eastmoney.com' to fetch and process articles."""
import json
import logging
import urllib.request
import uuid
from datetime import datetime, timedelta
from urllib.parse import urlparse
from lxml import etree
from controllers.summarizer import summarize
from controllers.utils import (
    datemodifier,
    encode,
    encode_content,
    extract_reference,
    fetch_url,
    sentiment_computation,
    translate,
    update_content
)
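
# Per-domain XPath rules (content, attachment, datetime format), loaded once at import time.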
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)


def _crawl(url, article):
    """
    Crawl the given URL and populate ``article`` with the extracted fields.

    Args:
        url (str): The URL of the webpage to crawl.
        article (dict): A dictionary to store the extracted information.

    Returns:
        None. Returns early without further processing if the extracted
        content is shorter than 10 characters.
    """
    domain = urlparse(url).netloc
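    # Fetch the page and parse the HTML into an lxml element tree.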
    req = urllib.request.urlopen(url)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
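    # Extract the Chinese body text and attachment links via the domain-specific XPath rules.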
    contentcn, summary = encode_content(
        page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(
        xpath_dict[domain]['attachment']))
    article['link'] = url
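    # The feed appears to use the literal string "''" when the short org name is missing,
    # so fall back to the full organisation name in that case.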
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['contentCN'] = repr(contentcn)[1:-1].strip()
    article['category'] = "Macroeconomic Research"
    if len(article['contentCN']) < 10:
        return None
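    # Translate the Chinese content line by line so paragraph breaks survive translation.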
    contenteng = ''
    for element in contentcn.split("\n"):
        contenteng += translate(element) + '\n'
    article['content'] = repr(contenteng)[1:-1].strip()
    article['subtitle'] = summarize(article['content'])
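    # Normalise the publish date and derive deterministic author/article IDs.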
    # Store the UUIDs as strings so the article dict stays JSON-serialisable downstream.
    article['authorid'] = str(uuid.uuid5(uuid.NAMESPACE_OID, article['author']))
    article['publishDate'] = datemodifier(
        article['publishDate'], xpath_dict[domain]['datetime_format'])
    article['id'] = str(uuid.uuid5(uuid.NAMESPACE_OID,
                                   article['titleCN'] + article['publishDate']))
    article['sentimentScore'], article[
        'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
    extract_reference(article)
    update_content(article)
    logging.info(article)


def crawl(delta):
    """
    Crawl data.eastmoney.com and retrieve reports within a specified time range.

    Args:
        delta (int): The number of days to look back from the current date.

    Returns:
        None
    """
    logging.info("data.eastmoney.com")
    today = datetime.today().strftime('%Y-%m-%d')
    i = 0
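    # Page through the report listing until the API returns an empty page.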
    while i > -1:
        category_url = "https://reportapi.eastmoney.com/report/jg"
        params = {
            "cb": "datatable8544623",
            "pageSize": "100",
            "beginTime": (datetime.today() - timedelta(days=delta)).strftime('%Y-%m-%d'),
            "endTime": today,
            "pageNo": i,
            "qType": "3",
        }
        category_url = category_url + "?" + "&".join(f"{key}={value}"
                                                     for key, value in params.items())
        content = fetch_url(category_url)
        if content:
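            # The response is JSONP, e.g. datatable8544623({...}); strip the
            # callback wrapper before parsing the JSON payload.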
            start_index = content.find("(")
            result = content[start_index + 1:-1] if start_index != -1 else content
            reportinfo = json.loads(result)
            if reportinfo["size"] > 0:
                i = i + 1
                for article in reportinfo['data']:
                    try:
                        url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                        _crawl(url, article)
                    except Exception as error:
                        logging.error(error)
            else:
                i = -1
        else:
            logging.error("Failed to fetch URL: %s", category_url)
            break  # stop paging rather than retrying the same URL indefinitely
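

# Minimal usage sketch (an assumed entry point, not part of the original module):
# running this file directly crawls reports published over the last seven days.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    crawl(7)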