"""Module to crawl the website 'eastmoney.com' to fetch and process articles."""
import json
import logging
import urllib.request
import uuid
from datetime import datetime, timedelta
from urllib.parse import urlparse

from lxml import etree

from controllers.summarizer import summarize
from controllers.utils import (
    datemodifier,
    encode,
    encode_content,
    extract_reference,
    fetch_url,
    sentiment_computation,
    translate,
    update_content
)

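# Per-domain XPath expressions used to locate article content and attachments.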
with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)


def _crawl(url, article):
    """
    Crawls the given URL and extracts information from the webpage.

    Args:
        url (str): The URL of the webpage to crawl.
        article (dict): A dictionary to store the extracted information.

    Returns:

        None: If the length of the extracted content is less than 10 characters.

    Raises:
        None.

    """
    domain = urlparse(url).netloc
    req = urllib.request.urlopen(url)
    text = req.read()
    html_text = text.decode("utf-8")
    page = etree.HTML(html_text)
    contentcn, summary = encode_content(
        page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(
        xpath_dict[domain]['attachment']))
    article['link'] = url
    # Prefer the short org name; fall back to the full name when it is empty.
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['titleCN'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
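    # repr() escapes newlines and quotes; strip the quotes it adds around the text.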
    article['contentCN'] = repr(contentcn)[1:-1].strip()
    article['category'] = "Macroeconomic Research"
    if len(article['contentCN']) < 10:
        return None
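    # Translate the Chinese content line by line to preserve paragraph breaks.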
    contenteng = ''
    for element in contentcn.split("\n"):
        contenteng += translate(element) + '\n'
    article['content'] = repr(contenteng)[1:-1].strip()
    article['subtitle'] = summarize(article['content'])
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(
        article['publishDate'], xpath_dict[domain]['datetime_format'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID,
                               article['titleCN'] + article['publishDate'])
    article['sentimentScore'], article[
        'sentimentLabel'] = sentiment_computation(contentcn.replace("\n", ""))
    extract_reference(article)
    update_content(article)
    logging.info(article)


def crawl(delta):
    """
    Crawls the website data.eastmoney.com and retrieves reports within a specified time range.

    Args:
        delta (int): The number of days to go back from the current date.

    Returns:
        None

    Raises:
        None
    """
    logging.info("data.eastmoney.com")
    today = datetime.today().strftime('%Y-%m-%d')
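    # Page through the report API; `i` is the page counter and is set to -1
    # to stop paging once an empty page or a fetch failure is hit.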
    i = 0
    while i > -1:
        category_url = "https://reportapi.eastmoney.com/report/jg"
        params = {
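            # "cb" requests a JSONP-wrapped response; the wrapper is
            # stripped after fetching.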
            "cb": "datatable8544623",
            "pageSize": "100",
            "beginTime": (datetime.today() - timedelta(days=delta)).strftime('%Y-%m-%d'),
            "endTime": today,
            "pageNo": i,
            "qType": "3",
        }
        category_url = category_url + "?" + "&".join(
            f"{key}={value}" for key, value in params.items())
        content = fetch_url(category_url)
        if content:
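            # Strip the JSONP wrapper, e.g. "datatable8544623(...)",
            # to get the raw JSON payload.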
            start_index = content.find("(")
            result = (content[start_index + 1:-1]
                      if start_index != -1 else content)
            reportinfo = json.loads(result)
            if reportinfo["size"] > 0:
                i = i + 1
                for article in reportinfo['data']:
                    try:
                        url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                        _crawl(url, article)
                    except Exception as error:
                        logging.error("Failed to crawl %s: %s", url, error)
            else:
                i = -1
        else:
            logging.error("Failed to fetch URL: %s", category_url)
            i = -1  # stop paging instead of retrying the same URL forever
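

# Illustrative entry point, a minimal sketch and not part of the original
# module: crawl reports published over the last day. Assumes console logging
# is acceptable; adjust the logging level and `delta` as needed.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    crawl(1)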