"""Crawl EastMoney macroeconomic research reports, translate them into
English, score their sentiment, and upsert the records via the utils helpers."""
import uuid
import json
import urllib.request
from urllib.parse import urlparse, urlencode
from datetime import datetime, timedelta
from lxml import etree
from utils import encode, translate, datemodifier, sentiment_computation, upsert_content, fetch_url, encode_content

with open('xpath.json', 'r', encoding='UTF-8') as f:
    xpath_dict = json.load(f)
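
# Illustrative shape of xpath.json, inferred from the lookups below; the
# concrete domain and expressions here are assumptions, not the real file:
# {
#     "data.eastmoney.com": {
#         "content": "//div[@class='ctx-content']//p",
#         "attachment": "//a[contains(@href, '.pdf')]/@href",
#         "datetime": {"format_string": "%Y-%m-%d %H:%M:%S"}
#     }
# }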

def crawl(url, article):
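    """Fetch one report page, translate it, and upsert the enriched record.

    `article` is a row from the EastMoney report-listing API; the fields read
    here (orgSName, orgName, title, researcher, publishDate) are inferred from
    how this function uses them, not from API documentation.
    """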
    domain = urlparse(url).netloc
    # Fetch and parse the report page using the XPath rules for this domain.
    with urllib.request.urlopen(url) as req:
        html_text = req.read().decode("utf-8")
    page = etree.HTML(html_text)
    # Extract the body text, a short summary, and any attachment links.
    originContent, summary = encode_content(page.xpath(xpath_dict[domain]['content']))
    article['attachment'] = encode(page.xpath(xpath_dict[domain]['attachment']))
    article['link'] = url
    # Prefer the institution's short name; fall back to the full name when the
    # API returns the empty placeholder "''".
    if article['orgSName'] == "''":
        article['site'] = translate(article['orgName'])
    else:
        article['site'] = translate(article['orgSName'])
    article['originTitle'] = article['title']
    article['title'] = translate(article['title'])
    article['author'] = translate(article['researcher'])
    article['originAuthor'] = article['researcher']
    article['originContent'] = repr(originContent)
    article['subtitle'] = translate(summary)
    article['category'] = "Macroeconomic Research"
    # Skip articles whose extracted body is effectively empty.
    if len(article['originContent']) < 10:
        return None
    # Translate the body paragraph by paragraph, preserving line structure.
    content_eng = '\n'.join(translate(paragraph) for paragraph in originContent.split('\n')) + '\n'
    article['content'] = repr(content_eng)
    # Deterministic UUIDs so re-crawling the same article upserts in place.
    article['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, article['author'])
    article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime']['format_string'])
    article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['title'] + article['publishDate'])
    article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(originContent.replace("\n", ""))
    upsert_content(article)

# Query window: reports from the last six months (183 days) through today.
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')

# Page through the EastMoney report-listing API until it returns an empty page.
i = 0
while i > -1:
    base_url = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",  # JSONP callback name; the wrapper is stripped below
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    URL = base_url + "?" + urlencode(params)
    print(URL)
    content = fetch_url(URL)
    if content:
        # Strip the JSONP wrapper, e.g. datatable8544623({...}) -> {...}
        start_index = content.find("(")
        end_index = content.rfind(")")
        if start_index != -1 and end_index != -1:
            result = content[start_index + 1:end_index]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1  # more results may follow; advance to the next page
            for article in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={article['encodeUrl']}"
                    crawl(url, article)
                except Exception as error:
                    print(error)
        else:
            # An empty page means we have paged past the last result; stop.
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", URL)