File size: 7,005 Bytes
c48c6cf
 
 
 
 
 
 
 
 
86c11ee
 
986000b
b2dbbaf
 
 
 
986000b
c48c6cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86c11ee
c48c6cf
 
 
 
 
 
79fe6b3
 
 
c48c6cf
 
 
 
 
 
 
86c11ee
 
 
c48c6cf
 
 
 
86c11ee
54c7a12
c48c6cf
 
 
 
 
 
86c11ee
 
c48c6cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86c11ee
c48c6cf
 
86c11ee
c48c6cf
86c11ee
a6d7194
c48c6cf
86c11ee
 
b2dbbaf
ae6aa5f
469400a
ae6aa5f
c48c6cf
 
 
86c11ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c48c6cf
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import requests
import uuid
import time
import json
import urllib.request
from lxml import etree
from googletrans import Translator
import boto3
import os
from datetime import datetime, timedelta
from decimal import Decimal
from transformers import pipeline

# AWS credentials are read at import time; raises KeyError if either is unset.
AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']

# FinBERT financial-sentiment pipeline (downloads the model on first use).
analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# googletrans client used below to translate Chinese fields into English.
translator = Translator()

def datemodifier(date_string):
    """Normalize a timestamp string to a plain date.

    Parses a "%Y-%m-%d %H:%M:%S.%f" timestamp (e.g. "2024-01-02 03:04:05.678")
    and reformats it as "YYYY-MM-DD".

    Args:
        date_string: Timestamp string in the expected format.

    Returns:
        The "YYYY-MM-DD" date string, or False when parsing fails
        (the False sentinel is kept for backward compatibility with callers).
    """
    try:
        parsed = time.strptime(date_string, "%Y-%m-%d %H:%M:%S.%f")
    except (ValueError, TypeError):
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and hid unrelated bugs.
        return False
    return time.strftime("%Y-%m-%d", parsed)

def fetch_url(url, timeout=30):
    """Fetch a URL and return its body text.

    Args:
        url: The URL to GET.
        timeout: Seconds before the request is aborted (new parameter with a
            default, so existing callers are unaffected). The original had no
            timeout and could hang the scraper indefinitely.

    Returns:
        The response text on HTTP 200, otherwise None. Network-level errors
        (connection refused, DNS failure, timeout) also yield None, matching
        the None-on-failure contract callers already check with `if content:`.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
    
def translist(infolist):
    """Strip whitespace from every entry and drop the empty results."""
    stripped = (entry.strip() for entry in infolist)
    return [entry for entry in stripped if entry]

def encode(content):
    """Concatenate the text of mixed string / lxml-element items.

    lxml elements are re-serialized, their text nodes extracted and joined,
    then newlines, tabs, carriage returns and double spaces are stripped out.
    Plain strings pass through unchanged.
    """
    pieces = []
    for item in content:
        if isinstance(item, etree._Element):
            markup = etree.tostring(item).decode()
            fragment = etree.HTML(markup)
            joined = ''.join(translist(fragment.xpath('//text()')))
            # Same replacement order as the original: removing \n/\t/\r may
            # create a double space that the final pass then deletes.
            for junk in ('\n', '\t', '\r', '  '):
                joined = joined.replace(junk, '')
            line = joined.strip()
        else:
            line = item
        pieces.append(line)
    return ''.join(pieces)

def get_db_connection():
    """Return a boto3 DynamoDB resource for us-east-1 using the env credentials."""
    return boto3.resource(
        service_name='dynamodb',
        region_name='us-east-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )

def upsert_content(report):
    """Upsert one article record into the DynamoDB `article_china` table.

    Expects `report` to carry the keys populated by the scraping loop:
    id, site, title, author, content, publishDate, url, reporturl,
    authorid, sentimentScore, sentimentLabel.

    Side effects: one `put_item` call; the raw response is printed.
    (Removed the commented-out originalSite/originalTitle/originalContent
    fields — dead code — and fixed the misleadingly indented comment.)
    """
    dynamodb = get_db_connection()
    table = dynamodb.Table('article_china')
    item = {
        'id': str(report['id']),
        'site': report['site'],
        'title': report['title'],
        # Fixed category: this scraper only ingests macro research reports.
        'category': "Macroeconomic Research",
        'author': report['author'],
        'content': report['content'],
        'publishDate': report['publishDate'],
        'link': report['url'],
        'attachment': report['reporturl'],
        'authorID': str(report['authorid']),
        # Quantized to 2 decimal places and stored as a string for DynamoDB.
        'sentimentScore': str(Decimal(report['sentimentScore']).quantize(Decimal('0.01'))),
        'sentimentLabel': report['sentimentLabel'],
        'LastModifiedDate': datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
    }
    response = table.put_item(Item=item)
    print(response)

# Main loop: page through EastMoney's macro-research report API for the last
# ~6 months, translate each report to English, score sentiment with FinBERT,
# and upsert the result into DynamoDB.
today = datetime.today().strftime('%Y-%m-%d')
beginDate = (datetime.today() - timedelta(days=183)).strftime('%Y-%m-%d')
i = 0
# `i` is both the page number and the loop flag: set to -1 on an empty page,
# which ends the while loop.
while i > -1:
    url = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        "beginTime": beginDate,
        "endTime": today,
        "pageNo": i,
        "qType": "3",
    }
    url = url + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(url)
    content = fetch_url(url)
    if content:
        # The endpoint returns JSONP ("datatable8544623({...})"); strip the
        # callback wrapper to recover the JSON payload.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: -1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
            for report in reportinfo['data']:
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={report['encodeUrl']}"
                    # NOTE(review): urlopen has no timeout here — a stalled
                    # connection hangs the loop; confirm and consider adding one.
                    req = urllib.request.urlopen(url)
                    text = req.read()
                    html_text = text.decode("utf-8")
                    page = etree.HTML(html_text)
                    content = encode(page.xpath("//div[contains(@class, 'ctx-content')]//p"))
                    reporturl = encode(page.xpath("//a[contains(@class, 'pdf-link')]/@href"))
                    report['url'] = url
                    # NOTE(review): this compares against the literal
                    # two-character string "''" — presumably the API's empty
                    # placeholder for orgSName; verify, since an actually empty
                    # string would take the else branch.
                    if report['orgSName'] == "''":
                        report['site'] = translator.translate(report['orgSName'], dest='en').text
                        report['originalSite'] = report['orgSName']
                    else:
                        report['site'] = translator.translate(report['orgName'], dest='en').text
                        report['originalSite'] = report['orgSName']
                    report['reporturl'] = reporturl
                    report['originalTitle'] = report['title']
                    report['title'] = translator.translate(report['title'], dest='en').text
                    report['author'] = translator.translate(report['researcher'], dest='en').text
                    report['originalAuthor'] = report['researcher']
                    report['originalContent'] = content
                    content_eng = ''
                    # Translate sentence by sentence, splitting on the Chinese
                    # full stop to stay within per-request size limits.
                    for element in report['originalContent'].split("。"):
                        content_eng += translator.translate(element, dest='en').text + ' '
                    report['content'] = content_eng
                    report['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, report['author'])
                    # datemodifier returns False on a parse failure, which makes
                    # the title+publishDate concatenation below raise TypeError
                    # — caught (and skipped) by the broad except.
                    report['publishDate'] = datemodifier(report['publishDate'])
                    report['id'] = uuid.uuid5(uuid.NAMESPACE_OID, report['title']+report['publishDate'])
                    label_dict = {
                        "positive": "+",
                        "negative": "-",
                        "neutral": "0",
                    }
                    sentiment_score = 0
                    maximum_value = 0
                    # FinBERT accepts up to 512 tokens; truncating to the first
                    # 512 characters keeps the input safely under that limit.
                    # NOTE(review): return_all_scores is deprecated in newer
                    # transformers releases in favor of top_k=None — confirm
                    # the pinned version still supports it.
                    raw_sentiment = analyzer(report['content'][:512], return_all_scores=True)
                    sentiment_label = None
                    for sentiment_dict in raw_sentiment[0]:
                        value = sentiment_dict["score"]
                        # Track the highest-probability label for the +/-/0 tag.
                        if value > maximum_value:
                            sentiment_label = sentiment_dict["label"]
                            maximum_value = value
                        # The else pairs with the "negative" if only, so both
                        # positive and neutral add 0 there; the net effect is
                        # score = P(positive) - P(negative).
                        if sentiment_dict["label"] == "positive":
                            sentiment_score = sentiment_score + value
                        if sentiment_dict["label"] == "negative":
                            sentiment_score = sentiment_score - value
                        else:
                            sentiment_score = sentiment_score + 0
                    report['sentimentScore'] = sentiment_score
                    report['sentimentLabel'] = label_dict[sentiment_label]
                    upsert_content(report)
                except Exception as error:
                    # Best-effort: a failed report is logged and skipped so one
                    # bad record doesn't stop the whole crawl.
                    print(error)
        else:
            # Empty page: print the final API response and stop paging.
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", url)