OxbridgeEconomics committed on
Commit
c48c6cf
·
1 Parent(s): 2ef4c29
Files changed (4)
  1. .gitignore +1 -0
  2. main.ipynb +0 -0
  3. main.py +141 -0
  4. requirements.txt +23 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ ./env
main.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,141 @@
+ import requests
+ import uuid
+ import time
+ import json
+ import urllib.request
+ from lxml import etree
+ from googletrans import Translator
+ import datetime
+ import boto3
+ import os
+
+ translator = Translator()
+
+ def datemodifier(date_string):
+     """Date Modifier Function"""
+     try:
+         to_date = time.strptime(date_string,"%Y-%m-%d %H:%M:%S.%f")
+         return time.strftime("%Y-%m-%d",to_date)
+     except:
+         return False
+
+ def fetch_url(url):
+     response = requests.get(url)
+     if response.status_code == 200:
+         return response.text
+     else:
+         return None
+
+ def translist(infolist):
+     """Translist Function"""
+     out = list(filter(lambda s: s and
+                (isinstance (s,str) or len(s.strip()) > 0), [i.strip() for i in infolist]))
+     return out
+
+ def encode(content):
+     """Encode Function"""
+     text = ''
+     for element in content:
+         if isinstance(element, etree._Element):
+             subelement = etree.tostring(element).decode()
+             subpage = etree.HTML(subelement)
+             tree = subpage.xpath('//text()')
+             line = ''.join(translist(tree)).\
+                 replace('\n','').replace('\t','').replace('\r','').replace(' ','').strip()
+         else:
+             line = element
+         text += line
+     return text
+
+ AWS_ACCESS_KEY_ID = "AKIAQFXZMGHQYXKWUDWR"
+ AWS_SECRET_ACCESS_KEY = "D2A0IEVl5g3Ljbu0Y5iq9WuFETpDeoEpl69C+6xo"
+
+ def get_db_connection():
+     """Get dynamoDB connection"""
+     dynamodb = boto3.resource(
+         service_name='dynamodb',
+         region_name='us-east-1',
+         aws_access_key_id=AWS_ACCESS_KEY_ID,
+         aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+     )
+     return dynamodb
+
+ def upsert_content(report):
+     """Upsert the content records"""
+     dynamodb = get_db_connection()
+     table = dynamodb.Table('article_test')
+
+     # Define the item data
+     item = {
+         'id': str(report['id']),
+         'site': report['site'],
+         'title': report['title'],
+         'category': "Macroeconomic Research",
+         'author': report['author'],
+         'content': report['content'],
+         'publishDate': report['publishDate'],
+         'link': report['url'],
+         'attachment': report['reporturl'],
+         'authorID': str(report['authorid']),
+         'LastModifiedDate': datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
+     }
+     response = table.put_item(Item=item)
+     print(response)
+
+
+ reportList = []
+ i = 0
+ while i > -1:
+     url = "https://reportapi.eastmoney.com/report/jg"
+     params = {
+         "cb": "datatable8544623",
+         "pageSize": "100",
+         # "beginTime": "2023-12-07",
+         "beginTime": "2024-03-07",
+         "endTime": "2024-03-07",
+         "pageNo": i,
+         "qType": "3",
+     }
+     url = url + "?" + "&".join(f"{key}={value}" for key, value in params.items())
+     print(url)
+     content = fetch_url(url)
+     if content:
+         start_index = content.find("(")
+         if start_index != -1:
+             result = content[start_index + 1: -1]
+         else:
+             result = content
+         reportinfo = json.loads(result)
+         if reportinfo["size"] > 0:
+             i = i + 1
+             for report in reportinfo['data']:
+                 try:
+                     url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={report['encodeUrl']}"
+                     req = urllib.request.urlopen(url)
+                     text = req.read()
+                     html_text = text.decode("utf-8")
+                     page = etree.HTML(html_text)
+                     content = encode(page.xpath("//div[contains(@class, 'ctx-content')]//p"))
+                     reporturl = encode(page.xpath("//a[contains(@class, 'pdf-link')]/@href"))
+                     report['url'] = url
+                     if report['orgSName'] == "''":
+                         report['site'] = translator.translate(report['orgSName'], dest='en').text
+                     else:
+                         report['site'] = translator.translate(report['orgName'], dest='en').text
+                     report['reporturl'] = reporturl
+                     report['title'] = translator.translate(report['title'], dest='en').text
+                     report['author'] = translator.translate(report['researcher'], dest='en').text
+                     report['content'] = translator.translate(content, dest='en').text
+                     report['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, report['author'])
+                     report['publishDate'] = datemodifier(report['publishDate'])
+                     report['id'] = uuid.uuid5(uuid.NAMESPACE_OID, report['title']+report['publishDate'])
+                     print(report)
+                     upsert_content(report)
+                     reportList.append(report)
+                 except Exception as error:
+                     print(error)
+         else:
+             print(reportinfo)
+             i = -1
+     else:
+         print("Failed to fetch URL:", url)
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ boto3==1.34.57
+ botocore==1.34.57
+ certifi==2024.2.2
+ chardet==3.0.4
+ charset-normalizer==3.3.2
+ googletrans==3.1.0a0
+ h11==0.9.0
+ h2==3.2.0
+ hpack==3.0.0
+ hstspreload==2024.3.1
+ httpcore==0.9.1
+ httpx==0.13.3
+ hyperframe==5.2.0
+ idna==2.10
+ jmespath==1.0.1
+ lxml==5.1.0
+ python-dateutil==2.9.0.post0
+ requests==2.31.0
+ rfc3986==1.5.0
+ s3transfer==0.10.0
+ six==1.16.0
+ sniffio==1.3.1
+ urllib3==2.0.7
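googletrans==3.1.0a0 is an alpha release that appears to be pinned alongside the older httpx 0.13 / httpcore 0.9 stack it was built against, which is why those versions are held back; main.py relies on its synchronous Translator.translate(...).text interface. A quick sanity check under these pins might look like the sketch below (the Chinese sample string is for illustration only, not taken from the scraped reports, and the call needs network access to Google Translate):

    from googletrans import Translator

    translator = Translator()
    # Sample string chosen for illustration; prints the English translation
    result = translator.translate("宏观经济研究", dest="en")
    print(result.text)  # e.g. "Macroeconomic Research"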