File size: 4,900 Bytes
c48c6cf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import requests
import uuid
import time
import json
import urllib.request
from lxml import etree
from googletrans import Translator
import datetime
import boto3
import os
# Shared googletrans client; used by the crawl loop below to translate
# scraped Chinese report text into English.
translator = Translator()
def datemodifier(date_string):
    """Normalize a timestamp string to a plain ``YYYY-MM-DD`` date.

    Parameters
    ----------
    date_string : str
        Timestamp in the form ``"YYYY-MM-DD HH:MM:SS.ffffff"`` (the
        fractional part is parsed and discarded).

    Returns
    -------
    str or bool
        ``"YYYY-MM-DD"`` on success, ``False`` when the input does not
        match the expected format. The False return is kept for backward
        compatibility with callers that test the result for truthiness.
    """
    try:
        parsed = time.strptime(date_string, "%Y-%m-%d %H:%M:%S.%f")
        return time.strftime("%Y-%m-%d", parsed)
    except (TypeError, ValueError):
        # Narrow catch: the original bare `except` also swallowed
        # KeyboardInterrupt/SystemExit, which made the crawl un-interruptible.
        return False
def fetch_url(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Parameters
    ----------
    url : str
        Absolute URL to request.
    timeout : float, optional
        Seconds to wait for the server. Without one, ``requests.get`` can
        block forever on an unresponsive host. Defaults to 30; existing
        callers are unaffected.

    Returns
    -------
    str or None
        Response body on HTTP 200; ``None`` on any other status code or on
        a network error (callers already treat None as "fetch failed").
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection/timeout errors previously propagated and killed the
        # whole crawl; map them to the failure value callers handle.
        return None
    if response.status_code == 200:
        return response.text
    return None
def translist(infolist):
    """Strip every entry of *infolist* and drop the ones that end up empty.

    Returns a list of the surviving non-empty strings, in original order.
    """
    stripped = (entry.strip() for entry in infolist)
    return [piece for piece in stripped if piece]
def encode(content):
    """Flatten a mixed list of lxml elements and plain strings into one
    whitespace-free string.

    Each lxml element is serialized, re-parsed, and its full text content
    extracted; newline/tab/CR/space characters are deleted so the result is
    a single compact run of text. Non-element entries are passed through
    unchanged.
    """
    # One-pass deletion table for the whitespace characters the original
    # stripped with chained .replace() calls.
    drop_ws = str.maketrans('', '', '\n\t\r ')
    pieces = []
    for item in content:
        if isinstance(item, etree._Element):
            # Re-parse the serialized element so //text() collects the text
            # of every descendant node, not just the element itself.
            markup = etree.tostring(item).decode()
            fragment = etree.HTML(markup)
            texts = fragment.xpath('//text()')
            pieces.append(''.join(translist(texts)).translate(drop_ws).strip())
        else:
            pieces.append(item)
    return ''.join(pieces)
# SECURITY: AWS credentials were previously hard-coded on these lines and
# committed to source control -- treat them as leaked and rotate them.
# Read them from the environment (standard AWS variable names) so no secret
# lives in the repository; empty string if unset, which makes boto3 fail
# loudly instead of silently using a leaked key.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
def get_db_connection():
    """Return a boto3 DynamoDB resource handle bound to us-east-1,
    authenticated with the module-level AWS credential constants."""
    return boto3.resource(
        service_name='dynamodb',
        region_name='us-east-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
def upsert_content(report):
    """Write one translated report record into the ``article_test`` table.

    DynamoDB ``put_item`` overwrites any existing item with the same key,
    so repeated runs with the same deterministic id act as an upsert.
    The put_item response is printed for ad-hoc monitoring.
    """
    table = get_db_connection().Table('article_test')
    # Map the scraped/translated report fields onto the table's attributes.
    item = {
        'id': str(report['id']),
        'site': report['site'],
        'title': report['title'],
        'category': "Macroeconomic Research",
        'author': report['author'],
        'content': report['content'],
        'publishDate': report['publishDate'],
        'link': report['url'],
        'attachment': report['reporturl'],
        'authorID': str(report['authorid']),
        'LastModifiedDate': datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
    }
    print(table.put_item(Item=item))
# ---------------------------------------------------------------------------
# Main crawl: page through eastmoney's macro-research report listing (a JSONP
# endpoint), fetch and translate each report's detail page, and upsert each
# record into DynamoDB.
# ---------------------------------------------------------------------------
reportList = []
i = 0
# `i` is both the page counter and the loop sentinel: it is set to -1 when an
# empty result page comes back, which terminates the crawl.
while i > -1:
    url = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        # "beginTime": "2023-12-07",
        "beginTime": "2024-03-07",
        "endTime": "2024-03-07",
        "pageNo": i,
        "qType": "3",
    }
    # NOTE(review): the first request uses pageNo=0 -- confirm the API treats
    # page 0 and page 1 identically, otherwise the first page may be
    # duplicated or skipped.
    url = url + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(url)
    content = fetch_url(url)
    if content:
        # The endpoint returns JSONP: `datatable8544623({...})`. Strip the
        # callback name up to the first "(" and the trailing ")" to get JSON.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: -1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
            for report in reportinfo['data']:
                # Per-report try/except: one bad report (missing key, failed
                # translation, network error) must not abort the whole page.
                try:
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={report['encodeUrl']}"
                    req = urllib.request.urlopen(url)
                    text = req.read()
                    html_text = text.decode("utf-8")
                    page = etree.HTML(html_text)
                    # Flatten the article body and the PDF link to plain text.
                    content = encode(page.xpath("//div[contains(@class, 'ctx-content')]//p"))
                    reporturl = encode(page.xpath("//a[contains(@class, 'pdf-link')]/@href"))
                    report['url'] = url
                    # NOTE(review): this compares against the literal string
                    # "''" (two quote characters), not an empty string --
                    # confirm that is really what the API emits for a missing
                    # short org name.
                    if report['orgSName'] == "''":
                        report['site'] = translator.translate(report['orgSName'], dest='en').text
                    else:
                        report['site'] = translator.translate(report['orgName'], dest='en').text
                    report['reporturl'] = reporturl
                    report['title'] = translator.translate(report['title'], dest='en').text
                    report['author'] = translator.translate(report['researcher'], dest='en').text
                    report['content'] = translator.translate(content, dest='en').text
                    # Deterministic name-based UUIDs so re-runs upsert the
                    # same DynamoDB records instead of creating duplicates.
                    report['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, report['author'])
                    report['publishDate'] = datemodifier(report['publishDate'])
                    # NOTE(review): if datemodifier returned False, the
                    # str+bool concatenation below raises TypeError and this
                    # report is silently dropped via the except handler.
                    report['id'] = uuid.uuid5(uuid.NAMESPACE_OID, report['title']+report['publishDate'])
                    print(report)
                    upsert_content(report)
                    reportList.append(report)
                except Exception as error:
                    print(error)
        else:
            # Empty page: log the raw payload and stop paging.
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", url)
|