# Eastmoney macro-research report scraper (OxbridgeEconomics, commit c48c6cf).
# Note: repository-viewer chrome ("raw / history / blame / 4.9 kB") removed so
# the file is valid Python.
import requests
import uuid
import time
import json
import urllib.request
from lxml import etree
from googletrans import Translator
import datetime
import boto3
import os
# Shared googletrans client; used below to translate scraped Chinese report
# fields (title, author, content, org name) into English.
translator = Translator()
def datemodifier(date_string):
    """Convert a timestamp like '2024-03-07 10:00:00.000' to 'YYYY-MM-DD'.

    Returns the reformatted date string, or ``False`` when *date_string*
    does not match the expected ``%Y-%m-%d %H:%M:%S.%f`` format.  The falsy
    sentinel is kept (rather than raising) because the crawl loop below
    stores the result directly and treats failures as best-effort.
    """
    try:
        parsed = time.strptime(date_string, "%Y-%m-%d %H:%M:%S.%f")
    except (ValueError, TypeError):
        # Was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit; only parsing errors should map to the False sentinel.
        return False
    return time.strftime("%Y-%m-%d", parsed)
def fetch_url(url, timeout=30):
    """GET *url* and return the response body text, or None on a non-200 status.

    Network-level errors (DNS failure, refused connection, timeout) still
    propagate as `requests` exceptions, matching the original behavior.

    :param url: absolute URL to fetch.
    :param timeout: seconds before the request is aborted (new, defaulted
        parameter — the original call had no timeout and could hang forever).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
def translist(infolist):
    """Strip every entry of *infolist* and drop those empty after stripping."""
    stripped = (entry.strip() for entry in infolist)
    return [text for text in stripped if text]
def encode(content):
    """Flatten a mixed list of lxml elements and plain strings into one string.

    Element entries are re-parsed, their text nodes extracted and cleaned of
    whitespace (newlines, tabs, carriage returns, and ALL spaces removed);
    plain-string entries are appended unchanged.
    """
    pieces = []
    for item in content:
        if isinstance(item, etree._Element):
            fragment = etree.HTML(etree.tostring(item).decode())
            joined = ''.join(translist(fragment.xpath('//text()')))
            # Aggressive cleanup: strips every space character, not just
            # leading/trailing whitespace (matches the original behavior).
            piece = (joined.replace('\n', '').replace('\t', '')
                           .replace('\r', '').replace(' ', '').strip())
        else:
            piece = item
        pieces.append(piece)
    return ''.join(pieces)
# SECURITY: credentials were previously hard-coded (and therefore leaked in
# version control — they must be rotated).  Load them from the environment
# instead; empty string when unset so module import never fails.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
def get_db_connection():
    """Return a boto3 DynamoDB service resource for us-east-1."""
    return boto3.resource(
        service_name='dynamodb',
        region_name='us-east-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
def upsert_content(report):
    """Upsert one scraped article into the DynamoDB ``article_test`` table.

    Expects *report* to carry the keys populated by the crawl loop:
    id, site, title, author, content, publishDate, url, reporturl, authorid.
    Prints the raw put_item response (the original behavior).
    """
    table = get_db_connection().Table('article_test')
    modified_stamp = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    item = {
        'id': str(report['id']),
        'site': report['site'],
        'title': report['title'],
        'category': "Macroeconomic Research",
        'author': report['author'],
        'content': report['content'],
        'publishDate': report['publishDate'],
        'link': report['url'],
        'attachment': report['reporturl'],
        'authorID': str(report['authorid']),
        'LastModifiedDate': modified_stamp,
    }
    print(table.put_item(Item=item))
# --- Top-level crawl loop ------------------------------------------------
# Pages through eastmoney's macro-research report listing API (a JSONP
# endpoint), then for each report: fetches the detail page, extracts and
# translates content to English, and upserts the record into DynamoDB.
reportList = []
i = 0
while i > -1:  # loops until an empty page sets i = -1 below
    url = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        # "beginTime": "2023-12-07",
        "beginTime": "2024-03-07",
        "endTime": "2024-03-07",
        "pageNo": i,
        "qType": "3",
    }
    url = url + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(url)
    content = fetch_url(url)
    if content:
        # Response is JSONP: strip the "callback(" prefix and ")" suffix
        # before parsing as JSON.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: -1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
            for report in reportinfo['data']:
                try:
                    # `url` and `content` are deliberately reused/shadowed
                    # here for the per-report detail fetch.
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={report['encodeUrl']}"
                    req = urllib.request.urlopen(url)
                    text = req.read()
                    html_text = text.decode("utf-8")
                    page = etree.HTML(html_text)
                    content = encode(page.xpath("//div[contains(@class, 'ctx-content')]//p"))
                    reporturl = encode(page.xpath("//a[contains(@class, 'pdf-link')]/@href"))
                    report['url'] = url
                    # NOTE(review): compares against the literal two-character
                    # string "''" — presumably the API encodes a missing short
                    # org name that way; confirm, otherwise this branch may
                    # never fire and orgName is always used.
                    if report['orgSName'] == "''":
                        report['site'] = translator.translate(report['orgSName'], dest='en').text
                    else:
                        report['site'] = translator.translate(report['orgName'], dest='en').text
                    report['reporturl'] = reporturl
                    report['title'] = translator.translate(report['title'], dest='en').text
                    report['author'] = translator.translate(report['researcher'], dest='en').text
                    report['content'] = translator.translate(content, dest='en').text
                    # Deterministic IDs so re-runs upsert rather than duplicate.
                    report['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, report['author'])
                    # NOTE(review): if datemodifier returned False, the string
                    # concatenation below raises TypeError — caught by the
                    # best-effort handler, so that report is silently skipped.
                    report['publishDate'] = datemodifier(report['publishDate'])
                    report['id'] = uuid.uuid5(uuid.NAMESPACE_OID, report['title']+report['publishDate'])
                    print(report)
                    upsert_content(report)
                    reportList.append(report)
                except Exception as error:
                    # Best-effort: log and skip any report that fails to
                    # fetch, parse, or translate.
                    print(error)
        else:
            # Empty page: dump the raw response and terminate the loop.
            print(reportinfo)
            i = -1
    else:
        # NOTE(review): on fetch failure the loop retries the same page
        # forever (i is unchanged) — confirm this is intended.
        print("Failed to fetch URL:", url)