import requests
import uuid
import time
import json
import urllib.request
import datetime
import os

import boto3
from lxml import etree
from googletrans import Translator

translator = Translator()


def datemodifier(date_string):
    """Normalise a 'YYYY-MM-DD HH:MM:SS.ffffff' timestamp to 'YYYY-MM-DD'."""
    try:
        to_date = time.strptime(date_string, "%Y-%m-%d %H:%M:%S.%f")
        return time.strftime("%Y-%m-%d", to_date)
    except (ValueError, TypeError):
        # Returned when the timestamp does not match the expected format.
        return False


def fetch_url(url):
    """Return the response body for a URL, or None on a non-200 status."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None


def translist(infolist):
    """Strip each text node and drop entries that are empty or whitespace-only."""
    return [s for s in (i.strip() for i in infolist) if s]


def encode(content):
    """Flatten an lxml node list into a single cleaned-up text string."""
    text = ''
    for element in content:
        if isinstance(element, etree._Element):
            subelement = etree.tostring(element).decode()
            subpage = etree.HTML(subelement)
            tree = subpage.xpath('//text()')
            line = ''.join(translist(tree)).replace('\n', '').replace('\t', '') \
                .replace('\r', '').replace(' ', '').strip()
        else:
            line = element
        text += line
    return text


# Fall back to the values hardcoded in the original script; prefer supplying
# credentials through the standard AWS environment variables or an IAM role.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "AKIAQFXZMGHQYXKWUDWR")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "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")


def get_db_connection():
    """Get a DynamoDB resource handle."""
    dynamodb = boto3.resource(
        service_name='dynamodb',
        region_name='us-east-1',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
    )
    return dynamodb


def upsert_content(report):
    """Upsert one scraped report into the article_test table."""
    dynamodb = get_db_connection()
    table = dynamodb.Table('article_test')
    # Map the scraped report onto the table's item schema.
    item = {
        'id': str(report['id']),
        'site': report['site'],
        'title': report['title'],
        'category': "Macroeconomic Research",
        'author': report['author'],
        'content': report['content'],
        'publishDate': report['publishDate'],
        'link': report['url'],
        'attachment': report['reporturl'],
        'authorID': str(report['authorid']),
        'LastModifiedDate': datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    }
    response = table.put_item(Item=item)
    print(response)


reportList = []
i = 0
while i > -1:
    # Page through the Eastmoney macro-research report listing (JSONP endpoint).
    url = "https://reportapi.eastmoney.com/report/jg"
    params = {
        "cb": "datatable8544623",
        "pageSize": "100",
        # "beginTime": "2023-12-07",
        "beginTime": "2024-03-07",
        "endTime": "2024-03-07",
        "pageNo": i,
        "qType": "3",
    }
    url = url + "?" + "&".join(f"{key}={value}" for key, value in params.items())
    print(url)
    content = fetch_url(url)
    if content:
        # Strip the JSONP callback wrapper, e.g. datatable8544623({...}), to get raw JSON.
        start_index = content.find("(")
        if start_index != -1:
            result = content[start_index + 1: -1]
        else:
            result = content
        reportinfo = json.loads(result)
        if reportinfo["size"] > 0:
            i = i + 1
            for report in reportinfo['data']:
                try:
                    # Fetch the detail page for each report and pull out the body
                    # text and the PDF attachment link.
                    url = f"https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={report['encodeUrl']}"
                    req = urllib.request.urlopen(url)
                    text = req.read()
                    html_text = text.decode("utf-8")
                    page = etree.HTML(html_text)
                    content = encode(page.xpath("//div[contains(@class, 'ctx-content')]//p"))
                    reporturl = encode(page.xpath("//a[contains(@class, 'pdf-link')]/@href"))
                    report['url'] = url
                    # Prefer the short organisation name; fall back to the full name
                    # when the API returns the "''" placeholder.
                    if report['orgSName'] == "''":
                        report['site'] = translator.translate(report['orgName'], dest='en').text
                    else:
                        report['site'] = translator.translate(report['orgSName'], dest='en').text
                    report['reporturl'] = reporturl
                    report['title'] = translator.translate(report['title'], dest='en').text
                    report['author'] = translator.translate(report['researcher'], dest='en').text
                    report['content'] = translator.translate(content, dest='en').text
                    report['authorid'] = uuid.uuid5(uuid.NAMESPACE_OID, report['author'])
                    report['publishDate'] = datemodifier(report['publishDate'])
                    report['id'] = uuid.uuid5(uuid.NAMESPACE_OID, report['title'] + report['publishDate'])
                    print(report)
                    upsert_content(report)
                    reportList.append(report)
                except Exception as error:
                    print(error)
        else:
            # An empty page means we have walked past the last page; stop paging.
            print(reportinfo)
            i = -1
    else:
        print("Failed to fetch URL:", url)