OxbridgeEconomics committed
Commit 8925fd4 · 1 Parent(s): efcd6b8
Files changed (3):
  1. daily.py +3 -1
  2. patterns.json +380 -0
  3. utils.py +158 -24
daily.py CHANGED
@@ -14,7 +14,8 @@ from utils import (encode,
                     crawl,
                     datemodifier,
                     encode_content,
-                    update_content)
+                    update_content,
+                    extract_reference)
 
 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
@@ -161,6 +162,7 @@ def crawl_eastmoney(url, article):
     article['publishDate'] = datemodifier(article['publishDate'], xpath_dict[domain]['datetime_format'])
     article['id'] = uuid.uuid5(uuid.NAMESPACE_OID, article['titleCN']+article['publishDate'])
     article['sentimentScore'], article['sentimentLabel'] = sentiment_computation(contentCN.replace("\n",""))
+    extract_reference(article)
     update_content(article)
 
 today = datetime.today().strftime('%Y-%m-%d')
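The extract_reference call is deliberately placed after the article's id and sentiment fields are filled in and before update_content(article) persists the record. A hedged sketch of what the article dict needs to carry at that point, inferred from the row[...] accesses in utils.py below rather than documented anywhere in this commit (note that extract_reference also reads row['id_x'], a pandas merge-suffixed column, so the dict path may depend on fields not shown here):

    # Hypothetical article dict at the point of the new call; the field names
    # are taken from this commit's diffs, the values are invented.
    article = {
        'titleCN': '宏观点评',
        'publishDate': '2024-01-01',
        'site': 'Guosen Securities Co., Ltd.',      # must match a patterns.json "site"
        'attachment': 'https://example.com/r.pdf',  # PDF that gets scanned for citations
    }
    extract_reference(article)   # resolve reports cited in the attached PDF
    update_content(article)      # then upsert the article itself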
patterns.json ADDED
@@ -0,0 +1,380 @@
+[
+  {
+    "site": "Guosen Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d",
+    "split": [
+      {
+        "string": "-",
+        "index": -1
+      }
+    ]
+  },
+  {
+    "site": "Soochow Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 2,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
+    "date_format": "%Y-%m-%d",
+    "split": [
+      {
+        "string": "-",
+        "index": 0
+      },
+      {
+        "string": "—",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "BOCI Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{6}|20\\d{5}\\s{1}\\d{1}",
+    "date_format": "%Y%m%d"
+  },
+  {
+    "site": "Tianfeng Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 3,
+    "keyword": "相关报告",
+    "article_regex": " 《(.*?)》",
+    "date_regex": "\\b\\d{4}-\\d{2}-\\d{2}|\\d{4} -\\d{2}-\\d{2}\\b",
+    "date_format": "%Y-%m-%d",
+    "remove": ["宏观报告:", "宏观-", "宏观报告-", "——"],
+    "split": [
+      {
+        "string": ":",
+        "index": 1
+      },
+      {
+        "string": "-",
+        "index": 0
+      },
+      {
+        "string": "(",
+        "index": 1
+      }
+    ]
+  },
+  {
+    "site": "Kaiyuan Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": " ",
+    "article_regex": " 《(.*?)》",
+    "date_regex": "\\b\\d{4}\\.\\d{1,2}\\.\\d{1,2}\\b",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": "—",
+        "index": 1
+      }
+    ]
+  },
+  {
+    "site": "Huafu Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 4,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}\\s?\\.\\s?\\d{1}\\s?\\d{1}\\s?\\.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": ":",
+        "index": 1
+      },
+      {
+        "string": "——",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Minsheng Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "\\.(.*?)\\-",
+    "date_regex": "20\\d{2}\\/\\d{2}\\/\\d{2}",
+    "date_format": "%Y/%m/%d",
+    "split": [
+      {
+        "string": ":",
+        "index": 1
+      }
+    ]
+  },
+  {
+    "site": "Guolian Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告 ",
+    "article_regex": "《(.*?)》",
+    "date_regex": "[》 ]20\\d{2}\\.\\d{2}\\.\\d{2}",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": ":",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Southwest Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "\\.(.*?)\\(",
+    "date_regex": "(20\\d{2}\\s?-\\d{2}\\-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Guangdong Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "近期报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}\\s?-\\d{2}\\-\\d{2}",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "China Post Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "近期研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}\\s?.\\d{2}\\.\\d{2}",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": "-",
+        "index": 1
+      },
+      {
+        "string": "——",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Shanxi Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": " ",
+    "article_regex": "】(.*?)\\(",
+    "date_regex": "20\\d{2}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2} ",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Shanghai Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "Table_Rep",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}年\\d{2}月\\d{2}",
+    "date_format": "%Y年%m月%d"
+  },
+  {
+    "site": "Guoyuan Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 2,
+    "keyword": "[Table_Report]",
+    "article_regex": "《(.*?)》 ",
+    "date_regex": " 20\\d{2}.\\d{2}.\\d{2} ",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": ":",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Mago Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2} ",
+    "date_format": "%Y.%m.%d",
+    "split": [
+      {
+        "string": "(",
+        "index": 0
+      }
+    ]
+  },
+  {
+    "site": "Fed Securities, Inc.",
+    "pages": [0],
+    "date_range": 3,
+    "keyword": "相关报告",
+    "article_regex": ":(.*?)20",
+    "date_regex": "20\\d{2}\\s?.\\s?\\d{2}\\s?.\\s?\\d{2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Huabao Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Ruitingdog (Shenzhen) Information Technology Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "近期研究",
+    "article_regex": ":(.*?)-",
+    "date_regex": "\\d{4}\\s?/\\s?\\d{1,2}\\s?/\\s?\\d{1,2}",
+    "date_format": "%Y/%m/%d"
+  },
+  {
+    "site": "Oriental Fortune Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Yongxing Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告:",
+    "article_regex": "《(.*?)》",
+    "date_regex": "—— \\d{4}\\s?年\\s?\\d{1,2}\\s?月\\s?\\d{1,2}",
+    "date_format": "——%Y年%m月%d"
+  },
+  {
+    "site": "Minmetals Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?/\\d{2}/\\d{2})",
+    "date_format": "%Y/%m/%d"
+  },
+  {
+    "site": "Hualong Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关阅读",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Hebei Yuanda Information Technology Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告:",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Huaxin Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Far East Credit Rating Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "1.",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "Beijing Tengjing Big Data Application Technology Research Institute",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Wanhe Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Centaline Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Tengjing Digital Research",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Guoyuan Securities",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关研究报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
+    "date_format": "%Y.%m.%d"
+  },
+  {
+    "site": "China Galaxy Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "Shengang Securities Co., Ltd.",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  },
+  {
+    "site": "SDIC Anxin Futures",
+    "pages": [0],
+    "date_range": 1,
+    "keyword": "相关报告",
+    "article_regex": "《(.*?)》",
+    "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
+    "date_format": "%Y-%m-%d"
+  }
+]
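Each entry describes how one publisher formats the "related reports" block of its PDFs: "pages" lists the zero-based pages to scan, "keyword" is the heading that marks the start of the reference list, "article_regex" and "date_regex" capture cited titles and dates, "date_format" is the strptime format for those dates, "date_range" widens date matching by that many days in both directions, and the optional "remove" and "split" rules strip boilerplate from captured titles. A minimal standalone sketch of how one entry is applied, with an invented sample text (the real driver is extract_reference in utils.py below):

    import re
    from datetime import datetime

    pattern = {                                   # abridged Guosen-style entry
        "keyword": "相关研究报告",
        "article_regex": "《(.*?)》",
        "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
        "date_format": "%Y-%m-%d"
    }
    page_text = "……相关研究报告《宏观周报》2023-05-12《通胀观察》2023-05-10"
    block = page_text.split(pattern["keyword"], 1)[1]      # text after the heading
    titles = re.findall(pattern["article_regex"], block)   # ['宏观周报', '通胀观察']
    dates = [datetime.strptime(d, pattern["date_format"])
             for d in re.findall(pattern["date_regex"], block)]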
utils.py CHANGED
@@ -1,12 +1,15 @@
 """Utility Functions"""
 import os
+import re
 import json
 import uuid
 import time
+import glob
 import urllib.request
 from urllib.parse import urlparse
-from datetime import datetime
+from datetime import datetime, timedelta
 from decimal import Decimal
+import pandas as pd
 import requests
 import boto3
 from lxml import etree
 
@@ -26,6 +29,136 @@ translator = Translator()
 with open('xpath.json', 'r', encoding='UTF-8') as f:
     xpath_dict = json.load(f)
 
+with open('patterns.json', 'r', encoding='UTF-8') as f:
+    patterns = json.load(f)
+
+def get_client_connection():
+    """Get dynamoDB connection"""
+    dynamodb = boto3.client(
+        service_name='dynamodb',
+        region_name='us-east-1',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+    )
+    return dynamodb
+
+def update_reference(report):
+    """Upsert one source-to-reference link into the reference_china table."""
+    dynamodb = get_client_connection()
+    response = dynamodb.update_item(
+        TableName="reference_china",
+        Key={
+            'id': {'S': str(report['refID'])},
+            'sourceID': {'S': report['sourceID']}
+        },
+        UpdateExpression='SET link = :link, referenceID = :referenceID, LastModifiedDate = :LastModifiedDate',
+        ExpressionAttributeValues={
+            ':link': {'S': report['link']},
+            ':referenceID': {'S': report['referenceID']},
+            ':LastModifiedDate': {'S': datetime.now().strftime("%Y-%m-%dT%H:%M:%S")},
+        }
+    )
+    print(response)
+
+def download_files_from_s3(folder):
+    """Download Data Files"""
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    client = boto3.client(
+        's3',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    )
+    response = client.list_objects_v2(Bucket='china-securities-report', Prefix=f"{folder}/")
+    for obj in response['Contents']:
+        key = obj['Key']
+        if key.endswith('.parquet'):
+            client.download_file('china-securities-report', key, key)
+    file_paths = glob.glob(os.path.join(folder, '*.parquet'))
+    return pd.concat([pd.read_parquet(file_path) for file_path in file_paths], ignore_index=True)
+
+def extract_from_pdf_by_pattern(url, pattern):
+    """Download a PDF and extract text from the pages the pattern names."""
+    # Send a GET request to the URL and retrieve the PDF content
+    try:
+        response = requests.get(url, timeout=60)
+        pdf_content = response.content
+        # Save the PDF content to a local file
+        with open("downloaded_file.pdf", "wb") as file:
+            file.write(pdf_content)
+
+        # Open the downloaded PDF file and extract the text
+        with open("downloaded_file.pdf", "rb") as file:
+            pdf_reader = PdfReader(file)
+            extracted_text = ""
+            if 'pages' in pattern:
+                pages = pattern['pages']
+            else:
+                pages = range(len(pdf_reader.pages))  # default: scan every page
+            for page in pages:
+                text = pdf_reader.pages[page].extract_text()
+                if 'keyword' in pattern and pattern['keyword'] in text:
+                    text = text.split(pattern['keyword'], 1)[1].strip()
+                else:
+                    text = text.strip()
+                extracted_text += text
+    except:
+        extracted_text = ''
+    # Shield newlines after sentence-ending punctuation, flatten the rest, then restore them
+    return extracted_text.replace('?\n', '?-\n').replace('!\n', '!-\n').replace('。\n', '。-\n').replace('\n', ' ').replace('?-', '?\n').replace('!-', '!\n').replace('。-', '。\n')
+
+def get_reference_by_regex(pattern, text):
+    return re.findall(pattern, text)
+
+def isnot_substring(list_a, string_to_check):
+    """Return True when none of the strings in list_a occur in string_to_check."""
+    for s in list_a:
+        if s in string_to_check:
+            return False
+    return True
+
+def extract_reference(row):
+    """Match reports cited in row's attachment PDF against the crawled data."""
+    pattern = next((elem for elem in patterns if elem['site'] == row['site']), None)
+    if pattern is None:  # no extraction rules for this site
+        return
+    extracted_text = extract_from_pdf_by_pattern(row['attachment'], pattern)
+    reference_titles = re.findall(pattern['article_regex'], extracted_text)
+    reference_dates = re.findall(pattern['date_regex'], extracted_text)
+    reference_titles = [s.replace(' ', '') for s in reference_titles]
+    reference_dates = [s.replace(' ', '') for s in reference_dates]
+    if 'remove' in pattern:
+        for remove_string in pattern['remove']:
+            reference_titles = [s.replace(remove_string, '') for s in reference_titles]
+    for title, date in zip(reference_titles, reference_dates):
+        try:
+            date = datetime.strptime(date, pattern['date_format'])
+        except:
+            date = datetime(2006, 1, 1)  # sentinel for unparseable dates
+        dates = []
+        if 'date_range' in pattern:
+            for i in range(pattern['date_range'] + 1):
+                dates.append((date + timedelta(days=i)).strftime('%Y-%m-%d'))
+                dates.append((date - timedelta(days=i)).strftime('%Y-%m-%d'))
+        dates.append(date.strftime('%Y-%m-%d'))
+        date = date.strftime('%Y-%m-%d')
+        if 'split' in pattern:
+            for split_item in pattern['split']:
+                if 'exceptional_string' in split_item:
+                    if split_item['string'] in title and isnot_substring(split_item['exceptional_string'], title):
+                        title = re.split(split_item['string'], title)[split_item['index']]
+                else:
+                    if split_item['string'] in title:
+                        title = title.split(split_item['string'])[split_item['index']]
+        reference_df = data[(data['titleCN'].str.contains(title)) & (data['site'] == row['site']) & (data['publishdate'].isin(dates))]
+        if len(reference_df) == 0:
+            print("------------ = 0 ------------")
+            print(date, repr(title))
+        elif len(reference_df) > 1:
+            print("------------ > 1 ------------")
+            print(date, repr(title))
+        else:
+            print("------------ = 1 ------------")
+            row['referenceID'] = reference_df.iloc[0]['id']
+            row['link'] = reference_df.iloc[0]['link']
+            row['sourceID'] = row['id_x']
+            row['refID'] = uuid.uuid5(uuid.NAMESPACE_OID, str(row['sourceID'])+str(row['referenceID']))
+            print(date, repr(title), row['sourceID'], row['referenceID'])
+            # update_reference(row)
+
 def translate(text):
     return translator.translate(text, dest='en').text
 
 
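One detail of extract_reference worth making concrete is the date tolerance: a date parsed out of a PDF sidebar is expanded into a window of plus or minus date_range calendar days before being compared with the crawled articles' publishdate column, so a sidebar date that is off by a day or two still matches. A standalone trace of that loop with invented values:

    from datetime import datetime, timedelta

    date = datetime(2023, 5, 12)
    date_range = 2                      # e.g. the Soochow entry in patterns.json
    dates = []
    for i in range(date_range + 1):
        dates.append((date + timedelta(days=i)).strftime('%Y-%m-%d'))
        dates.append((date - timedelta(days=i)).strftime('%Y-%m-%d'))
    dates.append(date.strftime('%Y-%m-%d'))
    # dates now covers 2023-05-10 through 2023-05-14 (with harmless duplicates),
    # the window that publishdate is tested against via .isin(dates)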
@@ -38,7 +171,7 @@ def datemodifier(date_string, date_format):
     return False
 
 def fetch_url(url):
-    response = requests.get(url)
+    response = requests.get(url, timeout=60)
     if response.status_code == 200:
         return response.text
     else:
 
@@ -78,29 +211,29 @@ def encode_content(content):
         else:
             line = element
         if line != '':
-            line = line + '\n'
-            text += line
+            line = line + '\n'
+            text += line
     index = text.find('打印本页')
     if index != -1:
-        text = text[:index]
+        text = text[:index]
     try:
-        summary = '\n'.join(text.split('\n')[:2])
+        summary = '\n'.join(text.split('\n')[:2])
     except:
-        summary = text
+        summary = text
     return text, summary
 
 def extract_from_pdf(url):
     # Send a GET request to the URL and retrieve the PDF content
-    response = requests.get(url)
+    response = requests.get(url, timeout=60)
     pdf_content = response.content
 
     # Save the PDF content to a local file
-    with open("downloaded_file.pdf", "wb") as f:
-        f.write(pdf_content)
+    with open("downloaded_file.pdf", "wb") as file:
+        file.write(pdf_content)
 
     # Open the downloaded PDF file and extract the text
-    with open("downloaded_file.pdf", "rb") as f:
-        pdf_reader = PdfReader(f)
+    with open("downloaded_file.pdf", "rb") as file:
+        pdf_reader = PdfReader(file)
     num_pages = len(pdf_reader.pages)
     extracted_text = ""
     for page in range(num_pages):
 
@@ -213,19 +346,19 @@ def upsert_content(report):
     response = table.put_item(Item=item)
     print(response)
 
-def get_client_connection():
-    """Get dynamoDB connection"""
-    dynamodb = boto3.client(
-        service_name='dynamodb',
-        region_name='us-east-1',
-        aws_access_key_id=AWS_ACCESS_KEY_ID,
-        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
-    )
-    return dynamodb
+# def get_client_connection():
+#     """Get dynamoDB connection"""
+#     dynamodb = boto3.client(
+#         service_name='dynamodb',
+#         region_name='us-east-1',
+#         aws_access_key_id=AWS_ACCESS_KEY_ID,
+#         aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+#     )
+#     return dynamodb
 
 def delete_records(item):
-    dynamodb_client = get_client_connection()
-    dynamodb_client.delete_item(
+    dynamodb_client = get_client_connection()
+    dynamodb_client.delete_item(
         TableName="article_test",
         Key={
             'id': {'S': item['id']},
 
@@ -275,4 +408,5 @@ def update_content_sentiment(report):
         }
     )
     print(response)
-
+
+data = download_files_from_s3('data')
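Note the new module-level side effect: importing utils now downloads every .parquet object under the data/ prefix of the china-securities-report bucket and concatenates them into the module-level DataFrame named data that extract_reference queries. A hedged sketch of the intended batch flow, assuming the caller iterates a DataFrame of source reports that was merged with another table (the 'id_x' column read by extract_reference looks like a pandas merge suffix, which is an inference, not something this commit documents):

    import pandas as pd
    from utils import extract_reference   # triggers the S3 download at import time

    reports = pd.read_parquet('reports.parquet')   # hypothetical merged input
    for _, row in reports.iterrows():
        extract_reference(row)   # prints =0 / =1 / >1 match diagnostics;
                                 # update_reference(row) is still commented out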