OxbridgeEconomics committed on
Commit
548e659
·
unverified ·
1 Parent(s): 0c6d7b2

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +24 -24
utils.py CHANGED
@@ -71,30 +71,30 @@ def update_reference(report):
71
  )
72
  print(response)
73
 
74
- def download_files_from_s3(folder):
75
- """
76
- Downloads Parquet files from an S3 bucket and returns a concatenated DataFrame.
77
-
78
- Args:
79
- folder (str): The folder in the S3 bucket to download files from.
80
-
81
- Returns:
82
- pandas.DataFrame: A concatenated DataFrame containing the data from the downloaded Parquet files.
83
- """
84
- if not os.path.exists(folder):
85
- os.makedirs(folder)
86
- client = boto3.client(
87
- 's3',
88
- aws_access_key_id=AWS_ACCESS_KEY_ID,
89
- aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
90
- )
91
- response = client.list_objects_v2(Bucket='china-securities-report', Prefix=f"{folder}/")
92
- for obj in response['Contents']:
93
- key = obj['Key']
94
- if key.endswith('.parquet'):
95
- client.download_file('china-securities-report', key, key)
96
- file_paths = glob.glob(os.path.join(folder, '*.parquet'))
97
- return pd.concat([pd.read_parquet(file_path) for file_path in file_paths], ignore_index=True)
98
 
99
  def extract_from_pdf_by_pattern(url, pattern):
100
  """
 
71
  )
72
  print(response)
73
 
74
+ # def download_files_from_s3(folder):
75
+ # """
76
+ # Downloads Parquet files from an S3 bucket and returns a concatenated DataFrame.
77
+
78
+ # Args:
79
+ # folder (str): The folder in the S3 bucket to download files from.
80
+
81
+ # Returns:
82
+ # pandas.DataFrame: A concatenated DataFrame containing the data from the downloaded Parquet files.
83
+ # """
84
+ # if not os.path.exists(folder):
85
+ # os.makedirs(folder)
86
+ # client = boto3.client(
87
+ # 's3',
88
+ # aws_access_key_id=AWS_ACCESS_KEY_ID,
89
+ # aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
90
+ # )
91
+ # response = client.list_objects_v2(Bucket='china-securities-report', Prefix=f"{folder}/")
92
+ # for obj in response['Contents']:
93
+ # key = obj['Key']
94
+ # if key.endswith('.parquet'):
95
+ # client.download_file('china-securities-report', key, key)
96
+ # file_paths = glob.glob(os.path.join(folder, '*.parquet'))
97
+ # return pd.concat([pd.read_parquet(file_path) for file_path in file_paths], ignore_index=True)
98
 
99
  def extract_from_pdf_by_pattern(url, pattern):
100
  """