Spaces:

Oxbridge-Economics
/

Data-Collection-China

Running

OxbridgeEconomics committed on Jul 26, 2024

Commit

548e659

unverified ·

1 Parent(s): 0c6d7b2

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -71,30 +71,30 @@ def update_reference(report):
             )
     print(response)
-def download_files_from_s3(folder):
-    """
-    Downloads Parquet files from an S3 bucket and returns a concatenated DataFrame.
-    Args:
-        folder (str): The folder in the S3 bucket to download files from.
-    Returns:
-        pandas.DataFrame: A concatenated DataFrame containing the data from the downloaded Parquet files.
-    """
-    if not os.path.exists(folder):
-        os.makedirs(folder)
-    client = boto3.client(
-        's3',
-        aws_access_key_id=AWS_ACCESS_KEY_ID,
-        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
-    )
-    response = client.list_objects_v2(Bucket='china-securities-report', Prefix=f"{folder}/")
-    for obj in response['Contents']:
-        key = obj['Key']
-        if key.endswith('.parquet'):
-            client.download_file('china-securities-report', key, key)
-    file_paths = glob.glob(os.path.join(folder, '*.parquet'))
-    return pd.concat([pd.read_parquet(file_path) for file_path in file_paths], ignore_index=True)
 def extract_from_pdf_by_pattern(url, pattern):
     """

             )
     print(response)
+# def download_files_from_s3(folder):
+#     """
+#     Downloads Parquet files from an S3 bucket and returns a concatenated DataFrame.
+#     Args:
+#         folder (str): The folder in the S3 bucket to download files from.
+#     Returns:
+#         pandas.DataFrame: A concatenated DataFrame containing the data from the downloaded Parquet files.
+#     """
+#     if not os.path.exists(folder):
+#         os.makedirs(folder)
+#     client = boto3.client(
+#         's3',
+#         aws_access_key_id=AWS_ACCESS_KEY_ID,
+#         aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+#     )
+#     response = client.list_objects_v2(Bucket='china-securities-report', Prefix=f"{folder}/")
+#     for obj in response['Contents']:
+#         key = obj['Key']
+#         if key.endswith('.parquet'):
+#             client.download_file('china-securities-report', key, key)
+#     file_paths = glob.glob(os.path.join(folder, '*.parquet'))
+#     return pd.concat([pd.read_parquet(file_path) for file_path in file_paths], ignore_index=True)
 def extract_from_pdf_by_pattern(url, pattern):
     """