"""Major TOM metadata loading and thumbnail sampling helpers.

At import time this module loads the metadata tables of four Major TOM Core
datasets (S2L2A, S2L1C, S1RTC, DEM), keeps only rows with ``nodata == 0`` and
builds the collection of grid cells that are present in all four tables.
The functions below use those tables to fetch and crop thumbnails.
"""
import os
import random
from io import BytesIO

import pandas as pd
import spaces  # not referenced in this module; kept exactly as in the original file

import fsspec
import pyarrow.parquet as pq
from fsspec.parquet import open_parquet_file
from PIL import Image

# GLOBAL VARIABLES
# For every dataset: prefer a local copy under data/, otherwise fall back to
# the metadata file hosted on the Hugging Face Hub.
if os.path.isfile('data/s2l2a_metadata.parquet'):
    l2a_meta_path = 'data/s2l2a_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L2A'
    l2a_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/s2l1c_metadata.parquet'):
    l1c_meta_path = 'data/s2l1c_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L1C'
    l1c_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

# The existence check must test the same path that is subsequently loaded.
if os.path.isfile('data/s1rtc_metadata.parquet'):
    rtc_meta_path = 'data/s1rtc_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S1RTC'
    rtc_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

# Same here: check data/ because data/ is what gets read.
if os.path.isfile('data/dem_metadata.parquet'):
    dem_meta_path = 'data/dem_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-DEM'
    dem_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

print('Loading Major TOM meta...')
l2a_df = pd.read_parquet(l2a_meta_path)
l1c_df = pd.read_parquet(l1c_meta_path)
rtc_df = pd.read_parquet(rtc_meta_path)
dem_df = pd.read_parquet(dem_meta_path)

# skip files with missing parts
l2a_df = l2a_df[l2a_df.nodata == 0]
l1c_df = l1c_df[l1c_df.nodata == 0]
rtc_df = rtc_df[rtc_df.nodata == 0]
dem_df = dem_df[dem_df.nodata == 0]

# collect grid_cells, drop duplicates, and extract grid cell column only
grid_cell_df = l2a_df[
    l2a_df.grid_cell.isin(l1c_df.grid_cell)
    & l2a_df.grid_cell.isin(rtc_df.grid_cell)
    & l2a_df.grid_cell.isin(dem_df.grid_cell)
]
# Rebind the same name so the de-duplication actually takes effect; otherwise
# cells with several L2A rows would be over-represented when sampling.
grid_cell_df = grid_cell_df.drop_duplicates(subset=['grid_cell'])
grid_cell_df = grid_cell_df.grid_cell
print('[DONE]')


def row2image(row, fullrow_read=True):
    """
    Extracts the thumbnail image referenced by a metadata row.

    Args:
        row: A row object with the attributes 'parquet_url' (URL or path of the
            Parquet file) and 'parquet_row' (passed to
            ``ParquetFile.read_row_group`` as the row-group index).
        fullrow_read (bool, optional): Selects how the Parquet file is opened.
            Defaults to True.
            - If True, the file is opened generically with ``fsspec.open``.
            - If False, ``fsspec.parquet.open_parquet_file`` is used, restricted
              to the 'thumbnail' column.

    Returns:
        PIL.Image.Image: Image opened from the 'thumbnail' bytes of the
        selected row group.
    """
    parquet_row = row.parquet_row
    parquet_url = row.parquet_url

    if fullrow_read:
        # Option 1: generic file handle for the whole Parquet file
        source = fsspec.open(parquet_url)
    else:
        # Option 2: handle restricted to the 'thumbnail' column
        source = open_parquet_file(parquet_url, columns=["thumbnail"])

    # Both kinds of handle are context managers; entering them here guarantees
    # the underlying file/connection is closed, which ParquetFile alone does
    # not do for a file object it was handed.
    with source as fileobj, pq.ParquetFile(fileobj) as pf:
        row_group = pf.read_row_group(parquet_row, columns=['thumbnail'])

    # The table is already in memory, so it is safe to use after closing.
    stream = BytesIO(row_group['thumbnail'][0].as_py())
    return Image.open(stream)


# Example usage (assuming 'dem_df' is a Pandas DataFrame with the required structure):
# row2image(dem_df.iloc[1000])


def get_rows(grid_cell):
    """
    Retrieves the first row matching 'grid_cell' from each metadata DataFrame.

    Args:
        grid_cell: The value to look up in the 'grid_cell' column.

    Returns:
        tuple: The first matching row (a Pandas Series) from l2a_df, l1c_df,
        rtc_df and dem_df, in that order.

    Raises:
        IndexError: If any of the DataFrames has no row for 'grid_cell'.
    """
    # Order follows the documented contract: L2A, L1C, RTC, DEM.
    return (
        l2a_df[l2a_df.grid_cell == grid_cell].iloc[0],
        l1c_df[l1c_df.grid_cell == grid_cell].iloc[0],
        rtc_df[rtc_df.grid_cell == grid_cell].iloc[0],
        dem_df[dem_df.grid_cell == grid_cell].iloc[0],
    )


def get_images(grid_cell):
    """
    Retrieves the thumbnails belonging to a specific 'grid_cell'.

    Args:
        grid_cell: The grid cell identifier to fetch images for.

    Returns:
        list: One PIL.Image.Image per row returned by get_rows, in the same
        order.
    """
    return [row2image(row) for row in get_rows(grid_cell)]


def resize_and_crop(images, image_size=(1068, 1068), crop_size=(256, 256)):
    """
    Resizes every image and cuts the same randomly placed window out of each.

    Args:
        images (list): PIL.Image.Image objects to process.
        image_size (tuple, optional): Target (width, height) of the resize.
            Defaults to (1068, 1068).
        crop_size (tuple, optional): (width, height) of the crop window.
            Defaults to (256, 256).

    Returns:
        list: The resized and cropped PIL.Image.Image objects.

    Raises:
        ValueError: Raised by ``random.randint`` when 'crop_size' exceeds
            'image_size' in either dimension.
    """
    # The offset is drawn once, so all images share the same crop window.
    left = random.randint(0, image_size[0] - crop_size[0])
    top = random.randint(0, image_size[1] - crop_size[1])
    box = (left, top, left + crop_size[0], top + crop_size[1])

    return [img.resize(image_size).crop(box) for img in images]


def sample_shuffle():
    """
    Picks a random grid cell and returns its resized and cropped thumbnails.

    Returns:
        list: PIL.Image.Image objects produced by resize_and_crop (with its
        default sizes) from the images get_images returns for the sampled
        grid cell.
    """
    grid_cell = grid_cell_df.sample().iloc[0]
    return resize_and_crop(get_images(grid_cell))