Spaces:
Running
on
Zero
Running
on
Zero
import os | |
import pandas as pd | |
import spaces | |
# GLOBAL VARIABLES
# For every Major TOM Core dataset, prefer a local copy of the metadata index
# under data/ and fall back to the copy hosted on the Hugging Face Hub.
# The path tested with os.path.isfile must be the same path that is then used.
if os.path.isfile('data/s2l2a_metadata.parquet'):
    l2a_meta_path = 'data/s2l2a_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L2A'
    l2a_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

if os.path.isfile('data/s2l1c_metadata.parquet'):
    l1c_meta_path = 'data/s2l1c_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S2L1C'
    l1c_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

# Fixed: the check used '/s1rtc_metadata.parquet', so the local copy was never found.
if os.path.isfile('data/s1rtc_metadata.parquet'):
    rtc_meta_path = 'data/s1rtc_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-S1RTC'
    rtc_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

# Fixed: the check used 'helpers/dem_metadata.parquet' while the path used was under data/.
if os.path.isfile('data/dem_metadata.parquet'):
    dem_meta_path = 'data/dem_metadata.parquet'
else:
    DATASET_NAME = 'Major-TOM/Core-DEM'
    dem_meta_path = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'.format(DATASET_NAME)

print('Loading Major TOM meta...')
l2a_df = pd.read_parquet(l2a_meta_path)
l1c_df = pd.read_parquet(l1c_meta_path)
rtc_df = pd.read_parquet(rtc_meta_path)
dem_df = pd.read_parquet(dem_meta_path)

# skip files with missing parts (rows whose 'nodata' column is non-zero)
l2a_df = l2a_df[l2a_df.nodata == 0]
l1c_df = l1c_df[l1c_df.nodata == 0]
rtc_df = rtc_df[rtc_df.nodata == 0]
dem_df = dem_df[dem_df.nodata == 0]

# collect grid_cells, drop duplicates, and extract grid cell column only
# Keep only L2A rows whose grid cell is also present in L1C, RTC and DEM.
grid_cell_df = l2a_df[
    l2a_df.grid_cell.isin(l1c_df.grid_cell)
    & l2a_df.grid_cell.isin(rtc_df.grid_cell)
    & l2a_df.grid_cell.isin(dem_df.grid_cell)
]
# Fixed: the result used to be bound to the misspelled name 'gird_cell_df',
# so the de-duplication was silently discarded.
grid_cell_df = grid_cell_df.drop_duplicates(subset=['grid_cell'])
grid_cell_df = grid_cell_df.grid_cell
print('[DONE]')
import pyarrow.parquet as pq | |
import fsspec | |
from fsspec.parquet import open_parquet_file | |
from io import BytesIO | |
from PIL import Image | |
import random | |
def row2image(row, fullrow_read=True):
    """
    Load the thumbnail image referenced by one metadata row.

    Args:
        row: Object exposing 'parquet_url' (URL or path of the Parquet file)
            and 'parquet_row'. 'parquet_row' is passed to
            ParquetFile.read_row_group, i.e. it is used as a row-group index,
            and the first row of that group is taken.
            NOTE(review): presumably each row group holds exactly one sample,
            so the two coincide - confirm against the dataset layout.
        fullrow_read (bool, optional): Selects how the file is opened.
            Defaults to True.
            - If True, the file is opened through fsspec.open.
            - If False, fsspec.parquet.open_parquet_file is used, restricted to
              the 'thumbnail' column.
            In both cases only the 'thumbnail' column of the requested row
            group is read.

    Returns:
        PIL.Image.Image: Image opened from the bytes stored in the 'thumbnail'
        column.
    """
    parquet_row = row.parquet_row
    parquet_url = row.parquet_url

    if fullrow_read:
        # Option 1: generic file access; the file is opened on entering the
        # 'with' block below.
        opener = fsspec.open(parquet_url)
    else:
        # Option 2: open a file object restricted to the 'thumbnail' column
        opener = open_parquet_file(parquet_url, columns=["thumbnail"])

    # Both openers are context managers. Entering them explicitly guarantees
    # the underlying file object is closed even if reading fails; the previous
    # code never closed it.
    with opener as source, pq.ParquetFile(source) as pf:
        row_group = pf.read_row_group(parquet_row, columns=['thumbnail'])

    # The thumbnail bytes are already in memory, so the source can be closed
    # before PIL decodes them.
    stream = BytesIO(row_group['thumbnail'][0].as_py())
    return Image.open(stream)
# Example usage (assuming 'dem_df' is a Pandas DataFrame with the required structure): | |
# row2image(dem_df.iloc[1000]) | |
def get_rows(grid_cell):
    """
    Return the first metadata row matching 'grid_cell' from each source table.

    Args:
        grid_cell: Value compared against the 'grid_cell' column of each table.

    Returns:
        tuple: Four pandas Series, in the order
            (l2a_df row, l1c_df row, rtc_df row, dem_df row).
            The tables are the module-level DataFrames of the same names.

    Raises:
        IndexError: If any of the tables has no row for 'grid_cell'.
    """
    # Order matters: it must match the documented (L2A, L1C, RTC, DEM) contract.
    # The previous implementation returned the L1C row first.
    sources = (l2a_df, l1c_df, rtc_df, dem_df)
    return tuple(df[df.grid_cell == grid_cell].iloc[0] for df in sources)
def get_images(grid_cell):
    """
    Load one thumbnail per source table for the given grid cell.

    Args:
        grid_cell: Grid cell identifier forwarded to get_rows.

    Returns:
        list: PIL.Image.Image objects, one per row returned by get_rows and in
        the same order, each produced by row2image with its default arguments.
    """
    return [row2image(source_row) for source_row in get_rows(grid_cell)]
def resize_and_crop(images, image_size=(1068, 1068), crop_size=(256, 256)):
    """
    Resize every image to a common size and cut the same random window from each.

    A single crop window is drawn per call and applied to all images, so the
    returned crops cover the same pixel region of each resized image.

    Args:
        images (list): Objects providing PIL-style resize(size) and crop(box).
        image_size (tuple, optional): (width, height) every image is resized to.
            Defaults to (1068, 1068).
        crop_size (tuple, optional): (width, height) of the crop window.
            Defaults to (256, 256).

    Returns:
        list: One resized-then-cropped image per input image, in input order.
    """
    # Draw the horizontal offset first, then the vertical one.
    x_offset = random.randint(0, image_size[0] - crop_size[0])
    y_offset = random.randint(0, image_size[1] - crop_size[1])

    # (left, upper, right, lower) box shared by every image
    window = (
        x_offset,
        y_offset,
        x_offset + crop_size[0],
        y_offset + crop_size[1],
    )
    return [picture.resize(image_size).crop(window) for picture in images]
def sample_shuffle():
    """
    Pick a random grid cell and return its resized, cropped images.

    Takes no arguments. One entry is sampled from the module-level
    'grid_cell_df', the images for that cell are fetched with get_images, and
    the result is passed through resize_and_crop with its default sizes.

    Returns:
        list: The images returned by resize_and_crop for the sampled grid cell.
    """
    picked_cell = grid_cell_df.sample().iloc[0]
    cell_images = get_images(picked_cell)
    return resize_and_crop(cell_images)