Spaces:

mikonvergence
/

COP-GEN-Beta

Running on Zero

mikonvergence

spaces fix

acfd194 10 days ago

6.48 kB

	import os
	import pandas as pd
	import spaces

	# GLOBAL VARIABLES
	_META_URL_TEMPLATE = 'https://huggingface.co/datasets/{}/resolve/main/metadata.parquet'


	def _resolve_meta_path(local_path, dataset_name):
	    """Return `local_path` if that file exists, otherwise the Hugging Face URL
	    of `dataset_name`'s metadata.parquet."""
	    # The existence check and the returned path are the same string on purpose:
	    # checking one location and loading from another either fails on load or
	    # silently ignores a local copy.
	    if os.path.isfile(local_path):
	        return local_path
	    return _META_URL_TEMPLATE.format(dataset_name)


	l2a_meta_path = _resolve_meta_path('data/s2l2a_metadata.parquet', 'Major-TOM/Core-S2L2A')
	l1c_meta_path = _resolve_meta_path('data/s2l1c_metadata.parquet', 'Major-TOM/Core-S2L1C')
	rtc_meta_path = _resolve_meta_path('data/s1rtc_metadata.parquet', 'Major-TOM/Core-S1RTC')
	dem_meta_path = _resolve_meta_path('data/dem_metadata.parquet', 'Major-TOM/Core-DEM')

	print('Loading Major TOM meta...')
	l2a_df = pd.read_parquet(l2a_meta_path)
	l1c_df = pd.read_parquet(l1c_meta_path)
	rtc_df = pd.read_parquet(rtc_meta_path)
	dem_df = pd.read_parquet(dem_meta_path)

	# skip files with missing parts: keep only rows whose `nodata` value is 0
	l2a_df = l2a_df[l2a_df.nodata == 0]
	l1c_df = l1c_df[l1c_df.nodata == 0]
	rtc_df = rtc_df[rtc_df.nodata == 0]
	dem_df = dem_df[dem_df.nodata == 0]

	# collect grid_cells, drop duplicates, and extract grid cell column only
	# (an L2A row is kept only if its grid cell also appears in L1C, RTC and DEM)
	_in_all_sources = (
	    l2a_df.grid_cell.isin(l1c_df.grid_cell)
	    & l2a_df.grid_cell.isin(rtc_df.grid_cell)
	    & l2a_df.grid_cell.isin(dem_df.grid_cell)
	)
	grid_cell_df = l2a_df[_in_all_sources]
	# Assign the de-duplicated frame back to grid_cell_df so each grid cell is
	# listed once and random sampling is uniform over cells.
	grid_cell_df = grid_cell_df.drop_duplicates(subset=['grid_cell'])
	grid_cell_df = grid_cell_df.grid_cell
	print('[DONE]')

	import pyarrow.parquet as pq
	import fsspec
	from fsspec.parquet import open_parquet_file
	from io import BytesIO
	from PIL import Image
	import random

	def row2image(row, fullrow_read=True):
	    """
	    Load the thumbnail image stored for one metadata row.

	    Args:
	        row: A row object with attributes 'parquet_url' (the URL or path to the
	            Parquet file) and 'parquet_row' (the index passed to
	            ``read_row_group``; this presumes one sample per row group in the
	            file -- TODO confirm against the dataset layout).
	        fullrow_read (bool, optional): Selects how the Parquet file is opened.
	            Defaults to True.
	            - If True, the file is opened with ``fsspec.open``.
	            - If False, it is opened with ``fsspec.parquet.open_parquet_file``
	              restricted to the 'thumbnail' column.
	            In both cases only the 'thumbnail' column of the selected row
	            group is read from the file.

	    Returns:
	        PIL.Image.Image: An Image object loaded from the 'thumbnail' bytes of
	        the selected row group.
	    """
	    parquet_row = row.parquet_row
	    parquet_url = row.parquet_url

	    if fullrow_read:
	        # Option 1: generic file handle over the whole Parquet file
	        source = fsspec.open(parquet_url)
	    else:
	        # Option 2: handle that only targets the 'thumbnail' column
	        source = open_parquet_file(parquet_url, columns=["thumbnail"])

	    # ParquetFile does not close a file object it was handed, so the handle is
	    # managed explicitly here; otherwise every call leaks an open file/connection.
	    with source as handle, pq.ParquetFile(handle) as pf:
	        row_group = pf.read_row_group(parquet_row, columns=['thumbnail'])

	    # The row group is already materialised in memory, so decoding can happen
	    # after the file has been closed.
	    thumbnail_bytes = row_group['thumbnail'][0].as_py()
	    return Image.open(BytesIO(thumbnail_bytes))

	# Example usage (assuming 'dem_df' is a Pandas DataFrame with the required structure):
	# row2image(dem_df.iloc[1000])

	def get_rows(grid_cell):
	    """
	    Look up the first metadata row for `grid_cell` in each source DataFrame.

	    Args:
	        grid_cell: Value compared against the 'grid_cell' column of each DataFrame.

	    Returns:
	        tuple: Four Pandas Series, in this order: the first matching row of
	        l1c_df, l2a_df, rtc_df and dem_df (module-level DataFrames).

	    Raises:
	        IndexError: If any of the DataFrames has no row for `grid_cell`.
	    """
	    def first_match(frame):
	        # Boolean-mask the frame on grid_cell and take the first hit.
	        return frame[frame.grid_cell == grid_cell].iloc[0]

	    l1c_row = first_match(l1c_df)
	    l2a_row = first_match(l2a_df)
	    rtc_row = first_match(rtc_df)
	    dem_row = first_match(dem_df)
	    return l1c_row, l2a_row, rtc_row, dem_row

	def get_images(grid_cell):
	    """
	    Fetch one image per source for the given grid cell.

	    Args:
	        grid_cell: The grid cell identifier forwarded to get_rows.

	    Returns:
	        list: The results of row2image for each row returned by
	        get_rows(grid_cell), in the same order.
	    """
	    return [row2image(source_row) for source_row in get_rows(grid_cell)]

	def resize_and_crop(images, image_size=(1068, 1068), crop_size=(256, 256)):
	    """
	    Resize every image to `image_size`, then cut the same random window from each.

	    The crop window is drawn once per call, so all returned images share the
	    same crop position.

	    Args:
	        images (list): Objects exposing ``resize(size)`` and ``crop(box)``
	            (e.g. PIL.Image.Image).
	        image_size (tuple, optional): Target (width, height) passed to ``resize``.
	            Defaults to (1068, 1068).
	        crop_size (tuple, optional): (width, height) of the crop window.
	            Defaults to (256, 256).

	    Returns:
	        list: One resized-then-cropped image per input image, in input order.
	    """
	    # Draw the top-left corner first (x, then y) so the window fits inside image_size.
	    x0 = random.randint(0, image_size[0] - crop_size[0])
	    y0 = random.randint(0, image_size[1] - crop_size[1])
	    box = (x0, y0, x0 + crop_size[0], y0 + crop_size[1])

	    cropped = []
	    for image in images:
	        resized = image.resize(image_size)
	        cropped.append(resized.crop(box))
	    return cropped

	def sample_shuffle():
	    """
	    Pick a random grid cell and return its images, resized and randomly cropped.

	    Takes no arguments. One entry is sampled from the module-level
	    grid_cell_df, its images are loaded with get_images, and the result is
	    passed through resize_and_crop with that function's default sizes.

	    Returns:
	        list: The list returned by resize_and_crop for the sampled grid cell.
	    """
	    sampled = grid_cell_df.sample()
	    grid_cell = sampled.iloc[0]
	    images = get_images(grid_cell)
	    return resize_and_crop(images)