|
/****************************************************************************** |
|
* Copyright (c) 2011, Duane Merrill. All rights reserved. |
|
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. |
|
* |
|
* Redistribution and use in source and binary forms, with or without |
|
* modification, are permitted provided that the following conditions are met: |
|
* * Redistributions of source code must retain the above copyright |
|
* notice, this list of conditions and the following disclaimer. |
|
* * Redistributions in binary form must reproduce the above copyright |
|
* notice, this list of conditions and the following disclaimer in the |
|
* documentation and/or other materials provided with the distribution. |
|
* * Neither the name of the NVIDIA CORPORATION nor the |
|
* names of its contributors may be used to endorse or promote products |
|
* derived from this software without specific prior written permission. |
|
* |
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
* |
|
******************************************************************************/ |
|
|
|
/****************************************************************************** |
|
* Test evaluation for caching allocator of device memory |
|
******************************************************************************/ |
|
|
|
// Ensure printing of CUDA runtime errors to console |
|
#define CUB_STDERR |
|
|
|
#include <stdio.h> |
|
|
|
#include <cub/util_allocator.cuh> |
|
#include "test_util.h" |
|
|
|
using namespace cub; |
|
|
|
|
|
//--------------------------------------------------------------------- |
|
// Main |
|
//--------------------------------------------------------------------- |
|
|
|
/** |
|
* Main |
|
*/ |
|
/**
 * Main: exercises cub::CachingDeviceAllocator through a fixed sequence of
 * allocate/free steps (Tests 0-9) with assertions on live/cached block
 * counts, then benchmarks allocation throughput against raw
 * cudaMalloc/cudaFree on both the CPU and GPU timelines.
 *
 * Command line:
 *   --device=<device-id>       device to run on
 *   --bytes=<timing bytes>     allocation size used in the timing loops
 *   --i=<timing iterations>    number of alloc/free timing iterations
 */
int main(int argc, char** argv)
{
    // Initialize command line
    CommandLineArgs args(argc, argv);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--device=<device-id>]"
            "[--bytes=<timing bytes>]"
            "[--i=<timing iterations>]"
            "\n", argv[0]);
        exit(0);
    }

#if (CUB_PTX_ARCH == 0)
    // Host-compilation pass only: the test body must not be compiled for device.

    // Initialize device
    CubDebugExit(args.DeviceInit());

    // Get number of GPUs and current GPU
    int num_gpus;
    int initial_gpu;
    int timing_iterations   = 10000;
    int timing_bytes        = 1024 * 1024;

    if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
    if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
    args.GetCmdLineArgument("i", timing_iterations);
    args.GetCmdLineArgument("bytes", timing_bytes);

    // Create default allocator (caches up to 6MB in device allocations per GPU)
    CachingDeviceAllocator allocator;
    allocator.debug = true;

    printf("Running single-gpu tests...\n"); fflush(stdout);

    //
    // Test0: stream-association of cached blocks
    //

    // Create a new stream
    cudaStream_t other_stream;
    CubDebugExit(cudaStreamCreate(&other_stream));

    // Allocate 999 bytes on the current gpu in stream0
    char *d_999B_stream0_a;
    char *d_999B_stream0_b;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));

    // Run some big kernel in stream 0
    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();

    // Free d_999B_stream0_a
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));

    // Allocate another 999 bytes in stream 0
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we have no cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Run some big kernel in stream 0
    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();

    // Free d_999B_stream0_b
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));

    // Allocate 999 bytes on the current gpu in other_stream
    char *d_999B_stream_other_a;
    char *d_999B_stream_other_b;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));

    // Check that we have 1 live block on the initial GPU (a new one was allocated
    // because d_999B_stream0_b is only available for stream 0 until it becomes idle)
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we have one cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 1);

    // Run some big kernel in other_stream
    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();

    // Free d_999B_stream_other
    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));

    // Check that we can now use both allocations in stream 0 after synchronizing the device
    CubDebugExit(cudaDeviceSynchronize());
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));

    // Check that we have 2 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 2);

    // Check that we have no cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Free d_999B_stream0_a and d_999B_stream0_b
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));

    // Check that we can now use both allocations in other_stream
    CubDebugExit(cudaDeviceSynchronize());
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream));

    // Check that we have 2 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 2);

    // Check that we have no cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Run some big kernel in other_stream
    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();

    // Free d_999B_stream_other_a and d_999B_stream_other_b
    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b));

    // Check that we can now use both allocations in stream 0 after synchronizing
    // the device and destroying the other stream
    CubDebugExit(cudaDeviceSynchronize());
    CubDebugExit(cudaStreamDestroy(other_stream));
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));

    // Check that we have 2 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 2);

    // Check that we have no cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Free d_999B_stream0_a and d_999B_stream0_b
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));

    // Free all cached
    CubDebugExit(allocator.FreeAllCached());

    //
    // Test1: small allocation is live, nothing cached yet
    //

    // Allocate 5 bytes on the current gpu
    char *d_5B;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5));

    // Check that we have zero free bytes cached on the initial GPU
    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    //
    // Test2: second concurrent allocation
    //

    // Allocate 4096 bytes on the current gpu
    char *d_4096B;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096));

    // Check that we have 2 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 2);

    //
    // Test3: freeing caches the block (rounded up to min_bin_bytes)
    //

    // DeviceFree d_5B
    CubDebugExit(allocator.DeviceFree(d_5B));

    // Check that we have min_bin_bytes free bytes cached on the initial gpu
    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we have 1 cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 1);

    //
    // Test4: freeing the second block adds to the cache
    //

    // DeviceFree d_4096B
    CubDebugExit(allocator.DeviceFree(d_4096B));

    // Check that we have the 4096 + min_bin free bytes cached on the initial gpu
    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096);

    // Check that we have 0 live blocks on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 0);

    // Check that we have 2 cached blocks on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 2);

    //
    // Test5: allocation reuses a cached block of sufficient size
    //

    // Allocate 768 bytes on the current gpu
    char *d_768B;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768));

    // Check that we have the min_bin free bytes cached on the initial gpu (4096 was reused)
    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we have 1 cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 1);

    //
    // Test6: freeing a block that would exceed max_cached_bytes is not cached
    //

    // Allocate max_cached_bytes on the current gpu
    char *d_max_cached;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes));

    // DeviceFree d_max_cached
    CubDebugExit(allocator.DeviceFree(d_max_cached));

    // Check that we have the min_bin free bytes cached on the initial gpu
    // (max cached was not returned because we went over)
    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);

    // Check that we have 1 live block on the initial GPU
    AssertEquals(allocator.live_blocks.size(), 1);

    // Check that we still have 1 cached block on the initial GPU
    AssertEquals(allocator.cached_blocks.size(), 1);

    //
    // Test7: FreeAllCached drops cached blocks but not live ones
    //

    // Free all cached blocks on all GPUs
    CubDebugExit(allocator.FreeAllCached());

    // Check that we have 0 bytes cached on the initial GPU
    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);

    // Check that we have 0 cached blocks across all GPUs
    AssertEquals(allocator.cached_blocks.size(), 0);

    // Check that we still have 1 live block across all GPUs
    AssertEquals(allocator.live_blocks.size(), 1);

    //
    // Test8: oversized allocations bypass the cache; freed bins are rounded
    //

    // Allocate max cached bytes + 1 on the current gpu
    char *d_max_cached_plus;
    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1));

    // DeviceFree max cached bytes
    CubDebugExit(allocator.DeviceFree(d_max_cached_plus));

    // DeviceFree d_768B
    CubDebugExit(allocator.DeviceFree(d_768B));

    // Compute the bin size 768 bytes rounds up to (next power of bin_growth)
    unsigned int power;
    size_t rounded_bytes;
    allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);

    // Check that we have rounded_bytes free bytes cached on the initial gpu
    AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);

    // Check that we have 1 cached block across all GPUs
    AssertEquals(allocator.cached_blocks.size(), 1);

    // Check that we still have 0 live blocks across all GPUs
    AssertEquals(allocator.live_blocks.size(), 0);

#ifndef CUB_CDP
    // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not

    if (num_gpus > 1)
    {
        printf("\nRunning multi-gpu tests...\n"); fflush(stdout);

        //
        // Test9: per-device caching on a second GPU
        //

        // Allocate 768 bytes on the next gpu
        int next_gpu = (initial_gpu + 1) % num_gpus;
        char *d_768B_2;
        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));

        // DeviceFree d_768B on the next gpu
        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));

        // Re-allocate 768 bytes on the next gpu
        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));

        // Re-free d_768B on the next gpu
        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));

        // Check that we have rounded_bytes free bytes cached on the initial gpu
        AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);

        // Check that we have rounded_bytes free bytes cached on the second gpu
        AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);

        // Check that we have 2 cached blocks across all GPUs
        AssertEquals(allocator.cached_blocks.size(), 2);

        // Check that we still have 0 live blocks across all GPUs
        AssertEquals(allocator.live_blocks.size(), 0);
    }
#endif  // CUB_CDP

    //
    // Performance
    //

    printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // CPU performance comparisons vs cached: allocate and free a timing_bytes
    // block timing_iterations times
    CpuTimer cpu_timer;
    char *d_1024MB = NULL;
    allocator.debug = false;

    // Prime the caching allocator and the kernel
    CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
    CubDebugExit(allocator.DeviceFree(d_1024MB));
    cub::EmptyKernel<void><<<1, 32>>>();

    // CUDA
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        CubDebugExit(cudaFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();

    // CUB
    cpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    cpu_timer.Stop();
    float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();

    printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);

    // GPU performance comparisons: allocate and free a timing_bytes block
    // timing_iterations times, with a kernel launch between alloc and free
    GpuTimer gpu_timer;

    printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
    fflush(stdout); fflush(stderr);

    // Kernel-only baseline (subtracted from the timings below)
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        cub::EmptyKernel<void><<<1, 32>>>();
    }
    gpu_timer.Stop();
    float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();

    // CUDA
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(cudaFree(d_1024MB));
    }
    gpu_timer.Stop();
    cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    // CUB
    gpu_timer.Start();
    for (int i = 0; i < timing_iterations; ++i)
    {
        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
        cub::EmptyKernel<void><<<1, 32>>>();
        CubDebugExit(allocator.DeviceFree(d_1024MB));
    }
    gpu_timer.Stop();
    cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;

    printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
        cuda_malloc_elapsed_millis / timing_iterations,
        cub_calloc_elapsed_millis / timing_iterations);

#endif

    printf("Success\n");

    return 0;
}
|
|
|
|