|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
#if defined(_WIN32) || defined(_WIN64) |
|
#include <windows.h> |
|
#undef small |
|
#else |
|
#include <sys/resource.h> |
|
#endif |
|
|
|
#include <cuda_runtime.h> |
|
|
|
#include <stdio.h> |
|
#include <float.h> |
|
|
|
#include <cmath> |
|
#include <string> |
|
#include <vector> |
|
#include <sstream> |
|
#include <iostream> |
|
#include <limits> |
|
|
|
#include "mersenne.h" |
|
#include "half.h" |
|
|
|
#include "cub/util_debug.cuh" |
|
#include "cub/util_device.cuh" |
|
#include "cub/util_type.cuh" |
|
#include "cub/util_macro.cuh" |
|
#include "cub/iterator/discard_output_iterator.cuh" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T, typename U> |
|
T SafeBitCast(const U& in) |
|
{ |
|
static_assert(sizeof(T) == sizeof(U), "Types must be same size."); |
|
T out; |
|
memcpy(&out, &in, sizeof(T)); |
|
return out; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define AssertEquals(a, b) if ((a) != (b)) { std::cerr << "\n(" << __FILE__ << ": " << __LINE__ << ")\n"; exit(1);} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
struct CommandLineArgs |
|
{ |
|
|
|
std::vector<std::string> keys; |
|
std::vector<std::string> values; |
|
std::vector<std::string> args; |
|
cudaDeviceProp deviceProp; |
|
float device_giga_bandwidth; |
|
size_t device_free_physmem; |
|
size_t device_total_physmem; |
|
|
|
|
|
|
|
|
|
CommandLineArgs(int argc, char **argv) : |
|
keys(10), |
|
values(10) |
|
{ |
|
using namespace std; |
|
|
|
|
|
unsigned int mersenne_init[4]= {0x123, 0x234, 0x345, 0x456}; |
|
mersenne::init_by_array(mersenne_init, 4); |
|
|
|
for (int i = 1; i < argc; i++) |
|
{ |
|
string arg = argv[i]; |
|
|
|
if ((arg[0] != '-') || (arg[1] != '-')) |
|
{ |
|
args.push_back(arg); |
|
continue; |
|
} |
|
|
|
string::size_type pos; |
|
string key, val; |
|
if ((pos = arg.find('=')) == string::npos) { |
|
key = string(arg, 2, arg.length() - 2); |
|
val = ""; |
|
} else { |
|
key = string(arg, 2, pos - 2); |
|
val = string(arg, pos + 1, arg.length() - 1); |
|
} |
|
|
|
keys.push_back(key); |
|
values.push_back(val); |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
bool CheckCmdLineFlag(const char* arg_name) |
|
{ |
|
using namespace std; |
|
|
|
for (int i = 0; i < int(keys.size()); ++i) |
|
{ |
|
if (keys[i] == string(arg_name)) |
|
return true; |
|
} |
|
return false; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
int NumNakedArgs() |
|
{ |
|
return args.size(); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
void GetCmdLineArgument(int index, T &val) |
|
{ |
|
using namespace std; |
|
if (index < args.size()) { |
|
istringstream str_stream(args[index]); |
|
str_stream >> val; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
template <typename T> |
|
void GetCmdLineArgument(const char *arg_name, T &val) |
|
{ |
|
using namespace std; |
|
|
|
for (int i = 0; i < int(keys.size()); ++i) |
|
{ |
|
if (keys[i] == string(arg_name)) |
|
{ |
|
istringstream str_stream(values[i]); |
|
str_stream >> val; |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
void GetCmdLineArguments(const char *arg_name, std::vector<T> &vals) |
|
{ |
|
using namespace std; |
|
|
|
if (CheckCmdLineFlag(arg_name)) |
|
{ |
|
|
|
vals.clear(); |
|
|
|
|
|
for (int i = 0; i < keys.size(); ++i) |
|
{ |
|
if (keys[i] == string(arg_name)) |
|
{ |
|
string val_string(values[i]); |
|
istringstream str_stream(val_string); |
|
string::size_type old_pos = 0; |
|
string::size_type new_pos = 0; |
|
|
|
|
|
T val; |
|
while ((new_pos = val_string.find(',', old_pos)) != string::npos) |
|
{ |
|
if (new_pos != old_pos) |
|
{ |
|
str_stream.width(new_pos - old_pos); |
|
str_stream >> val; |
|
vals.push_back(val); |
|
} |
|
|
|
|
|
str_stream.ignore(1); |
|
old_pos = new_pos + 1; |
|
} |
|
|
|
|
|
str_stream >> val; |
|
vals.push_back(val); |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
int ParsedArgc() |
|
{ |
|
return (int) keys.size(); |
|
} |
|
|
|
|
|
|
|
|
|
cudaError_t DeviceInit(int dev = -1) |
|
{ |
|
cudaError_t error = cudaSuccess; |
|
|
|
do |
|
{ |
|
int deviceCount; |
|
error = CubDebug(cudaGetDeviceCount(&deviceCount)); |
|
if (error) break; |
|
|
|
if (deviceCount == 0) { |
|
fprintf(stderr, "No devices supporting CUDA.\n"); |
|
exit(1); |
|
} |
|
if (dev < 0) |
|
{ |
|
GetCmdLineArgument("device", dev); |
|
} |
|
if ((dev > deviceCount - 1) || (dev < 0)) |
|
{ |
|
dev = 0; |
|
} |
|
|
|
error = CubDebug(cudaSetDevice(dev)); |
|
if (error) break; |
|
|
|
CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem)); |
|
|
|
int ptx_version = 0; |
|
error = CubDebug(cub::PtxVersion(ptx_version)); |
|
if (error) break; |
|
|
|
error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev)); |
|
if (error) break; |
|
|
|
if (deviceProp.major < 1) { |
|
fprintf(stderr, "Device does not support CUDA.\n"); |
|
exit(1); |
|
} |
|
|
|
device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000; |
|
|
|
if (!CheckCmdLineFlag("quiet")) |
|
{ |
|
printf( |
|
"Using device %d: %s (PTX version %d, SM%d, %d SMs, " |
|
"%lld free / %lld total MB physmem, " |
|
"%.3f GB/s @ %d kHz mem clock, ECC %s)\n", |
|
dev, |
|
deviceProp.name, |
|
ptx_version, |
|
deviceProp.major * 100 + deviceProp.minor * 10, |
|
deviceProp.multiProcessorCount, |
|
(unsigned long long) device_free_physmem / 1024 / 1024, |
|
(unsigned long long) device_total_physmem / 1024 / 1024, |
|
device_giga_bandwidth, |
|
deviceProp.memoryClockRate, |
|
(deviceProp.ECCEnabled) ? "on" : "off"); |
|
fflush(stdout); |
|
} |
|
|
|
} while (0); |
|
|
|
return error; |
|
} |
|
}; |
|
|
|
|
|
|
|
|
|
|
|
int g_num_rand_samples = 0; |
|
|
|
|
|
template <typename T> |
|
bool IsNaN(T ) { return false; } |
|
|
|
template<> |
|
__noinline__ bool IsNaN<float>(float val) |
|
{ |
|
return std::isnan(val); |
|
} |
|
|
|
template<> |
|
__noinline__ bool IsNaN<float1>(float1 val) |
|
{ |
|
return (IsNaN(val.x)); |
|
} |
|
|
|
template<> |
|
__noinline__ bool IsNaN<float2>(float2 val) |
|
{ |
|
return (IsNaN(val.y) || IsNaN(val.x)); |
|
} |
|
|
|
template<> |
|
__noinline__ bool IsNaN<float3>(float3 val) |
|
{ |
|
return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x)); |
|
} |
|
|
|
template<> |
|
__noinline__ bool IsNaN<float4>(float4 val) |
|
{ |
|
return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z)); |
|
} |
|
|
|
template<> |
|
__noinline__ bool IsNaN<double>(double val) |
|
{ |
|
return std::isnan(val); |
|
} |
|
|
|
template<> |
|
__noinline__ bool IsNaN<double1>(double1 val) |
|
{ |
|
return (IsNaN(val.x)); |
|
} |
|
|
|
template<> |
|
__noinline__ bool IsNaN<double2>(double2 val) |
|
{ |
|
return (IsNaN(val.y) || IsNaN(val.x)); |
|
} |
|
|
|
template<> |
|
__noinline__ bool IsNaN<double3>(double3 val) |
|
{ |
|
return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x)); |
|
} |
|
|
|
template<> |
|
__noinline__ bool IsNaN<double4>(double4 val) |
|
{ |
|
return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z)); |
|
} |
|
|
|
|
|
template<> |
|
__noinline__ bool IsNaN<half_t>(half_t val) |
|
{ |
|
const auto bits = SafeBitCast<unsigned short>(val); |
|
|
|
|
|
return (((bits >= 0x7C01) && (bits <= 0x7FFF)) || |
|
((bits >= 0xFC01) )); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename K> |
|
void RandomBits( |
|
K &key, |
|
int entropy_reduction = 0, |
|
int begin_bit = 0, |
|
int end_bit = sizeof(K) * 8) |
|
{ |
|
const int NUM_BYTES = sizeof(K); |
|
const int WORD_BYTES = sizeof(unsigned int); |
|
const int NUM_WORDS = (NUM_BYTES + WORD_BYTES - 1) / WORD_BYTES; |
|
|
|
unsigned int word_buff[NUM_WORDS]; |
|
|
|
if (entropy_reduction == -1) |
|
{ |
|
memset((void *) &key, 0, sizeof(key)); |
|
return; |
|
} |
|
|
|
if (end_bit < 0) |
|
end_bit = sizeof(K) * 8; |
|
|
|
while (true) |
|
{ |
|
|
|
for (int j = 0; j < NUM_WORDS; j++) |
|
{ |
|
int current_bit = j * WORD_BYTES * 8; |
|
|
|
unsigned int word = 0xffffffff; |
|
word &= 0xffffffff << CUB_MAX(0, begin_bit - current_bit); |
|
word &= 0xffffffff >> CUB_MAX(0, (current_bit + (WORD_BYTES * 8)) - end_bit); |
|
|
|
for (int i = 0; i <= entropy_reduction; i++) |
|
{ |
|
|
|
word &= mersenne::genrand_int32(); |
|
g_num_rand_samples++; |
|
} |
|
|
|
word_buff[j] = word; |
|
} |
|
|
|
memcpy(&key, word_buff, sizeof(K)); |
|
|
|
K copy = key; |
|
if (!IsNaN(copy)) |
|
break; |
|
} |
|
} |
|
|
|
|
|
template <typename T> |
|
T RandomValue(T max) |
|
{ |
|
unsigned int bits; |
|
unsigned int max_int = (unsigned int) -1; |
|
do { |
|
RandomBits(bits); |
|
} while (bits == max_int); |
|
|
|
return (T) ((double(bits) / double(max_int)) * double(max)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
T CoutCast(T val) { return val; } |
|
|
|
int CoutCast(char val) { return val; } |
|
|
|
int CoutCast(unsigned char val) { return val; } |
|
|
|
int CoutCast(signed char val) { return val; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
enum GenMode |
|
{ |
|
UNIFORM, |
|
INTEGER_SEED, |
|
RANDOM, |
|
RANDOM_BIT, |
|
}; |
|
|
|
|
|
|
|
|
|
template <typename T> |
|
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) |
|
{ |
|
switch (gen_mode) |
|
{ |
|
#if (CUB_PTX_ARCH == 0) |
|
case RANDOM: |
|
RandomBits(value); |
|
break; |
|
case RANDOM_BIT: |
|
char c; |
|
RandomBits(c, 0, 0, 1); |
|
value = (c > 0) ? (T) 1 : (T) -1; |
|
break; |
|
#endif |
|
case UNIFORM: |
|
value = 2; |
|
break; |
|
case INTEGER_SEED: |
|
default: |
|
value = (T) index; |
|
break; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, int index = 0) |
|
{ |
|
switch (gen_mode) |
|
{ |
|
#if (CUB_PTX_ARCH == 0) |
|
case RANDOM: |
|
case RANDOM_BIT: |
|
char c; |
|
RandomBits(c, 0, 0, 1); |
|
value = (c > 0); |
|
break; |
|
#endif |
|
case UNIFORM: |
|
value = true; |
|
break; |
|
case INTEGER_SEED: |
|
default: |
|
value = (index > 0); |
|
break; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ __device__ __forceinline__ void InitValue(GenMode , |
|
cub::NullType &, |
|
int = 0) |
|
{} |
|
|
|
|
|
|
|
|
|
|
|
template <typename KeyT, typename ValueT> |
|
__host__ __device__ __forceinline__ void InitValue( |
|
GenMode gen_mode, |
|
cub::KeyValuePair<KeyT, ValueT>& value, |
|
int index = 0) |
|
{ |
|
InitValue(gen_mode, value.value, index); |
|
|
|
|
|
RandomBits(value.key, 3); |
|
value.key = (value.key & 0x1); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename Key, typename Value> |
|
std::ostream& operator<<(std::ostream& os, const cub::KeyValuePair<Key, Value> &val) |
|
{ |
|
os << '(' << CoutCast(val.key) << ',' << CoutCast(val.value) << ')'; |
|
return os; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define CUB_VEC_OVERLOAD_1(T, BaseT) \ |
|
\ |
|
std::ostream& operator<<( \ |
|
std::ostream& os, \ |
|
const T& val) \ |
|
{ \ |
|
os << '(' << CoutCast(val.x) << ')'; \ |
|
return os; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator!=( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x != b.x); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator==( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x == b.x); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \ |
|
{ \ |
|
InitValue(gen_mode, value.x, index); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator>( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x > b.x); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator<( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x < b.x); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ T operator+( \ |
|
T a, \ |
|
T b) \ |
|
{ \ |
|
T retval = make_##T(a.x + b.x); \ |
|
return retval; \ |
|
} \ |
|
namespace cub { \ |
|
template<> \ |
|
struct NumericTraits<T> \ |
|
{ \ |
|
static const Category CATEGORY = NOT_A_NUMBER; \ |
|
enum { \ |
|
PRIMITIVE = false, \ |
|
NULL_TYPE = false, \ |
|
}; \ |
|
static T Max() \ |
|
{ \ |
|
T retval = { \ |
|
NumericTraits<BaseT>::Max()}; \ |
|
return retval; \ |
|
} \ |
|
static T Lowest() \ |
|
{ \ |
|
T retval = { \ |
|
NumericTraits<BaseT>::Lowest()}; \ |
|
return retval; \ |
|
} \ |
|
}; \ |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
#define CUB_VEC_OVERLOAD_2(T, BaseT) \ |
|
\ |
|
std::ostream& operator<<( \ |
|
std::ostream& os, \ |
|
const T& val) \ |
|
{ \ |
|
os << '(' \ |
|
<< CoutCast(val.x) << ',' \ |
|
<< CoutCast(val.y) << ')'; \ |
|
return os; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator!=( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x != b.x) || \ |
|
(a.y != b.y); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator==( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x == b.x) && \ |
|
(a.y == b.y); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \ |
|
{ \ |
|
InitValue(gen_mode, value.x, index); \ |
|
InitValue(gen_mode, value.y, index); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator>( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
if (a.x > b.x) return true; else if (b.x > a.x) return false; \ |
|
return a.y > b.y; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator<( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
if (a.x < b.x) return true; else if (b.x < a.x) return false; \ |
|
return a.y < b.y; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ T operator+( \ |
|
T a, \ |
|
T b) \ |
|
{ \ |
|
T retval = make_##T( \ |
|
a.x + b.x, \ |
|
a.y + b.y); \ |
|
return retval; \ |
|
} \ |
|
namespace cub { \ |
|
template<> \ |
|
struct NumericTraits<T> \ |
|
{ \ |
|
static const Category CATEGORY = NOT_A_NUMBER; \ |
|
enum { \ |
|
PRIMITIVE = false, \ |
|
NULL_TYPE = false, \ |
|
}; \ |
|
static T Max() \ |
|
{ \ |
|
T retval = { \ |
|
NumericTraits<BaseT>::Max(), \ |
|
NumericTraits<BaseT>::Max()}; \ |
|
return retval; \ |
|
} \ |
|
static T Lowest() \ |
|
{ \ |
|
T retval = { \ |
|
NumericTraits<BaseT>::Lowest(), \ |
|
NumericTraits<BaseT>::Lowest()}; \ |
|
return retval; \ |
|
} \ |
|
}; \ |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
#define CUB_VEC_OVERLOAD_3(T, BaseT) \ |
|
\ |
|
std::ostream& operator<<( \ |
|
std::ostream& os, \ |
|
const T& val) \ |
|
{ \ |
|
os << '(' \ |
|
<< CoutCast(val.x) << ',' \ |
|
<< CoutCast(val.y) << ',' \ |
|
<< CoutCast(val.z) << ')'; \ |
|
return os; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator!=( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x != b.x) || \ |
|
(a.y != b.y) || \ |
|
(a.z != b.z); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator==( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x == b.x) && \ |
|
(a.y == b.y) && \ |
|
(a.z == b.z); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \ |
|
{ \ |
|
InitValue(gen_mode, value.x, index); \ |
|
InitValue(gen_mode, value.y, index); \ |
|
InitValue(gen_mode, value.z, index); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator>( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
if (a.x > b.x) return true; else if (b.x > a.x) return false; \ |
|
if (a.y > b.y) return true; else if (b.y > a.y) return false; \ |
|
return a.z > b.z; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator<( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
if (a.x < b.x) return true; else if (b.x < a.x) return false; \ |
|
if (a.y < b.y) return true; else if (b.y < a.y) return false; \ |
|
return a.z < b.z; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ T operator+( \ |
|
T a, \ |
|
T b) \ |
|
{ \ |
|
T retval = make_##T( \ |
|
a.x + b.x, \ |
|
a.y + b.y, \ |
|
a.z + b.z); \ |
|
return retval; \ |
|
} \ |
|
namespace cub { \ |
|
template<> \ |
|
struct NumericTraits<T> \ |
|
{ \ |
|
static const Category CATEGORY = NOT_A_NUMBER; \ |
|
enum { \ |
|
PRIMITIVE = false, \ |
|
NULL_TYPE = false, \ |
|
}; \ |
|
static T Max() \ |
|
{ \ |
|
T retval = { \ |
|
NumericTraits<BaseT>::Max(), \ |
|
NumericTraits<BaseT>::Max(), \ |
|
NumericTraits<BaseT>::Max()}; \ |
|
return retval; \ |
|
} \ |
|
static T Lowest() \ |
|
{ \ |
|
T retval = { \ |
|
NumericTraits<BaseT>::Lowest(), \ |
|
NumericTraits<BaseT>::Lowest(), \ |
|
NumericTraits<BaseT>::Lowest()}; \ |
|
return retval; \ |
|
} \ |
|
}; \ |
|
} |
|
|
|
|
|
|
|
|
|
|
|
#define CUB_VEC_OVERLOAD_4(T, BaseT) \ |
|
\ |
|
std::ostream& operator<<( \ |
|
std::ostream& os, \ |
|
const T& val) \ |
|
{ \ |
|
os << '(' \ |
|
<< CoutCast(val.x) << ',' \ |
|
<< CoutCast(val.y) << ',' \ |
|
<< CoutCast(val.z) << ',' \ |
|
<< CoutCast(val.w) << ')'; \ |
|
return os; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator!=( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x != b.x) || \ |
|
(a.y != b.y) || \ |
|
(a.z != b.z) || \ |
|
(a.w != b.w); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator==( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
return (a.x == b.x) && \ |
|
(a.y == b.y) && \ |
|
(a.z == b.z) && \ |
|
(a.w == b.w); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0) \ |
|
{ \ |
|
InitValue(gen_mode, value.x, index); \ |
|
InitValue(gen_mode, value.y, index); \ |
|
InitValue(gen_mode, value.z, index); \ |
|
InitValue(gen_mode, value.w, index); \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator>( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
if (a.x > b.x) return true; else if (b.x > a.x) return false; \ |
|
if (a.y > b.y) return true; else if (b.y > a.y) return false; \ |
|
if (a.z > b.z) return true; else if (b.z > a.z) return false; \ |
|
return a.w > b.w; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ bool operator<( \ |
|
const T &a, \ |
|
const T &b) \ |
|
{ \ |
|
if (a.x < b.x) return true; else if (b.x < a.x) return false; \ |
|
if (a.y < b.y) return true; else if (b.y < a.y) return false; \ |
|
if (a.z < b.z) return true; else if (b.z < a.z) return false; \ |
|
return a.w < b.w; \ |
|
} \ |
|
\ |
|
__host__ __device__ __forceinline__ T operator+( \ |
|
T a, \ |
|
T b) \ |
|
{ \ |
|
T retval = make_##T( \ |
|
a.x + b.x, \ |
|
a.y + b.y, \ |
|
a.z + b.z, \ |
|
a.w + b.w); \ |
|
return retval; \ |
|
} \ |
|
namespace cub { \ |
|
template<> \ |
|
struct NumericTraits<T> \ |
|
{ \ |
|
static const Category CATEGORY = NOT_A_NUMBER; \ |
|
enum { \ |
|
PRIMITIVE = false, \ |
|
NULL_TYPE = false, \ |
|
}; \ |
|
static T Max() \ |
|
{ \ |
|
T retval = { \ |
|
NumericTraits<BaseT>::Max(), \ |
|
NumericTraits<BaseT>::Max(), \ |
|
NumericTraits<BaseT>::Max(), \ |
|
NumericTraits<BaseT>::Max()}; \ |
|
return retval; \ |
|
} \ |
|
static T Lowest() \ |
|
{ \ |
|
T retval = { \ |
|
NumericTraits<BaseT>::Lowest(), \ |
|
NumericTraits<BaseT>::Lowest(), \ |
|
NumericTraits<BaseT>::Lowest(), \ |
|
NumericTraits<BaseT>::Lowest()}; \ |
|
return retval; \ |
|
} \ |
|
}; \ |
|
} |
|
|
|
|
|
|
|
|
|
#define CUB_VEC_OVERLOAD(COMPONENT_T, BaseT) \ |
|
CUB_VEC_OVERLOAD_1(COMPONENT_T##1, BaseT) \ |
|
CUB_VEC_OVERLOAD_2(COMPONENT_T##2, BaseT) \ |
|
CUB_VEC_OVERLOAD_3(COMPONENT_T##3, BaseT) \ |
|
CUB_VEC_OVERLOAD_4(COMPONENT_T##4, BaseT) |
|
|
|
|
|
|
|
|
|
CUB_VEC_OVERLOAD(char, char) |
|
CUB_VEC_OVERLOAD(short, short) |
|
CUB_VEC_OVERLOAD(int, int) |
|
CUB_VEC_OVERLOAD(long, long) |
|
CUB_VEC_OVERLOAD(longlong, long long) |
|
CUB_VEC_OVERLOAD(uchar, unsigned char) |
|
CUB_VEC_OVERLOAD(ushort, unsigned short) |
|
CUB_VEC_OVERLOAD(uint, unsigned int) |
|
CUB_VEC_OVERLOAD(ulong, unsigned long) |
|
CUB_VEC_OVERLOAD(ulonglong, unsigned long long) |
|
CUB_VEC_OVERLOAD(float, float) |
|
CUB_VEC_OVERLOAD(double, double) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
struct TestFoo |
|
{ |
|
long long x; |
|
int y; |
|
short z; |
|
char w; |
|
|
|
|
|
static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w) |
|
{ |
|
TestFoo retval = {x, y, z, w}; |
|
return retval; |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ TestFoo& operator =(int b) |
|
{ |
|
x = b; |
|
y = b; |
|
z = b; |
|
w = b; |
|
return *this; |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const |
|
{ |
|
return MakeTestFoo(x + b.x, y + b.y, z + b.z, w + b.w); |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const |
|
{ |
|
return (x != b.x) || (y != b.y) || (z != b.z) || (w != b.w); |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const |
|
{ |
|
return (x == b.x) && (y == b.y) && (z == b.z) && (w == b.w); |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const |
|
{ |
|
if (x < b.x) return true; else if (b.x < x) return false; |
|
if (y < b.y) return true; else if (b.y < y) return false; |
|
if (z < b.z) return true; else if (b.z < z) return false; |
|
return w < b.w; |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const |
|
{ |
|
if (x > b.x) return true; else if (b.x > x) return false; |
|
if (y > b.y) return true; else if (b.y > y) return false; |
|
if (z > b.z) return true; else if (b.z > z) return false; |
|
return w > b.w; |
|
} |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
std::ostream& operator<<(std::ostream& os, const TestFoo& val) |
|
{ |
|
os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')'; |
|
return os; |
|
} |
|
|
|
|
|
|
|
|
|
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, int index = 0) |
|
{ |
|
InitValue(gen_mode, value.x, index); |
|
InitValue(gen_mode, value.y, index); |
|
InitValue(gen_mode, value.z, index); |
|
InitValue(gen_mode, value.w, index); |
|
} |
|
|
|
|
|
|
|
namespace cub { |
|
template<> |
|
struct NumericTraits<TestFoo> |
|
{ |
|
static const Category CATEGORY = NOT_A_NUMBER; |
|
enum { |
|
PRIMITIVE = false, |
|
NULL_TYPE = false, |
|
}; |
|
static TestFoo Max() |
|
{ |
|
return TestFoo::MakeTestFoo( |
|
NumericTraits<long long>::Max(), |
|
NumericTraits<int>::Max(), |
|
NumericTraits<short>::Max(), |
|
NumericTraits<char>::Max()); |
|
} |
|
|
|
static TestFoo Lowest() |
|
{ |
|
return TestFoo::MakeTestFoo( |
|
NumericTraits<long long>::Lowest(), |
|
NumericTraits<int>::Lowest(), |
|
NumericTraits<short>::Lowest(), |
|
NumericTraits<char>::Lowest()); |
|
} |
|
}; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
struct TestBar |
|
{ |
|
long long x; |
|
int y; |
|
|
|
|
|
__host__ __device__ __forceinline__ TestBar() : x(0), y(0) |
|
{} |
|
|
|
|
|
__host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b) |
|
{} |
|
|
|
|
|
__host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y) |
|
{} |
|
|
|
|
|
__host__ __device__ __forceinline__ TestBar& operator =(int b) |
|
{ |
|
x = b; |
|
y = b; |
|
return *this; |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const |
|
{ |
|
return TestBar(x + b.x, y + b.y); |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const |
|
{ |
|
return (x != b.x) || (y != b.y); |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const |
|
{ |
|
return (x == b.x) && (y == b.y); |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ bool operator <(const TestBar &b) const |
|
{ |
|
if (x < b.x) return true; else if (b.x < x) return false; |
|
return y < b.y; |
|
} |
|
|
|
|
|
__host__ __device__ __forceinline__ bool operator >(const TestBar &b) const |
|
{ |
|
if (x > b.x) return true; else if (b.x > x) return false; |
|
return y > b.y; |
|
} |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
std::ostream& operator<<(std::ostream& os, const TestBar& val) |
|
{ |
|
os << '(' << val.x << ',' << val.y << ')'; |
|
return os; |
|
} |
|
|
|
|
|
|
|
|
|
__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, int index = 0) |
|
{ |
|
InitValue(gen_mode, value.x, index); |
|
InitValue(gen_mode, value.y, index); |
|
} |
|
|
|
|
|
namespace cub { |
|
template<> |
|
struct NumericTraits<TestBar> |
|
{ |
|
static const Category CATEGORY = NOT_A_NUMBER; |
|
enum { |
|
PRIMITIVE = false, |
|
NULL_TYPE = false, |
|
}; |
|
static TestBar Max() |
|
{ |
|
return TestBar( |
|
NumericTraits<long long>::Max(), |
|
NumericTraits<int>::Max()); |
|
} |
|
|
|
static TestBar Lowest() |
|
{ |
|
return TestBar( |
|
NumericTraits<long long>::Lowest(), |
|
NumericTraits<int>::Lowest()); |
|
} |
|
}; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename S, typename T, typename OffsetT> |
|
int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true) |
|
{ |
|
for (OffsetT i = 0; i < len; i++) |
|
{ |
|
if (computed[i] != reference[i]) |
|
{ |
|
if (verbose) std::cout << "INCORRECT: [" << i << "]: " |
|
<< CoutCast(computed[i]) << " != " |
|
<< CoutCast(reference[i]); |
|
return 1; |
|
} |
|
} |
|
return 0; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
template <typename OffsetT> |
|
int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true) |
|
{ |
|
for (OffsetT i = 0; i < len; i++) |
|
{ |
|
if (computed[i] != reference[i]) |
|
{ |
|
float difference = std::abs(computed[i]-reference[i]); |
|
float fraction = difference / std::abs(reference[i]); |
|
|
|
if (fraction > 0.0001) |
|
{ |
|
if (verbose) std::cout << "INCORRECT: [" << i << "]: " |
|
<< "(computed) " << CoutCast(computed[i]) << " != " |
|
<< CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")"; |
|
return 1; |
|
} |
|
} |
|
} |
|
return 0; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
template <typename OffsetT> |
|
int CompareResults(cub::NullType* computed, cub::NullType* reference, OffsetT len, bool verbose = true) |
|
{ |
|
return 0; |
|
} |
|
|
|
|
|
|
|
|
|
template <typename OffsetT> |
|
int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true) |
|
{ |
|
for (OffsetT i = 0; i < len; i++) |
|
{ |
|
if (computed[i] != reference[i]) |
|
{ |
|
double difference = std::abs(computed[i]-reference[i]); |
|
double fraction = difference / std::abs(reference[i]); |
|
|
|
if (fraction > 0.0001) |
|
{ |
|
if (verbose) std::cout << "INCORRECT: [" << i << "]: " |
|
<< CoutCast(computed[i]) << " != " |
|
<< CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")"; |
|
return 1; |
|
} |
|
} |
|
} |
|
return 0; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
int CompareDeviceResults( |
|
cub::NullType *, |
|
cub::NullType *, |
|
size_t , |
|
bool = true, |
|
bool = false) |
|
{ |
|
return 0; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
template <typename S, typename OffsetT> |
|
int CompareDeviceResults( |
|
S *h_reference, |
|
cub::DiscardOutputIterator<OffsetT> d_data, |
|
size_t num_items, |
|
bool verbose = true, |
|
bool display_data = false) |
|
{ |
|
return 0; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
template <typename S, typename T> |
|
int CompareDeviceResults( |
|
S *h_reference, |
|
T *d_data, |
|
size_t num_items, |
|
bool verbose = true, |
|
bool display_data = false) |
|
{ |
|
|
|
T *h_data = (T*) malloc(num_items * sizeof(T)); |
|
|
|
|
|
cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); |
|
|
|
|
|
if (display_data) |
|
{ |
|
printf("Reference:\n"); |
|
for (int i = 0; i < int(num_items); i++) |
|
{ |
|
std::cout << CoutCast(h_reference[i]) << ", "; |
|
} |
|
printf("\n\nComputed:\n"); |
|
for (int i = 0; i < int(num_items); i++) |
|
{ |
|
std::cout << CoutCast(h_data[i]) << ", "; |
|
} |
|
printf("\n\n"); |
|
} |
|
|
|
|
|
int retval = CompareResults(h_data, h_reference, num_items, verbose); |
|
|
|
|
|
if (h_data) free(h_data); |
|
|
|
return retval; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
int CompareDeviceDeviceResults( |
|
T *d_reference, |
|
T *d_data, |
|
size_t num_items, |
|
bool verbose = true, |
|
bool display_data = false) |
|
{ |
|
|
|
T *h_reference = (T*) malloc(num_items * sizeof(T)); |
|
T *h_data = (T*) malloc(num_items * sizeof(T)); |
|
|
|
|
|
cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost); |
|
cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); |
|
|
|
|
|
if (display_data) { |
|
printf("Reference:\n"); |
|
for (int i = 0; i < num_items; i++) |
|
{ |
|
std::cout << CoutCast(h_reference[i]) << ", "; |
|
} |
|
printf("\n\nComputed:\n"); |
|
for (int i = 0; i < num_items; i++) |
|
{ |
|
std::cout << CoutCast(h_data[i]) << ", "; |
|
} |
|
printf("\n\n"); |
|
} |
|
|
|
|
|
int retval = CompareResults(h_data, h_reference, num_items, verbose); |
|
|
|
|
|
if (h_reference) free(h_reference); |
|
if (h_data) free(h_data); |
|
|
|
return retval; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
void DisplayResults( |
|
cub::NullType *, |
|
size_t ) |
|
{} |
|
|
|
|
|
|
|
|
|
|
|
template <typename InputIteratorT> |
|
void DisplayResults( |
|
InputIteratorT h_data, |
|
size_t num_items) |
|
{ |
|
|
|
for (int i = 0; i < int(num_items); i++) |
|
{ |
|
std::cout << CoutCast(h_data[i]) << ", "; |
|
} |
|
printf("\n"); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
template <typename T> |
|
void DisplayDeviceResults( |
|
T *d_data, |
|
size_t num_items) |
|
{ |
|
|
|
T *h_data = (T*) malloc(num_items * sizeof(T)); |
|
|
|
|
|
cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost); |
|
|
|
DisplayResults(h_data, num_items); |
|
|
|
|
|
if (h_data) free(h_data); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void InitializeSegments( |
|
int num_items, |
|
int num_segments, |
|
int *h_segment_offsets, |
|
bool verbose = false) |
|
{ |
|
if (num_segments <= 0) |
|
return; |
|
|
|
unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments; |
|
int offset = 0; |
|
for (int i = 0; i < num_segments; ++i) |
|
{ |
|
h_segment_offsets[i] = offset; |
|
|
|
unsigned int segment_length = RandomValue((expected_segment_length * 2) + 1); |
|
offset += segment_length; |
|
offset = CUB_MIN(offset, num_items); |
|
} |
|
h_segment_offsets[num_segments] = num_items; |
|
|
|
if (verbose) |
|
{ |
|
printf("Segment offsets: "); |
|
DisplayResults(h_segment_offsets, num_segments + 1); |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
struct CpuTimer |
|
{ |
|
#if defined(_WIN32) || defined(_WIN64) |
|
|
|
LARGE_INTEGER ll_freq; |
|
LARGE_INTEGER ll_start; |
|
LARGE_INTEGER ll_stop; |
|
|
|
CpuTimer() |
|
{ |
|
QueryPerformanceFrequency(&ll_freq); |
|
} |
|
|
|
void Start() |
|
{ |
|
QueryPerformanceCounter(&ll_start); |
|
} |
|
|
|
void Stop() |
|
{ |
|
QueryPerformanceCounter(&ll_stop); |
|
} |
|
|
|
float ElapsedMillis() |
|
{ |
|
double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart); |
|
double stop = double(ll_stop.QuadPart) / double(ll_freq.QuadPart); |
|
|
|
return float((stop - start) * 1000); |
|
} |
|
|
|
#else |
|
|
|
rusage start; |
|
rusage stop; |
|
|
|
void Start() |
|
{ |
|
getrusage(RUSAGE_SELF, &start); |
|
} |
|
|
|
void Stop() |
|
{ |
|
getrusage(RUSAGE_SELF, &stop); |
|
} |
|
|
|
float ElapsedMillis() |
|
{ |
|
float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec; |
|
float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec; |
|
|
|
return (sec * 1000) + (usec / 1000); |
|
} |
|
|
|
#endif |
|
}; |
|
|
|
struct GpuTimer |
|
{ |
|
cudaEvent_t start; |
|
cudaEvent_t stop; |
|
|
|
GpuTimer() |
|
{ |
|
cudaEventCreate(&start); |
|
cudaEventCreate(&stop); |
|
} |
|
|
|
~GpuTimer() |
|
{ |
|
cudaEventDestroy(start); |
|
cudaEventDestroy(stop); |
|
} |
|
|
|
void Start() |
|
{ |
|
cudaEventRecord(start, 0); |
|
} |
|
|
|
void Stop() |
|
{ |
|
cudaEventRecord(stop, 0); |
|
} |
|
|
|
float ElapsedMillis() |
|
{ |
|
float elapsed; |
|
cudaEventSynchronize(stop); |
|
cudaEventElapsedTime(&elapsed, start, stop); |
|
return elapsed; |
|
} |
|
}; |
|
|