Spaces:
Runtime error
Runtime error
/* | |
* Minimum CUDA compatibility definitions header | |
* | |
* Copyright (c) 2019 rcombs | |
* | |
* This file is part of FFmpeg. | |
* | |
* FFmpeg is free software; you can redistribute it and/or | |
* modify it under the terms of the GNU Lesser General Public | |
* License as published by the Free Software Foundation; either | |
* version 2.1 of the License, or (at your option) any later version. | |
* | |
* FFmpeg is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
* Lesser General Public License for more details. | |
* | |
* You should have received a copy of the GNU Lesser General Public | |
* License along with FFmpeg; if not, write to the Free Software | |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
*/ | |
// Common macros | |
// Basic typedefs | |
typedef __device_builtin__ unsigned long long cudaTextureObject_t; | |
typedef struct __device_builtin__ __align__(2) uchar2 | |
{ | |
unsigned char x, y; | |
} uchar2; | |
typedef struct __device_builtin__ __align__(4) ushort2 | |
{ | |
unsigned short x, y; | |
} ushort2; | |
typedef struct __device_builtin__ __align__(8) float2 | |
{ | |
float x, y; | |
} float2; | |
typedef struct __device_builtin__ __align__(8) int2 | |
{ | |
int x, y; | |
} int2; | |
typedef struct __device_builtin__ uint3 | |
{ | |
unsigned int x, y, z; | |
} uint3; | |
typedef struct uint3 dim3; | |
typedef struct __device_builtin__ __align__(4) uchar4 | |
{ | |
unsigned char x, y, z, w; | |
} uchar4; | |
typedef struct __device_builtin__ __align__(8) ushort4 | |
{ | |
unsigned short x, y, z, w; | |
} ushort4; | |
typedef struct __device_builtin__ __align__(16) int4 | |
{ | |
int x, y, z, w; | |
} int4; | |
typedef struct __device_builtin__ __align__(16) float4 | |
{ | |
float x, y, z, w; | |
} float4; | |
// Accessors for special registers | |
GET(getBlockIdx, ctaid) | |
GET(getBlockDim, ntid) | |
GET(getThreadIdx, tid) | |
// Instead of externs for these registers, we turn access to them into calls into trivial ASM | |
// Basic initializers (simple macros rather than inline functions) | |
// Conversions from the tex instruction's 4-register output to various types | |
TEX2D(unsigned char, a & 0xFF) | |
TEX2D(unsigned short, a & 0xFFFF) | |
TEX2D(float, a) | |
TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF)) | |
TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF)) | |
TEX2D(float2, make_float2(a, b)) | |
TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF)) | |
TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF)) | |
TEX2D(float4, make_float4(a, b, c, d)) | |
// Template calling tex instruction and converting the output to the selected type | |
template<typename T> | |
inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y) | |
{ | |
T ret; | |
unsigned ret1, ret2, ret3, ret4; | |
asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" : | |
"=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) : | |
"l"(texObject), "f"(x), "f"(y)); | |
conv(&ret, ret1, ret2, ret3, ret4); | |
return ret; | |
} | |
template<> | |
inline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y) | |
{ | |
float4 ret; | |
asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" : | |
"=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : | |
"l"(texObject), "f"(x), "f"(y)); | |
return ret; | |
} | |
template<> | |
inline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y) | |
{ | |
return tex2D<float4>(texObject, x, y).x; | |
} | |
template<> | |
inline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y) | |
{ | |
float4 ret = tex2D<float4>(texObject, x, y); | |
return make_float2(ret.x, ret.y); | |
} | |
// Math helper functions | |
static inline __device__ float floorf(float a) { return __builtin_floorf(a); } | |
static inline __device__ float floor(float a) { return __builtin_floorf(a); } | |
static inline __device__ double floor(double a) { return __builtin_floor(a); } | |
static inline __device__ float ceilf(float a) { return __builtin_ceilf(a); } | |
static inline __device__ float ceil(float a) { return __builtin_ceilf(a); } | |
static inline __device__ double ceil(double a) { return __builtin_ceil(a); } | |
static inline __device__ float truncf(float a) { return __builtin_truncf(a); } | |
static inline __device__ float trunc(float a) { return __builtin_truncf(a); } | |
static inline __device__ double trunc(double a) { return __builtin_trunc(a); } | |
static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } | |
static inline __device__ float fabs(float a) { return __builtin_fabsf(a); } | |
static inline __device__ double fabs(double a) { return __builtin_fabs(a); } | |
static inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); } | |
static inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); } | |
static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } | |
static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); } | |
static inline __device__ float __expf(float a) { return __nvvm_ex2_approx_f(a * (float)__builtin_log2(__builtin_exp(1))); } | |
static inline __device__ float __powf(float a, float b) { return __nvvm_ex2_approx_f(__nvvm_lg2_approx_f(a) * b); } | |