// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

//------------------------------------------------------------------------
// Common op attribute parser.
static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p)
{
    // Mip and filter modes.
    OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode));
    OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported"));
    p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR);

    // Mip level clamp.
    if (p.enableMip)
    {
        OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit));
        OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level"));
        ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op.
    }

    // Boundary mode.
    OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode));
    OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported"));
}
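
// The integer attribute values are assumed to follow the TEX_MODE_* ordering
// used by the forward kernel table below: nearest, linear, linear-mipmap-nearest,
// linear-mipmap-linear. Only the two mipmapped modes set enableMip, which is
// why max_mip_level and tex_const are fetched conditionally above.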

//------------------------------------------------------------------------
// Forward TensorFlow op.

struct TextureFwdOp : public OpKernel
{
    TextureKernelParams m_attribs;
    PersistentTensor    m_persistentMipTensor; // Used if texture is constant and mips are enabled.
    bool                m_persistentMipTensorInitialized;

    TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
    {
        memset(&m_attribs, 0, sizeof(m_attribs));
        m_persistentMipTensorInitialized = false;
        parseOpAttributes(ctx, m_attribs);
    }

    void Compute(OpKernelContext* ctx)
    {
        TextureKernelParams& p = m_attribs;
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
        bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);

        // Get input.
        const Tensor& tex   = ctx->input(0);
        const Tensor& uv    = ctx->input(1);
        const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1);

        // Extract input dimensions.
        p.n         = (uv.dims() > 0) ? uv.dim_size(0) : 0;
        p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
        p.imgWidth  = (uv.dims() > 2) ? uv.dim_size(2) : 0;
        p.texDepth  = (tex.dims() > 0) ? tex.dim_size(0) : 0;
        if (!cube_mode)
        {
            p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
            p.texWidth  = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.channels  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
        }
        else
        {
            p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.texWidth  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
            p.channels  = (tex.dims() > 4) ? tex.dim_size(4) : 0;
        }

        // Sanity checks.
        if (!cube_mode)
        {
            OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape [>0, >0, >0, >0]"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
        }
        else
        {
            OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape [>0, 6, >0, >0, >0] in cube map mode"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
            OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
        }
        OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
        OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large"));
        if (p.enableMip)
        {
            if (!cube_mode)
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
            else
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
        }

        // Get input pointers.
        p.tex[0] = tex.flat<float>().data();
        p.uv = uv.flat<float>().data();
        p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;

        // Allocate output tensor.
        Tensor* out_tensor = NULL;
        TensorShape out_shape;
        out_shape.AddDim(p.n);
        out_shape.AddDim(p.imgHeight);
        out_shape.AddDim(p.imgWidth);
        out_shape.AddDim(p.channels);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor));
        p.out = out_tensor->flat<float>().data();

        // Choose kernel variants based on channel count.
        void* args[] = {&p};
        int channel_div_idx = 0;
        if (!(p.channels & 3))
            channel_div_idx = 2; // Channel count divisible by 4.
        else if (!(p.channels & 1))
            channel_div_idx = 1; // Channel count divisible by 2.
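
        // For non-negative channel counts, (c & 3) == 0 is equivalent to c % 4 == 0
        // and (c & 1) == 0 to c % 2 == 0. E.g. channels = 4 -> float4 kernels (idx 2),
        // channels = 6 -> float2 kernels (idx 1), channels = 3 -> scalar kernels (idx 0).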

        // Mip-related setup.
        float* pmip = 0;
        if (p.enableMip)
        {
            // Generate mip offsets.
            int mipOffsets[TEX_MAX_MIP_LEVEL];
            int mipTotal = calculateMipInfo(ctx, p, mipOffsets);
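
            // Inferred from the uses below: calculateMipInfo returns the total
            // float count of the mip stack and fills mipOffsets[] so that mip
            // level i starts at offset mipOffsets[i] within that flat buffer.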

            // Mip output tensor.
            Tensor* mip_tensor = NULL;
            TensorShape mip_shape;
            mip_shape.AddDim(mipTotal);

            // If texture is constant, calculate mip stack only once.
            bool computeMip = true;
            if (p.texConst)
            {
                // First execution?
                if (!m_persistentMipTensorInitialized)
                {
                    // Allocate a persistent mip tensor.
                    OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor));
                    m_persistentMipTensorInitialized = true;
                }
                else
                {
                    // Reuse the persistent tensor, do not recompute mip levels.
                    mip_tensor = m_persistentMipTensor.AccessTensor(ctx);
                    computeMip = false;
                }

                // Set as output tensor as well.
                ctx->set_output(1, *mip_tensor);
            }
            else
            {
                // Allocate an output tensor as usual.
                OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor));
            }

            pmip = mip_tensor->flat<float>().data(); // Pointer to data.
            for (int i=1; i <= p.mipLevelMax; i++)
                p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels.

            // Build mip levels if needed.
            if (computeMip)
            {
                for (int i=1; i <= p.mipLevelMax; i++)
                {
                    int2 ms = mipLevelSize(p, i);
                    int3 sz = make_int3(ms.x, ms.y, p.texDepth);
                    dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y);
                    dim3 gridSize  = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1));
                    p.mipLevelOut = i;
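
                    // One launch per level: the grid covers the destination level's
                    // pixels (times six faces in cube mode), and p.mipLevelOut tells
                    // the kernel which level to write, presumably by downsampling
                    // level i-1 from the same flat mip buffer.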
                    void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 };
                    OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream));
                }
            }
        }

        // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
        if (!cube_mode)
            OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2"));
        if ((p.channels & 3) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.out & 15), errors::Internal("out output tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip output tensor not aligned to float4"));
        }
        if ((p.channels & 1) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.out & 7), errors::Internal("out output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip output tensor not aligned to float2"));
        }
        if (!cube_mode)
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4"));
        else
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2"));
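
        // Rationale: the vectorized kernel variants issue float2/float4 loads and
        // stores, which CUDA requires to be 8/16-byte aligned. TensorFlow's GPU
        // allocator normally returns buffers with much coarser alignment, so these
        // checks should only fire if that assumption is ever violated.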

        // Choose launch parameters for texture lookup kernel.
        dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
        dim3 gridSize  = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);

        // Choose kernel based on filter mode, cube mode, and channel count.
        void* func_tbl[TEX_MODE_COUNT * 3 * 2] = {
            (void*)TextureFwdKernelNearest1,
            (void*)TextureFwdKernelNearest2,
            (void*)TextureFwdKernelNearest4,
            (void*)TextureFwdKernelLinear1,
            (void*)TextureFwdKernelLinear2,
            (void*)TextureFwdKernelLinear4,
            (void*)TextureFwdKernelLinearMipmapNearest1,
            (void*)TextureFwdKernelLinearMipmapNearest2,
            (void*)TextureFwdKernelLinearMipmapNearest4,
            (void*)TextureFwdKernelLinearMipmapLinear1,
            (void*)TextureFwdKernelLinearMipmapLinear2,
            (void*)TextureFwdKernelLinearMipmapLinear4,
            (void*)TextureFwdKernelCubeNearest1,
            (void*)TextureFwdKernelCubeNearest2,
            (void*)TextureFwdKernelCubeNearest4,
            (void*)TextureFwdKernelCubeLinear1,
            (void*)TextureFwdKernelCubeLinear2,
            (void*)TextureFwdKernelCubeLinear4,
            (void*)TextureFwdKernelCubeLinearMipmapNearest1,
            (void*)TextureFwdKernelCubeLinearMipmapNearest2,
            (void*)TextureFwdKernelCubeLinearMipmapNearest4,
            (void*)TextureFwdKernelCubeLinearMipmapLinear1,
            (void*)TextureFwdKernelCubeLinearMipmapLinear2,
            (void*)TextureFwdKernelCubeLinearMipmapLinear4,
        };

        // Function index.
        int func_idx = p.filterMode;
        if (cube_mode)
            func_idx += TEX_MODE_COUNT;
        func_idx = func_idx * 3 + channel_div_idx;
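
        // Index sketch: the table is laid out as [cube][filter mode][channel
        // divisibility] with an innermost stride of 3. E.g. cube map mode with
        // linear-mipmap-linear filtering and channels divisible by 4 gives
        // (TEX_MODE_COUNT + 3) * 3 + 2 == 23 (assuming TEX_MODE_COUNT == 4 and
        // TEX_MODE_LINEAR_MIPMAP_LINEAR == 3), i.e. TextureFwdKernelCubeLinearMipmapLinear4.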

        // Launch kernel.
        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));
    }
};

REGISTER_OP("TextureFwd")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Output     ("out: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int");

REGISTER_OP("TextureFwdMip")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("uv_da: float")
    .Output     ("out: float")
    .Output     ("mip: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int")
    .Attr       ("tex_const: int")
    .Attr       ("max_mip_level: int");

REGISTER_KERNEL_BUILDER(Name("TextureFwd")   .Device(DEVICE_GPU), TextureFwdOp);
REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp);

//------------------------------------------------------------------------
// Gradient TensorFlow op.

struct TextureGradOp : public OpKernel
{
    TextureKernelParams m_attribs;

    TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
    {
        memset(&m_attribs, 0, sizeof(m_attribs));
        parseOpAttributes(ctx, m_attribs);
    }

    void Compute(OpKernelContext* ctx)
    {
        TextureKernelParams& p = m_attribs;
        cudaStream_t stream = ctx->eigen_device<Eigen::GpuDevice>().stream();
        bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE);

        // Get input.
        const Tensor& tex   = ctx->input(0);
        const Tensor& uv    = ctx->input(1);
        const Tensor& dy    = ctx->input(2);
        const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2);
        const Tensor& mip   = ctx->input(p.enableMip ? 4 : 2);

        // Extract input dimensions.
        p.n         = (uv.dims() > 0) ? uv.dim_size(0) : 0;
        p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0;
        p.imgWidth  = (uv.dims() > 2) ? uv.dim_size(2) : 0;
        p.texDepth  = (tex.dims() > 0) ? tex.dim_size(0) : 0;
        if (!cube_mode)
        {
            p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0;
            p.texWidth  = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.channels  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
        }
        else
        {
            p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0;
            p.texWidth  = (tex.dims() > 3) ? tex.dim_size(3) : 0;
            p.channels  = (tex.dims() > 4) ? tex.dim_size(4) : 0;
        }

        // Sanity checks.
        if (!cube_mode)
        {
            OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape [>0, >0, >0, >0]"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]"));
        }
        else
        {
            OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape [>0, 6, >0, >0, >0] in cube map mode"));
            OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode"));
            OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode"));
        }
        OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv"));
        OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]"));
        if (p.enableMip)
        {
            if (!cube_mode)
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]"));
            else
                OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"));
        }

        // Get input pointers.
        p.tex[0] = tex.flat<float>().data();
        p.uv = uv.flat<float>().data();
        p.dy = dy.flat<float>().data();
        p.uvDA = p.enableMip ? uv_da.flat<float>().data() : 0;
        float* pmip = p.enableMip ? (float*)mip.flat<float>().data() : 0;

        // Allocate output tensor for tex gradient.
        Tensor* grad_tex_tensor = NULL;
        TensorShape grad_tex_shape;
        grad_tex_shape.AddDim(p.texDepth);
        if (cube_mode)
            grad_tex_shape.AddDim(6);
        grad_tex_shape.AddDim(p.texHeight);
        grad_tex_shape.AddDim(p.texWidth);
        grad_tex_shape.AddDim(p.channels);
        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor));
        p.gradTex[0] = grad_tex_tensor->flat<float>().data();

        // Allocate output tensor for uv gradient.
        if (p.filterMode != TEX_MODE_NEAREST)
        {
            TensorShape grad_uv_shape;
            Tensor* grad_uv_tensor = NULL;
            grad_uv_shape.AddDim(p.n);
            grad_uv_shape.AddDim(p.imgHeight);
            grad_uv_shape.AddDim(p.imgWidth);
            grad_uv_shape.AddDim(uv.dim_size(3));
            OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor));
            p.gradUV = grad_uv_tensor->flat<float>().data();

            // Allocate output tensor for uv_da gradient.
            if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR)
            {
                Tensor* grad_uv_da_tensor = NULL;
                grad_uv_shape.set_dim(3, uv_da.dim_size(3));
                OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor));
                p.gradUVDA = grad_uv_da_tensor->flat<float>().data();
            }
        }

        // Choose kernel variants based on channel count.
        int channel_div_idx = 0;
        if (!(p.channels & 3))
            channel_div_idx = 2; // Channel count divisible by 4.
        else if (!(p.channels & 1))
            channel_div_idx = 1; // Channel count divisible by 2.

        // Mip-related setup.
        Tensor grad_mip_tensor;
        float* pgradMip = 0;
        if (p.enableMip)
        {
            // Generate mip offsets.
            int mipOffsets[TEX_MAX_MIP_LEVEL];
            int mipTotal = calculateMipInfo(ctx, p, mipOffsets);

            // Get space for temporary mip gradients.
            TensorShape grad_mip_shape;
            grad_mip_shape.AddDim(mipTotal);
            OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor));
            pgradMip = grad_mip_tensor.flat<float>().data();
            for (int i=1; i <= p.mipLevelMax; i++)
            {
                p.tex[i]     = pmip + mipOffsets[i];     // Pointers to mip levels.
                p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients.
            }

            // Clear mip gradients.
            OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(pgradMip, 0, mipTotal * sizeof(float), stream));
        }

        // Initialize texture gradients to zero.
        int texBytes = p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float);
        if (cube_mode)
            texBytes *= 6;
        OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex[0], 0, texBytes, stream));
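
        // The gradient buffers must start from zero because the kernels below
        // accumulate contributions from many pixels into the same texels,
        // presumably via atomic adds, rather than writing each texel once.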

        // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned.
        if (!cube_mode)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradUV & 7), errors::Internal("grad_uv output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4"));
        }
        else
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2"));
        }
        if ((p.channels & 3) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 15), errors::Internal("grad_tex output tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)p.dy & 15), errors::Internal("dy input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip input tensor not aligned to float4"));
            OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 15), errors::Internal("internal mip gradient tensor not aligned to float4"));
        }
        if ((p.channels & 1) == 0)
        {
            OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 7), errors::Internal("grad_tex output tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip input tensor not aligned to float2"));
            OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 7), errors::Internal("internal mip gradient tensor not aligned to float2"));
        }

        // Choose launch parameters for main gradient kernel.
        void* args[] = {&p};
        dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight);
        dim3 gridSize  = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n);
        void* func_tbl[TEX_MODE_COUNT * 2] = {
            (void*)TextureGradKernelNearest,
            (void*)TextureGradKernelLinear,
            (void*)TextureGradKernelLinearMipmapNearest,
            (void*)TextureGradKernelLinearMipmapLinear,
            (void*)TextureGradKernelCubeNearest,
            (void*)TextureGradKernelCubeLinear,
            (void*)TextureGradKernelCubeLinearMipmapNearest,
            (void*)TextureGradKernelCubeLinearMipmapLinear,
        };

        // Function index.
        int func_idx = p.filterMode;
        if (cube_mode)
            func_idx += TEX_MODE_COUNT;
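
        // Unlike the forward table, this one has no per-channel variants: eight
        // entries, TEX_MODE_COUNT * 2, indexed by filter mode plus a cube offset.
        // E.g. cube map mode with nearest filtering selects entry
        // 0 + TEX_MODE_COUNT == 4, TextureGradKernelCubeNearest (assuming
        // TEX_MODE_NEAREST == 0 and TEX_MODE_COUNT == 4).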

        // Launch main gradient kernel.
        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream));

        // Launch kernel to pull gradients from mip levels.
        if (p.enableMip)
        {
            dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight);
            dim3 gridSize  = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1));
            int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float);
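
            // Dynamic shared memory sizing: one float per channel per thread in the
            // block, presumably used by MipGradKernel* as staging space while folding
            // mip-level gradients back into the base level. E.g. a 32x8 block with
            // 4 channels requests 32 * 8 * 4 * 4 = 4096 bytes.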
            void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 };
            OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream));
        }
    }
};

REGISTER_OP("TextureGradNearest")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Output     ("grad_tex: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int");

REGISTER_OP("TextureGradLinear")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Output     ("grad_tex: float")
    .Output     ("grad_uv: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int");

REGISTER_OP("TextureGradLinearMipmapNearest")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Input      ("uv_da: float")
    .Input      ("mip: float")
    .Output     ("grad_tex: float")
    .Output     ("grad_uv: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int")
    .Attr       ("max_mip_level: int");

REGISTER_OP("TextureGradLinearMipmapLinear")
    .Input      ("tex: float")
    .Input      ("uv: float")
    .Input      ("dy: float")
    .Input      ("uv_da: float")
    .Input      ("mip: float")
    .Output     ("grad_tex: float")
    .Output     ("grad_uv: float")
    .Output     ("grad_uv_da: float")
    .Attr       ("filter_mode: int")
    .Attr       ("boundary_mode: int")
    .Attr       ("max_mip_level: int");

REGISTER_KERNEL_BUILDER(Name("TextureGradNearest")             .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinear")              .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest") .Device(DEVICE_GPU), TextureGradOp);
REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear")  .Device(DEVICE_GPU), TextureGradOp);

//------------------------------------------------------------------------