From ebe27c0266124e005997af878cdab55bca21d200 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Fri, 23 Oct 2015 13:21:56 -0700 Subject: [PATCH 1/6] softmax and softmaxGrad now support multiple output --- mshadow/cuda/tensor_gpu-inl.cuh | 110 +++++++++++++++++--------------- 1 file changed, 59 insertions(+), 51 deletions(-) diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 4cf474f35944..4b47f115600b 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -190,75 +190,82 @@ inline void MapReduceKeepDim1(expr::Plan dst, } template -__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label, index_t xmax) { +__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label, + index_t xmax, unsigned n_output) { const unsigned x_size = 1 << x_bits; const int y = blockIdx.x; - const int k = static_cast(label.Eval(0, y)); - // calculate normalizer, with writeback - for (unsigned x = 0; x < xmax; x += x_size) { - const unsigned xindex = x + threadIdx.x; - if (xindex < xmax) { - if (xindex == k) { - dst.REval(y, xindex) = src.Eval(y, xindex) - 1.0f; - } else { - dst.REval(y, xindex) = src.Eval(y, xindex); + for (unsigned n = 0; n < n_output; ++n) { + const int k = static_cast(label.Eval(0, y*n_output + n)); + const unsigned base = n*xmax; + // calculate normalizer, with writeback + for (unsigned x = 0; x < xmax; x += x_size) { + const unsigned xindex = x + threadIdx.x; + if (xindex < xmax) { + if (xindex == k) { + dst.REval(y, base + xindex) = src.Eval(y, base + xindex) - 1.0f; + } else { + dst.REval(y, base + xindex) = src.Eval(y, base + xindex); + } } } } } template -__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) { +__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax, unsigned n_output) { const unsigned x_size = 1 << x_bits; const int y = blockIdx.x; __shared__ DType s_rec[x_size]; - // step 1: get max - if (threadIdx.x < xmax) { - s_rec[threadIdx.x] = src.Eval(y, threadIdx.x); - } - for (unsigned x = x_size; x < xmax; x += x_size) { - if (x + threadIdx.x < xmax) { - DType a = src.Eval(y, x + threadIdx.x); - s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]); + + for (unsigned base = 0; base < xmax*n_output; base += xmax) { + // step 1: get max + if (threadIdx.x < xmax) { + s_rec[threadIdx.x] = src.Eval(y, base + threadIdx.x); } - } - __syncthreads(); - if (threadIdx.x >= xmax) { - s_rec[threadIdx.x] = s_rec[0]; - } - __syncthreads(); - Reduce1D(s_rec); - __syncthreads(); - DType smax = s_rec[0]; - __syncthreads(); - s_rec[threadIdx.x] = 0.0f; - __syncthreads(); + for (unsigned x = x_size; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + DType a = src.Eval(y, base + x + threadIdx.x); + s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]); + } + } + __syncthreads(); + if (threadIdx.x >= xmax) { + s_rec[threadIdx.x] = s_rec[0]; + } + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + DType smax = s_rec[0]; + __syncthreads(); + s_rec[threadIdx.x] = 0.0f; + __syncthreads(); - // calculate normalizer, with writeback - for (unsigned x = 0; x < xmax; x += x_size) { - if (x + threadIdx.x < xmax) { - DType p = expf(src.Eval(y, x + threadIdx.x) - smax); - s_rec[threadIdx.x] += p; - // write back first, will fetch later - dst.REval(y, x + threadIdx.x) = p; + // calculate normalizer, with writeback + for (unsigned x = 0; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + DType p = expf(src.Eval(y, base + x + threadIdx.x) - smax); + s_rec[threadIdx.x] += p; + // write back first, will fetch later + dst.REval(y, base + x + threadIdx.x) = p; + } } - } - // calculate normalizer - __syncthreads(); - Reduce1D(s_rec); - __syncthreads(); - DType ssum = s_rec[0]; + // calculate normalizer + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + DType ssum = s_rec[0]; - for (unsigned x = 0; x < xmax; x += x_size) { - if (x + threadIdx.x < xmax) { - dst.REval(y, x + threadIdx.x) /= ssum; + for (unsigned x = 0; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + dst.REval(y, base + x + threadIdx.x) /= ssum; + } } } } template inline void Softmax(Tensor &dst, - const Tensor &src) { + const Tensor &src, unsigned n_output = 1) { dim3 dimBlock(kBaseThreadNum); dim3 dimGrid(dst.size(0)); CHECK_EQ(dst.shape_, src.shape_) << "Softmax: shape mismatch"; @@ -268,13 +275,14 @@ inline void Softmax(Tensor &dst, <<>> (expr::MakePlan(dst), expr::MakePlan(src), - dst.size(1)); + dst.size(1), n_output); } template inline void SoftmaxGrad(Tensor &dst, const Tensor &src, - const Tensor &label) { + const Tensor &label, + unsigned n_output = 1) { dim3 dimBlock(kBaseThreadNum); dim3 dimGrid(dst.size(0)); CHECK_EQ(dst.shape_, src.shape_) << "SoftmaxGrad: shape mismatch"; @@ -286,7 +294,7 @@ inline void SoftmaxGrad(Tensor &dst, (expr::MakePlan(dst), expr::MakePlan(src), expr::MakePlan(label), - dst.size(1)); + dst.size(1), n_output); } } // namespace cuda } // namespace mshadow From 9ee0817cdb4341f31184302c8e2cb61543088131 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Fri, 23 Oct 2015 14:40:46 -0700 Subject: [PATCH 2/6] gpu multi softmax --- mshadow/tensor_cpu-inl.h | 59 ++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index 93abee83f6be..024d47926dd3 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -251,32 +251,49 @@ inline void MapReduceKeepHighDim(TRValue *dst, template inline void Softmax(Tensor dst, - const Tensor &energy) { - DType mmax = energy[0]; - for (index_t x = 1; x < dst.size(0); ++x) { - if (mmax < energy[x]) mmax = energy[x]; - } - DType sum = 0.0f; - for (index_t x = 0; x < dst.size(0); ++x) { - dst[x] = std::exp(energy[x] - mmax); - sum += dst[x]; - } - for (index_t x = 0; x < dst.size(0); ++x) { - dst[x] /= sum; + const Tensor &energy, + unsigned n_output = 1) { + CHECK_EQ(dst.size(0)%n_output, 0) + << "Invalid input dimension for output number"; + unsigned xmax = dst.size(0)/n_output; + for (unsigned base = 0; base < dst.size(0); base += xmax) { + DType mmax = energy[base]; + for (index_t x = 1; x < xmax; ++x) { + if (mmax < energy[base + x]) mmax = energy[base + x]; + } + DType sum = 0.0f; + for (index_t x = 0; x < xmax; ++x) { + dst[base + x] = std::exp(energy[base + x] - mmax); + sum += dst[base + x]; + } + for (index_t x = 0; x < xmax; ++x) { + dst[base + x] /= sum; + } } } template inline void SoftmaxGrad(Tensor dst, const Tensor &src, - const Tensor &label) { + const Tensor &label, + unsigned n_output = 1) { + CHECK_EQ(dst.size(1)%n_output, 0) + << "Invalid input dimension for n_output"; + CHECK_EQ(dst.size(0), label.size(0)/n_output) + << "Label and input dimensions doesn't match"; + CHECK_EQ(label.size(0)%n_output, 0) + << "Invalid label dimension for n_output"; + const unsigned xmax = dst.size(1)/n_output; for (index_t y = 0; y < dst.size(0); ++y) { - const index_t k = static_cast(label[y]); - for (index_t x = 0; x < dst.size(1); ++x) { - if (x == k) { - dst[y][k] = src[y][k] - 1.0f; - } else { - dst[y][x] = src[y][x]; + for (unsigned n = 0; n < n_output; ++n) { + const base = n*xmax; + const index_t k = static_cast(label[y*n_output+n]); + for (index_t x = 0; x < xmax; ++x) { + if (x == k) { + dst[y][base + k] = src[y][base + k] - 1.0f; + } else { + dst[y][base + x] = src[y][base + x]; + } } } } @@ -284,10 +301,10 @@ inline void SoftmaxGrad(Tensor dst, template inline void Softmax(Tensor dst, - const Tensor &energy) { + const Tensor &energy, unsigned n_output = 1) { CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch"; for (index_t y = 0; y < dst.size(0); ++y) { - Softmax(dst[y], energy[y]); + Softmax(dst[y], energy[y], unsigned n_output = 1); } } From 324f991bf7ffa558d5194f0441131092bbb41044 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 23 Oct 2015 15:39:09 -0700 Subject: [PATCH 3/6] [Update] change get_with_shape to concrete shape --- mshadow/tensor_blob.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mshadow/tensor_blob.h b/mshadow/tensor_blob.h index 7b9e45be8c08..1d8ca52c0be6 100644 --- a/mshadow/tensor_blob.h +++ b/mshadow/tensor_blob.h @@ -509,7 +509,7 @@ class TBlob { } /*! * \brief fetch a tensor in given shape - * if size do not match the stored dimension, an error will be issued + * If size do not match the stored size, an error will be issued * \return the tensor requested * \param shape the shape required * \param stream the possible stream target tensor should reside on @@ -518,7 +518,7 @@ class TBlob { * \tparam DType the type of elements in the tensor */ template - inline Tensor get_with_shape(const TShape &shape, + inline Tensor get_with_shape(const Shape &shape, Stream *stream = NULL) const { CHECK(Device::kDevMask == dev_mask_ && DataType::kFlag == type_flag_) << "TBlob.get_with_shape: device type do not match specified type"; @@ -526,7 +526,7 @@ class TBlob { CHECK_EQ(this->shape_.Size(), shape.Size()) << "TBlob.get_with_shape: new and old shape do not match total elements"; return Tensor(static_cast(dptr_), - shape.get(), + shape, shape[dim - 1], stream); } From 1b81d7318181e1b2ad05f712d67f80ee818f2f43 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Fri, 23 Oct 2015 13:21:56 -0700 Subject: [PATCH 4/6] softmax and softmaxGrad now support multiple output --- mshadow/cuda/tensor_gpu-inl.cuh | 71 +++++++++++++++++++++++++++++++++ mshadow/tensor_blob.h | 8 ++++ mshadow/tensor_cpu-inl.h | 40 +++++++++++++++++++ mshadow/tensor_gpu-inl.h | 13 ++++++ 4 files changed, 132 insertions(+) diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 4b47f115600b..f99a74a51a18 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -296,6 +296,77 @@ inline void SoftmaxGrad(Tensor &dst, expr::MakePlan(label), dst.size(1), n_output); } + +template +__global__ void Softmax3DGradKernel(Tensor &dst, + const Tensor &src, + const Tensor &label) { + const index_t xmax = dst.size(1); + const int y = blockIdx.x; + const int n = threadIdx.x; + + if (n < dst.size(2)) { + const int k = static_cast(label[y][n]); + for (index_t i = 0; i < xmax; ++i) { + if (i == k) { + dst[y][i][n] = src[y][i][n] - 1.0f; + } else { + dst[y][i][n] = src[y][i][n]; + } + } + } +} + +template +__global__ void Softmax3DKernel(Tensor &dst, + const Tensor &src) { + const index_t xmax = dst.size(1); + const int y = blockIdx.x; + const int n = threadIdx.x; + + if (n < dst.size(2)) { + DType smax = src[y][0][n]; + for (index_t i = 1; i < xmax; ++i) { + smax = max(smax, src[y][i][n]); + } + DType ssum = 0.0f; + for (index_t i = 0; i < xmax; ++i) { + DType p = expf(src[y][i][n] - smax); + ssum += p; + dst[y][i][n] = p; + } + for (index_t i = 0; i < xmax; ++i) { + dst[y][i][n] /= ssum; + } + } +} + +template +inline void Softmax(Tensor &dst, + const Tensor &src) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid(dst.size(0), dst.size(2)); + CHECK_EQ(dst.shape_, src.shape_) << "Softmax: shape mismatch"; + CheckLaunchParam(dimGrid, dimBlock, "Softmax"); + cudaStream_t stream = Stream::GetStream(dst.stream_); + Softmax3DKernel<<>>(dst, src); +} + + +template +inline void SoftmaxGrad(Tensor &dst, + const Tensor &src, + const Tensor &label) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid(dst.size(0), dst.size(2)); + CHECK_EQ(dst.shape_, src.shape_) << "SoftmaxGrad: shape mismatch"; + CHECK_EQ(dst.size(0), label.size(0)) << "SoftmaxGrad: label shape mismatch"; + CHECK_EQ(dst.size(2), label.size(1)) << "SoftmaxGrad: label shape mismatch"; + CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad"); + cudaStream_t stream = Stream::GetStream(dst.stream_); + Softmax3DGradKernel<<>>(dst, src, label); +} + } // namespace cuda } // namespace mshadow #endif // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ diff --git a/mshadow/tensor_blob.h b/mshadow/tensor_blob.h index 1d8ca52c0be6..d08156c4cad7 100644 --- a/mshadow/tensor_blob.h +++ b/mshadow/tensor_blob.h @@ -490,6 +490,14 @@ class TBlob { inline index_t size(index_t idx) const { return shape_[idx]; } + /*! \brief total number of elements in the tensor */ + inline index_t Size(void) const { + index_t size = 1; + for (index_t i = 0; i < shape_.ndim(); ++i) { + size *= shape_[i]; + } + return size; + } /*! * \brief fetch the tensor, with respect to specific dimension * if dim do not match the stored dimension, an error will be issued diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index 024d47926dd3..cdf51e84283a 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -299,6 +299,24 @@ inline void SoftmaxGrad(Tensor dst, } } +template +inline void SoftmaxGrad(Tensor dst, + const Tensor &src, + const Tensor &label) { + for (index_t n = 0; n < dst.size(2); ++n) { + for (index_t y = 0; y < dst.size(0); ++y) { + const index_t k = static_cast(label[y][n]); + for (index_t x = 0; x < dst.size(1); ++x) { + if (x == k) { + dst[y][k][n] = src[y][k][n] - 1.0f; + } else { + dst[y][x][n] = src[y][x][n]; + } + } + } + } +} + template inline void Softmax(Tensor dst, const Tensor &energy, unsigned n_output = 1) { @@ -308,6 +326,28 @@ inline void Softmax(Tensor dst, } } +template +inline void Softmax(Tensor dst, + const Tensor &energy) { + CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch"; + for (index_t y = 0; y < dst.size(0); ++y) { + for (index_t n = 0; n < dst.size(2); ++n) { + DType mmax = energy[y][0][n]; + for (index_t x = 1; x < dst.size(1); ++x) { + if (mmax < energy[y][x][n]) mmax = energy[y][x][n]; + } + DType sum = 0.0f; + for (index_t x = 0; x < dst.size(1); ++x) { + dst[y][x][n] = std::exp(energy[y][x][n] - mmax); + sum += dst[y][x][n]; + } + for (index_t x = 0; x < dst.size(1); ++x) { + dst[y][x][n] /= sum; + } + } + } +} + template inline DType VDot(const Tensor &lhs, const Tensor &rhs) { diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index 3bb90249e903..76774ada05ca 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -159,6 +159,12 @@ inline void Softmax(Tensor dst, cuda::Softmax(dst, src); } +template +inline void Softmax(Tensor dst, + const Tensor& src) { + cuda::Softmax(dst, src); +} + template inline void SoftmaxGrad(Tensor dst, const Tensor &src, @@ -166,6 +172,13 @@ inline void SoftmaxGrad(Tensor dst, cuda::SoftmaxGrad(dst, src, label); } +template +inline void SoftmaxGrad(Tensor dst, + const Tensor &src, + const Tensor &label) { + cuda::SoftmaxGrad(dst, src, label); +} + } // namespace mshadow #endif // __CUDACC__ #endif // MSHADOW_TENSOR_GPU_INL_H_ From 24572d4546c8cc1f694ccd937c790aad27100da1 Mon Sep 17 00:00:00 2001 From: piiswrong Date: Fri, 23 Oct 2015 13:21:56 -0700 Subject: [PATCH 5/6] softmax and softmaxGrad now support multiple output --- mshadow/cuda/tensor_gpu-inl.cuh | 181 +++++++++++++++++++++----------- mshadow/tensor_blob.h | 8 ++ mshadow/tensor_cpu-inl.h | 40 +++++++ mshadow/tensor_gpu-inl.h | 13 +++ 4 files changed, 183 insertions(+), 59 deletions(-) diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh index 4b47f115600b..380876d3fa89 100644 --- a/mshadow/cuda/tensor_gpu-inl.cuh +++ b/mshadow/cuda/tensor_gpu-inl.cuh @@ -190,82 +190,75 @@ inline void MapReduceKeepDim1(expr::Plan dst, } template -__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label, - index_t xmax, unsigned n_output) { +__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label, index_t xmax) { const unsigned x_size = 1 << x_bits; const int y = blockIdx.x; + const int k = static_cast(label.Eval(0, y)); - for (unsigned n = 0; n < n_output; ++n) { - const int k = static_cast(label.Eval(0, y*n_output + n)); - const unsigned base = n*xmax; - // calculate normalizer, with writeback - for (unsigned x = 0; x < xmax; x += x_size) { - const unsigned xindex = x + threadIdx.x; - if (xindex < xmax) { - if (xindex == k) { - dst.REval(y, base + xindex) = src.Eval(y, base + xindex) - 1.0f; - } else { - dst.REval(y, base + xindex) = src.Eval(y, base + xindex); - } + // calculate normalizer, with writeback + for (unsigned x = 0; x < xmax; x += x_size) { + const unsigned xindex = x + threadIdx.x; + if (xindex < xmax) { + if (xindex == k) { + dst.REval(y, xindex) = src.Eval(y, xindex) - 1.0f; + } else { + dst.REval(y, xindex) = src.Eval(y, xindex); } } } } template -__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax, unsigned n_output) { +__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) { const unsigned x_size = 1 << x_bits; const int y = blockIdx.x; __shared__ DType s_rec[x_size]; - - for (unsigned base = 0; base < xmax*n_output; base += xmax) { - // step 1: get max - if (threadIdx.x < xmax) { - s_rec[threadIdx.x] = src.Eval(y, base + threadIdx.x); - } - for (unsigned x = x_size; x < xmax; x += x_size) { - if (x + threadIdx.x < xmax) { - DType a = src.Eval(y, base + x + threadIdx.x); - s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]); - } - } - __syncthreads(); - if (threadIdx.x >= xmax) { - s_rec[threadIdx.x] = s_rec[0]; + // step 1: get max + if (threadIdx.x < xmax) { + s_rec[threadIdx.x] = src.Eval(y, threadIdx.x); + } + for (unsigned x = x_size; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + DType a = src.Eval(y, x + threadIdx.x); + s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]); } - __syncthreads(); - Reduce1D(s_rec); - __syncthreads(); - DType smax = s_rec[0]; - __syncthreads(); - s_rec[threadIdx.x] = 0.0f; - __syncthreads(); + } + __syncthreads(); + if (threadIdx.x >= xmax) { + s_rec[threadIdx.x] = s_rec[0]; + } + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + DType smax = s_rec[0]; + __syncthreads(); + s_rec[threadIdx.x] = 0.0f; + __syncthreads(); - // calculate normalizer, with writeback - for (unsigned x = 0; x < xmax; x += x_size) { - if (x + threadIdx.x < xmax) { - DType p = expf(src.Eval(y, base + x + threadIdx.x) - smax); - s_rec[threadIdx.x] += p; - // write back first, will fetch later - dst.REval(y, base + x + threadIdx.x) = p; - } + // calculate normalizer, with writeback + for (unsigned x = 0; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + DType p = expf(src.Eval(y, x + threadIdx.x) - smax); + s_rec[threadIdx.x] += p; + // write back first, will fetch later + dst.REval(y, x + threadIdx.x) = p; } - // calculate normalizer - __syncthreads(); - Reduce1D(s_rec); - __syncthreads(); - DType ssum = s_rec[0]; + } + // calculate normalizer + __syncthreads(); + Reduce1D(s_rec); + __syncthreads(); + DType ssum = s_rec[0]; - for (unsigned x = 0; x < xmax; x += x_size) { - if (x + threadIdx.x < xmax) { - dst.REval(y, base + x + threadIdx.x) /= ssum; - } + for (unsigned x = 0; x < xmax; x += x_size) { + if (x + threadIdx.x < xmax) { + dst.REval(y, x + threadIdx.x) /= ssum; } } } template inline void Softmax(Tensor &dst, - const Tensor &src, unsigned n_output = 1) { + const Tensor &src) { dim3 dimBlock(kBaseThreadNum); dim3 dimGrid(dst.size(0)); CHECK_EQ(dst.shape_, src.shape_) << "Softmax: shape mismatch"; @@ -275,14 +268,13 @@ inline void Softmax(Tensor &dst, <<>> (expr::MakePlan(dst), expr::MakePlan(src), - dst.size(1), n_output); + dst.size(1)); } template inline void SoftmaxGrad(Tensor &dst, const Tensor &src, - const Tensor &label, - unsigned n_output = 1) { + const Tensor &label) { dim3 dimBlock(kBaseThreadNum); dim3 dimGrid(dst.size(0)); CHECK_EQ(dst.shape_, src.shape_) << "SoftmaxGrad: shape mismatch"; @@ -294,8 +286,79 @@ inline void SoftmaxGrad(Tensor &dst, (expr::MakePlan(dst), expr::MakePlan(src), expr::MakePlan(label), - dst.size(1), n_output); + dst.size(1)); +} + +template +__global__ void Softmax3DGradKernel(Tensor &dst, + const Tensor &src, + const Tensor &label) { + const index_t xmax = dst.size(1); + const int y = blockIdx.x; + const int n = threadIdx.x; + + if (n < dst.size(2)) { + const int k = static_cast(label[y][n]); + for (index_t i = 0; i < xmax; ++i) { + if (i == k) { + dst[y][i][n] = src[y][i][n] - 1.0f; + } else { + dst[y][i][n] = src[y][i][n]; + } + } + } +} + +template +__global__ void Softmax3DKernel(Tensor &dst, + const Tensor &src) { + const index_t xmax = dst.size(1); + const int y = blockIdx.x; + const int n = threadIdx.x; + + if (n < dst.size(2)) { + DType smax = src[y][0][n]; + for (index_t i = 1; i < xmax; ++i) { + smax = max(smax, src[y][i][n]); + } + DType ssum = 0.0f; + for (index_t i = 0; i < xmax; ++i) { + DType p = expf(src[y][i][n] - smax); + ssum += p; + dst[y][i][n] = p; + } + for (index_t i = 0; i < xmax; ++i) { + dst[y][i][n] /= ssum; + } + } } + +template +inline void Softmax(Tensor &dst, + const Tensor &src) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid(dst.size(0), dst.size(2)); + CHECK_EQ(dst.shape_, src.shape_) << "Softmax: shape mismatch"; + CheckLaunchParam(dimGrid, dimBlock, "Softmax"); + cudaStream_t stream = Stream::GetStream(dst.stream_); + Softmax3DKernel<<>>(dst, src); +} + + +template +inline void SoftmaxGrad(Tensor &dst, + const Tensor &src, + const Tensor &label) { + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid(dst.size(0), dst.size(2)); + CHECK_EQ(dst.shape_, src.shape_) << "SoftmaxGrad: shape mismatch"; + CHECK_EQ(dst.size(0), label.size(0)) << "SoftmaxGrad: label shape mismatch"; + CHECK_EQ(dst.size(2), label.size(1)) << "SoftmaxGrad: label shape mismatch"; + CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad"); + cudaStream_t stream = Stream::GetStream(dst.stream_); + Softmax3DGradKernel<<>>(dst, src, label); +} + } // namespace cuda } // namespace mshadow #endif // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_ diff --git a/mshadow/tensor_blob.h b/mshadow/tensor_blob.h index 1d8ca52c0be6..d08156c4cad7 100644 --- a/mshadow/tensor_blob.h +++ b/mshadow/tensor_blob.h @@ -490,6 +490,14 @@ class TBlob { inline index_t size(index_t idx) const { return shape_[idx]; } + /*! \brief total number of elements in the tensor */ + inline index_t Size(void) const { + index_t size = 1; + for (index_t i = 0; i < shape_.ndim(); ++i) { + size *= shape_[i]; + } + return size; + } /*! * \brief fetch the tensor, with respect to specific dimension * if dim do not match the stored dimension, an error will be issued diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index 024d47926dd3..cdf51e84283a 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -299,6 +299,24 @@ inline void SoftmaxGrad(Tensor dst, } } +template +inline void SoftmaxGrad(Tensor dst, + const Tensor &src, + const Tensor &label) { + for (index_t n = 0; n < dst.size(2); ++n) { + for (index_t y = 0; y < dst.size(0); ++y) { + const index_t k = static_cast(label[y][n]); + for (index_t x = 0; x < dst.size(1); ++x) { + if (x == k) { + dst[y][k][n] = src[y][k][n] - 1.0f; + } else { + dst[y][x][n] = src[y][x][n]; + } + } + } + } +} + template inline void Softmax(Tensor dst, const Tensor &energy, unsigned n_output = 1) { @@ -308,6 +326,28 @@ inline void Softmax(Tensor dst, } } +template +inline void Softmax(Tensor dst, + const Tensor &energy) { + CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch"; + for (index_t y = 0; y < dst.size(0); ++y) { + for (index_t n = 0; n < dst.size(2); ++n) { + DType mmax = energy[y][0][n]; + for (index_t x = 1; x < dst.size(1); ++x) { + if (mmax < energy[y][x][n]) mmax = energy[y][x][n]; + } + DType sum = 0.0f; + for (index_t x = 0; x < dst.size(1); ++x) { + dst[y][x][n] = std::exp(energy[y][x][n] - mmax); + sum += dst[y][x][n]; + } + for (index_t x = 0; x < dst.size(1); ++x) { + dst[y][x][n] /= sum; + } + } + } +} + template inline DType VDot(const Tensor &lhs, const Tensor &rhs) { diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h index 3bb90249e903..76774ada05ca 100644 --- a/mshadow/tensor_gpu-inl.h +++ b/mshadow/tensor_gpu-inl.h @@ -159,6 +159,12 @@ inline void Softmax(Tensor dst, cuda::Softmax(dst, src); } +template +inline void Softmax(Tensor dst, + const Tensor& src) { + cuda::Softmax(dst, src); +} + template inline void SoftmaxGrad(Tensor dst, const Tensor &src, @@ -166,6 +172,13 @@ inline void SoftmaxGrad(Tensor dst, cuda::SoftmaxGrad(dst, src, label); } +template +inline void SoftmaxGrad(Tensor dst, + const Tensor &src, + const Tensor &label) { + cuda::SoftmaxGrad(dst, src, label); +} + } // namespace mshadow #endif // __CUDACC__ #endif // MSHADOW_TENSOR_GPU_INL_H_ From d298f4fe6d22a05f13e766c4a74d9afa1442180e Mon Sep 17 00:00:00 2001 From: piiswrong Date: Fri, 23 Oct 2015 23:40:58 -0700 Subject: [PATCH 6/6] fix --- mshadow/tensor_blob.h | 6 +--- mshadow/tensor_cpu-inl.h | 59 ++++++++++++++-------------------------- 2 files changed, 22 insertions(+), 43 deletions(-) diff --git a/mshadow/tensor_blob.h b/mshadow/tensor_blob.h index d08156c4cad7..265d80059ce7 100644 --- a/mshadow/tensor_blob.h +++ b/mshadow/tensor_blob.h @@ -492,11 +492,7 @@ class TBlob { } /*! \brief total number of elements in the tensor */ inline index_t Size(void) const { - index_t size = 1; - for (index_t i = 0; i < shape_.ndim(); ++i) { - size *= shape_[i]; - } - return size; + return shape_.Size(); } /*! * \brief fetch the tensor, with respect to specific dimension diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h index cdf51e84283a..14fcd9219f4c 100644 --- a/mshadow/tensor_cpu-inl.h +++ b/mshadow/tensor_cpu-inl.h @@ -251,49 +251,32 @@ inline void MapReduceKeepHighDim(TRValue *dst, template inline void Softmax(Tensor dst, - const Tensor &energy, - unsigned n_output = 1) { - CHECK_EQ(dst.size(0)%n_output, 0) - << "Invalid input dimension for output number"; - unsigned xmax = dst.size(0)/n_output; - for (unsigned base = 0; base < dst.size(0); base += xmax) { - DType mmax = energy[base]; - for (index_t x = 1; x < xmax; ++x) { - if (mmax < energy[base + x]) mmax = energy[base + x]; - } - DType sum = 0.0f; - for (index_t x = 0; x < xmax; ++x) { - dst[base + x] = std::exp(energy[base + x] - mmax); - sum += dst[base + x]; - } - for (index_t x = 0; x < xmax; ++x) { - dst[base + x] /= sum; - } + const Tensor &energy) { + DType mmax = energy[0]; + for (index_t x = 1; x < dst.size(0); ++x) { + if (mmax < energy[x]) mmax = energy[x]; + } + DType sum = 0.0f; + for (index_t x = 0; x < dst.size(0); ++x) { + dst[x] = std::exp(energy[x] - mmax); + sum += dst[x]; + } + for (index_t x = 0; x < dst.size(0); ++x) { + dst[x] /= sum; } } template inline void SoftmaxGrad(Tensor dst, const Tensor &src, - const Tensor &label, - unsigned n_output = 1) { - CHECK_EQ(dst.size(1)%n_output, 0) - << "Invalid input dimension for n_output"; - CHECK_EQ(dst.size(0), label.size(0)/n_output) - << "Label and input dimensions doesn't match"; - CHECK_EQ(label.size(0)%n_output, 0) - << "Invalid label dimension for n_output"; - const unsigned xmax = dst.size(1)/n_output; + const Tensor &label) { for (index_t y = 0; y < dst.size(0); ++y) { - for (unsigned n = 0; n < n_output; ++n) { - const base = n*xmax; - const index_t k = static_cast(label[y*n_output+n]); - for (index_t x = 0; x < xmax; ++x) { - if (x == k) { - dst[y][base + k] = src[y][base + k] - 1.0f; - } else { - dst[y][base + x] = src[y][base + x]; - } + const index_t k = static_cast(label[y]); + for (index_t x = 0; x < dst.size(1); ++x) { + if (x == k) { + dst[y][k] = src[y][k] - 1.0f; + } else { + dst[y][x] = src[y][x]; } } } @@ -319,10 +302,10 @@ inline void SoftmaxGrad(Tensor dst, template inline void Softmax(Tensor dst, - const Tensor &energy, unsigned n_output = 1) { + const Tensor &energy) { CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch"; for (index_t y = 0; y < dst.size(0); ++y) { - Softmax(dst[y], energy[y], unsigned n_output = 1); + Softmax(dst[y], energy[y]); } }