From ebe27c0266124e005997af878cdab55bca21d200 Mon Sep 17 00:00:00 2001
From: piiswrong <eric.jy.xie@gmail.com>
Date: Fri, 23 Oct 2015 13:21:56 -0700
Subject: [PATCH 1/6] softmax and softmaxGrad now support multiple output

---
 mshadow/cuda/tensor_gpu-inl.cuh | 110 +++++++++++++++++---------------
 1 file changed, 59 insertions(+), 51 deletions(-)
diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh
index 4cf474f35944..4b47f115600b 100644
--- a/mshadow/cuda/tensor_gpu-inl.cuh
+++ b/mshadow/cuda/tensor_gpu-inl.cuh
@@ -190,75 +190,82 @@ inline void MapReduceKeepDim1(expr::Plan<DstExp, DType> dst,
 }
 
 template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
-__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label, index_t xmax) {
+__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label,
+                                  index_t xmax, unsigned n_output) {
   const unsigned x_size = 1 << x_bits;
   const int y = blockIdx.x;
-  const int k = static_cast<int>(label.Eval(0, y));
 
-  // calculate normalizer, with writeback
-  for (unsigned x = 0; x < xmax; x += x_size) {
-    const unsigned xindex = x + threadIdx.x;
-    if (xindex < xmax) {
-      if (xindex == k) {
-        dst.REval(y, xindex) = src.Eval(y, xindex) - 1.0f;
-      } else {
-        dst.REval(y, xindex) = src.Eval(y, xindex);
+  for (unsigned n = 0; n < n_output; ++n) {
+    const int k = static_cast<int>(label.Eval(0, y*n_output + n));
+    const unsigned base = n*xmax;
+    // calculate normalizer, with writeback
+    for (unsigned x = 0; x < xmax; x += x_size) {
+      const unsigned xindex = x + threadIdx.x;
+      if (xindex < xmax) {
+        if (xindex == k) {
+          dst.REval(y, base + xindex) = src.Eval(y, base + xindex) - 1.0f;
+        } else {
+          dst.REval(y, base + xindex) = src.Eval(y, base + xindex);
+        }
       }
     }
   }
 }
 
 template<int x_bits, typename DType,  typename DstPlan, typename SrcPlan>
-__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) {
+__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax, unsigned n_output) {
   const unsigned x_size = 1 << x_bits;
   const int y = blockIdx.x;
   __shared__ DType s_rec[x_size];
-  // step 1: get max
-  if (threadIdx.x < xmax) {
-    s_rec[threadIdx.x] = src.Eval(y, threadIdx.x);
-  }
-  for (unsigned x = x_size; x < xmax; x += x_size) {
-    if (x + threadIdx.x < xmax) {
-      DType a = src.Eval(y, x + threadIdx.x);
-      s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]);
+
+  for (unsigned base = 0; base < xmax*n_output; base += xmax) {
+    // step 1: get max
+    if (threadIdx.x < xmax) {
+      s_rec[threadIdx.x] = src.Eval(y, base + threadIdx.x);
     }
-  }
-  __syncthreads();
-  if (threadIdx.x >= xmax) {
-    s_rec[threadIdx.x] = s_rec[0];
-  }
-  __syncthreads();
-  Reduce1D<red::maximum, x_bits>(s_rec);
-  __syncthreads();
-  DType smax = s_rec[0];
-  __syncthreads();
-  s_rec[threadIdx.x] = 0.0f;
-  __syncthreads();
+    for (unsigned x = x_size; x < xmax; x += x_size) {
+      if (x + threadIdx.x < xmax) {
+        DType a = src.Eval(y, base + x + threadIdx.x);
+        s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]);
+      }
+    }
+    __syncthreads();
+    if (threadIdx.x >= xmax) {
+      s_rec[threadIdx.x] = s_rec[0];
+    }
+    __syncthreads();
+    Reduce1D<red::maximum, x_bits>(s_rec);
+    __syncthreads();
+    DType smax = s_rec[0];
+    __syncthreads();
+    s_rec[threadIdx.x] = 0.0f;
+    __syncthreads();
 
-  // calculate normalizer, with writeback
-  for (unsigned x = 0; x < xmax; x += x_size) {
-    if (x + threadIdx.x < xmax) {
-      DType p = expf(src.Eval(y, x + threadIdx.x) - smax);
-      s_rec[threadIdx.x] += p;
-      // write back first, will fetch later
-      dst.REval(y, x + threadIdx.x) = p;
+    // calculate normalizer, with writeback
+    for (unsigned x = 0; x < xmax; x += x_size) {
+      if (x + threadIdx.x < xmax) {
+        DType p = expf(src.Eval(y, base + x + threadIdx.x) - smax);
+        s_rec[threadIdx.x] += p;
+        // write back first, will fetch later
+        dst.REval(y, base + x + threadIdx.x) = p;
+      }
     }
-  }
-  // calculate normalizer
-  __syncthreads();
-  Reduce1D<red::sum, x_bits>(s_rec);
-  __syncthreads();
-  DType ssum = s_rec[0];
+    // calculate normalizer
+    __syncthreads();
+    Reduce1D<red::sum, x_bits>(s_rec);
+    __syncthreads();
+    DType ssum = s_rec[0];
 
-  for (unsigned x = 0; x < xmax; x += x_size) {
-    if (x + threadIdx.x < xmax) {
-      dst.REval(y, x + threadIdx.x) /= ssum;
+    for (unsigned x = 0; x < xmax; x += x_size) {
+      if (x + threadIdx.x < xmax) {
+        dst.REval(y, base + x + threadIdx.x) /= ssum;
+      }
     }
   }
 }
 template<typename DType>
 inline void Softmax(Tensor<gpu, 2, DType> &dst,
-                    const Tensor<gpu, 2, DType> &src) {
+                    const Tensor<gpu, 2, DType> &src, unsigned n_output = 1) {
   dim3 dimBlock(kBaseThreadNum);
   dim3 dimGrid(dst.size(0));
   CHECK_EQ(dst.shape_, src.shape_) << "Softmax: shape mismatch";
@@ -268,13 +275,14 @@ inline void Softmax(Tensor<gpu, 2, DType> &dst,
       <<<dimGrid, dimBlock, 0, stream>>>
       (expr::MakePlan(dst),
        expr::MakePlan(src),
-       dst.size(1));
+       dst.size(1), n_output);
 }
 
 template<typename DType>
 inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
                         const Tensor<gpu, 2, DType> &src,
-                        const Tensor<gpu, 1, DType> &label) {
+                        const Tensor<gpu, 1, DType> &label,
+                        unsigned n_output = 1) {
   dim3 dimBlock(kBaseThreadNum);
   dim3 dimGrid(dst.size(0));
   CHECK_EQ(dst.shape_, src.shape_) << "SoftmaxGrad: shape mismatch";
@@ -286,7 +294,7 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
       (expr::MakePlan(dst),
        expr::MakePlan(src),
        expr::MakePlan(label),
-       dst.size(1));
+       dst.size(1), n_output);
 }
 }  // namespace cuda
 }  // namespace mshadow

From 9ee0817cdb4341f31184302c8e2cb61543088131 Mon Sep 17 00:00:00 2001
From: piiswrong <eric.jy.xie@gmail.com>
Date: Fri, 23 Oct 2015 14:40:46 -0700
Subject: [PATCH 2/6] gpu multi softmax

---
 mshadow/tensor_cpu-inl.h | 59 ++++++++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h
index 93abee83f6be..024d47926dd3 100644
--- a/mshadow/tensor_cpu-inl.h
+++ b/mshadow/tensor_cpu-inl.h
@@ -251,32 +251,49 @@ inline void MapReduceKeepHighDim(TRValue<R, cpu, 1, DType> *dst,
 
 template<typename DType>
 inline void Softmax(Tensor<cpu, 1, DType> dst,
-                    const Tensor<cpu, 1, DType> &energy) {
-  DType mmax = energy[0];
-  for (index_t x = 1; x < dst.size(0); ++x) {
-    if (mmax < energy[x]) mmax = energy[x];
-  }
-  DType sum = 0.0f;
-  for (index_t x = 0; x < dst.size(0); ++x) {
-    dst[x] = std::exp(energy[x] - mmax);
-    sum += dst[x];
-  }
-  for (index_t x = 0; x < dst.size(0); ++x) {
-    dst[x] /= sum;
+                    const Tensor<cpu, 1, DType> &energy,
+                    unsigned n_output = 1) {
+  CHECK_EQ(dst.size(0)%n_output, 0)
+    << "Invalid input dimension for output number";
+  unsigned xmax = dst.size(0)/n_output;
+  for (unsigned base = 0; base < dst.size(0); base += xmax) {
+    DType mmax = energy[base];
+    for (index_t x = 1; x < xmax; ++x) {
+      if (mmax < energy[base + x]) mmax = energy[base + x];
+    }
+    DType sum = 0.0f;
+    for (index_t x = 0; x < xmax; ++x) {
+      dst[base + x] = std::exp(energy[base + x] - mmax);
+      sum += dst[base + x];
+    }
+    for (index_t x = 0; x < xmax; ++x) {
+      dst[base + x] /= sum;
+    }
   }
 }
 
 template<typename DType>
 inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
                         const Tensor<cpu, 2, DType> &src,
-                        const Tensor<cpu, 1, DType> &label) {
+                        const Tensor<cpu, 1, DType> &label,
+                        unsigned n_output = 1) {
+  CHECK_EQ(dst.size(1)%n_output, 0)
+    << "Invalid input dimension for n_output";
+  CHECK_EQ(dst.size(0), label.size(0)/n_output)
+    << "Label and input dimensions doesn't match";
+  CHECK_EQ(label.size(0)%n_output, 0)
+    << "Invalid label dimension for n_output";
+  const unsigned xmax = dst.size(1)/n_output;
   for (index_t y = 0; y < dst.size(0); ++y) {
-    const index_t k = static_cast<int>(label[y]);
-    for (index_t x = 0; x < dst.size(1); ++x) {
-      if (x == k) {
-        dst[y][k] = src[y][k] - 1.0f;
-      } else {
-        dst[y][x] = src[y][x];
+    for (unsigned n = 0; n < n_output; ++n) {
+      const base = n*xmax;
+      const index_t k = static_cast<int>(label[y*n_output+n]);
+      for (index_t x = 0; x < xmax; ++x) {
+        if (x == k) {
+          dst[y][base + k] = src[y][base + k] - 1.0f;
+        } else {
+          dst[y][base + x] = src[y][base + x];
+        }
       }
     }
   }
@@ -284,10 +301,10 @@ inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
 
 template<typename DType>
 inline void Softmax(Tensor<cpu, 2, DType> dst,
-                    const Tensor<cpu, 2, DType> &energy) {
+                    const Tensor<cpu, 2, DType> &energy, unsigned n_output = 1) {
   CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch";
   for (index_t y = 0; y < dst.size(0); ++y) {
-    Softmax(dst[y], energy[y]);
+    Softmax(dst[y], energy[y], unsigned n_output = 1);
   }
 }
 

From 324f991bf7ffa558d5194f0441131092bbb41044 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Fri, 23 Oct 2015 15:39:09 -0700
Subject: [PATCH 3/6] [Update] change get_with_shape to concrete shape

---
 mshadow/tensor_blob.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mshadow/tensor_blob.h b/mshadow/tensor_blob.h
index 7b9e45be8c08..1d8ca52c0be6 100644
--- a/mshadow/tensor_blob.h
+++ b/mshadow/tensor_blob.h
@@ -509,7 +509,7 @@ class TBlob {
   }
   /*!
    * \brief fetch a tensor in given shape
-   * if size do not match the stored dimension, an error will be issued
+   *  If size do not match the stored size, an error will be issued
    * \return the tensor requested
    * \param shape the shape required
    * \param stream the possible stream target tensor should reside on
@@ -518,7 +518,7 @@ class TBlob {
    * \tparam DType the type of elements in the tensor
    */
   template<typename Device, int dim, typename DType>
-  inline Tensor<Device, dim, DType> get_with_shape(const TShape &shape,
+  inline Tensor<Device, dim, DType> get_with_shape(const Shape<dim> &shape,
                                                    Stream<Device> *stream = NULL) const {
     CHECK(Device::kDevMask == dev_mask_ && DataType<DType>::kFlag == type_flag_)
       << "TBlob.get_with_shape: device type do not match specified type";
@@ -526,7 +526,7 @@ class TBlob {
     CHECK_EQ(this->shape_.Size(), shape.Size())
       << "TBlob.get_with_shape: new and old shape do not match total elements";
     return Tensor<Device, dim, DType>(static_cast<DType*>(dptr_),
-                                      shape.get<dim>(),
+                                      shape,
                                       shape[dim - 1],
                                       stream);
   }

From 1b81d7318181e1b2ad05f712d67f80ee818f2f43 Mon Sep 17 00:00:00 2001
From: piiswrong <eric.jy.xie@gmail.com>
Date: Fri, 23 Oct 2015 13:21:56 -0700
Subject: [PATCH 4/6] softmax and softmaxGrad now support multiple output

---
 mshadow/cuda/tensor_gpu-inl.cuh | 71 +++++++++++++++++++++++++++++++++
 mshadow/tensor_blob.h           |  8 ++++
 mshadow/tensor_cpu-inl.h        | 40 +++++++++++++++++++
 mshadow/tensor_gpu-inl.h        | 13 ++++++
 4 files changed, 132 insertions(+)

diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh
index 4b47f115600b..f99a74a51a18 100644
--- a/mshadow/cuda/tensor_gpu-inl.cuh
+++ b/mshadow/cuda/tensor_gpu-inl.cuh
@@ -296,6 +296,77 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
        expr::MakePlan(label),
        dst.size(1), n_output);
 }
+
+template<typename DType>
+__global__ void Softmax3DGradKernel(Tensor<gpu, 3, DType> &dst,
+                                    const Tensor<gpu, 3, DType> &src,
+                                    const Tensor<gpu, 2, DType> &label) {
+  const index_t xmax = dst.size(1);
+  const int y = blockIdx.x;
+  const int n = threadIdx.x;
+
+  if (n < dst.size(2)) {
+    const int k = static_cast<int>(label[y][n]);
+    for (index_t i = 0; i < xmax; ++i) {
+      if (i == k) {
+        dst[y][i][n] = src[y][i][n] - 1.0f;
+      } else {
+        dst[y][i][n] = src[y][i][n];
+      }
+    }
+  }
+}  
+
+template<typename DType>
+__global__ void Softmax3DKernel(Tensor<gpu, 3, DType> &dst,
+                    const Tensor<gpu, 3, DType> &src) {
+  const index_t xmax = dst.size(1);
+  const int y = blockIdx.x;
+  const int n = threadIdx.x;
+
+  if (n < dst.size(2)) {
+    DType smax = src[y][0][n];
+    for (index_t i = 1; i < xmax; ++i) {
+      smax = max(smax, src[y][i][n]);
+    }
+    DType ssum = 0.0f;
+    for (index_t i = 0; i < xmax; ++i) {
+      DType p = expf(src[y][i][n] - smax);
+      ssum += p;
+      dst[y][i][n] = p;
+    }
+    for (index_t i = 0; i < xmax; ++i) {
+      dst[y][i][n] /= ssum;
+    }
+  }
+}
+
+template<typename DType>
+inline void Softmax(Tensor<gpu, 3, DType> &dst,
+                    const Tensor<gpu, 3, DType> &src) {
+  dim3 dimBlock(kBaseThreadNum);
+  dim3 dimGrid(dst.size(0), dst.size(2));
+  CHECK_EQ(dst.shape_, src.shape_) << "Softmax: shape mismatch";
+  CheckLaunchParam(dimGrid, dimBlock, "Softmax");
+  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
+  Softmax3DKernel<DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src);
+}
+
+
+template<typename DType>
+inline void SoftmaxGrad(Tensor<gpu, 3, DType> &dst,
+                        const Tensor<gpu, 3, DType> &src,
+                        const Tensor<gpu, 2, DType> &label) {
+  dim3 dimBlock(kBaseThreadNum);
+  dim3 dimGrid(dst.size(0), dst.size(2));
+  CHECK_EQ(dst.shape_, src.shape_) << "SoftmaxGrad: shape mismatch";
+  CHECK_EQ(dst.size(0), label.size(0)) << "SoftmaxGrad: label shape mismatch";
+  CHECK_EQ(dst.size(2), label.size(1)) << "SoftmaxGrad: label shape mismatch";
+  CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
+  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
+  Softmax3DGradKernel<DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src, label);
+}
+
 }  // namespace cuda
 }  // namespace mshadow
 #endif  // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_
diff --git a/mshadow/tensor_blob.h b/mshadow/tensor_blob.h
index 1d8ca52c0be6..d08156c4cad7 100644
--- a/mshadow/tensor_blob.h
+++ b/mshadow/tensor_blob.h
@@ -490,6 +490,14 @@ class TBlob {
   inline index_t size(index_t idx) const {
     return shape_[idx];
   }
+  /*! \brief total number of elements in the tensor */
+  inline index_t Size(void) const {
+    index_t size = 1;
+    for (index_t i = 0; i < shape_.ndim(); ++i) {
+      size *= shape_[i];
+    }
+    return size;
+  }
   /*!
    * \brief fetch the tensor, with respect to specific dimension
    * if dim do not match the stored dimension, an error will be issued
diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h
index 024d47926dd3..cdf51e84283a 100644
--- a/mshadow/tensor_cpu-inl.h
+++ b/mshadow/tensor_cpu-inl.h
@@ -299,6 +299,24 @@ inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
   }
 }
 
+template<typename DType>
+inline void SoftmaxGrad(Tensor<cpu, 3, DType> dst,
+                        const Tensor<cpu, 3, DType> &src,
+                        const Tensor<cpu, 2, DType> &label) {
+  for (index_t n = 0; n < dst.size(2); ++n) {
+    for (index_t y = 0; y < dst.size(0); ++y) {
+      const index_t k = static_cast<int>(label[y][n]);
+      for (index_t x = 0; x < dst.size(1); ++x) {
+        if (x == k) {
+          dst[y][k][n] = src[y][k][n] - 1.0f;
+        } else {
+          dst[y][x][n] = src[y][x][n];
+        }
+      }
+    }
+  }
+}
+
 template<typename DType>
 inline void Softmax(Tensor<cpu, 2, DType> dst,
                     const Tensor<cpu, 2, DType> &energy, unsigned n_output = 1) {
@@ -308,6 +326,28 @@ inline void Softmax(Tensor<cpu, 2, DType> dst,
   }
 }
 
+template<typename DType>
+inline void Softmax(Tensor<cpu, 3, DType> dst,
+                    const Tensor<cpu, 3, DType> &energy) {
+  CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch";
+  for (index_t y = 0; y < dst.size(0); ++y) {
+    for (index_t n = 0; n < dst.size(2); ++n) {
+      DType mmax = energy[y][0][n];
+      for (index_t x = 1; x < dst.size(1); ++x) {
+        if (mmax < energy[y][x][n]) mmax = energy[y][x][n];
+      }
+      DType sum = 0.0f;
+      for (index_t x = 0; x < dst.size(1); ++x) {
+        dst[y][x][n] = std::exp(energy[y][x][n] - mmax);
+        sum += dst[y][x][n];
+      }
+      for (index_t x = 0; x < dst.size(1); ++x) {
+        dst[y][x][n] /= sum;
+      }
+    }
+  }
+}
+
 template<typename DType>
 inline DType VDot(const Tensor<cpu, 1, DType> &lhs,
                   const Tensor<cpu, 1, DType> &rhs) {
diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h
index 3bb90249e903..76774ada05ca 100644
--- a/mshadow/tensor_gpu-inl.h
+++ b/mshadow/tensor_gpu-inl.h
@@ -159,6 +159,12 @@ inline void Softmax(Tensor<gpu, 2, DType> dst,
   cuda::Softmax(dst, src);
 }
 
+template<typename DType>
+inline void Softmax(Tensor<gpu, 3, DType> dst,
+                    const Tensor<gpu, 3, DType>& src) {
+  cuda::Softmax(dst, src);
+}
+
 template<typename DType>
 inline void SoftmaxGrad(Tensor<gpu, 2, DType> dst,
                         const Tensor<gpu, 2, DType> &src,
@@ -166,6 +172,13 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> dst,
   cuda::SoftmaxGrad(dst, src, label);
 }
 
+template<typename DType>
+inline void SoftmaxGrad(Tensor<gpu, 3, DType> dst,
+                        const Tensor<gpu, 3, DType> &src,
+                        const Tensor<gpu, 2, DType> &label) {
+  cuda::SoftmaxGrad(dst, src, label);
+}
+
 }  // namespace mshadow
 #endif  // __CUDACC__
 #endif  // MSHADOW_TENSOR_GPU_INL_H_

From 24572d4546c8cc1f694ccd937c790aad27100da1 Mon Sep 17 00:00:00 2001
From: piiswrong <eric.jy.xie@gmail.com>
Date: Fri, 23 Oct 2015 13:21:56 -0700
Subject: [PATCH 5/6] softmax and softmaxGrad now support multiple output

---
 mshadow/cuda/tensor_gpu-inl.cuh | 181 +++++++++++++++++++++-----------
 mshadow/tensor_blob.h           |   8 ++
 mshadow/tensor_cpu-inl.h        |  40 +++++++
 mshadow/tensor_gpu-inl.h        |  13 +++
 4 files changed, 183 insertions(+), 59 deletions(-)

diff --git a/mshadow/cuda/tensor_gpu-inl.cuh b/mshadow/cuda/tensor_gpu-inl.cuh
index 4b47f115600b..380876d3fa89 100644
--- a/mshadow/cuda/tensor_gpu-inl.cuh
+++ b/mshadow/cuda/tensor_gpu-inl.cuh
@@ -190,82 +190,75 @@ inline void MapReduceKeepDim1(expr::Plan<DstExp, DType> dst,
 }
 
 template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
-__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label,
-                                  index_t xmax, unsigned n_output) {
+__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label, index_t xmax) {
   const unsigned x_size = 1 << x_bits;
   const int y = blockIdx.x;
+  const int k = static_cast<int>(label.Eval(0, y));
 
-  for (unsigned n = 0; n < n_output; ++n) {
-    const int k = static_cast<int>(label.Eval(0, y*n_output + n));
-    const unsigned base = n*xmax;
-    // calculate normalizer, with writeback
-    for (unsigned x = 0; x < xmax; x += x_size) {
-      const unsigned xindex = x + threadIdx.x;
-      if (xindex < xmax) {
-        if (xindex == k) {
-          dst.REval(y, base + xindex) = src.Eval(y, base + xindex) - 1.0f;
-        } else {
-          dst.REval(y, base + xindex) = src.Eval(y, base + xindex);
-        }
+  // calculate normalizer, with writeback
+  for (unsigned x = 0; x < xmax; x += x_size) {
+    const unsigned xindex = x + threadIdx.x;
+    if (xindex < xmax) {
+      if (xindex == k) {
+        dst.REval(y, xindex) = src.Eval(y, xindex) - 1.0f;
+      } else {
+        dst.REval(y, xindex) = src.Eval(y, xindex);
       }
     }
   }
 }
 
 template<int x_bits, typename DType,  typename DstPlan, typename SrcPlan>
-__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax, unsigned n_output) {
+__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) {
   const unsigned x_size = 1 << x_bits;
   const int y = blockIdx.x;
   __shared__ DType s_rec[x_size];
-
-  for (unsigned base = 0; base < xmax*n_output; base += xmax) {
-    // step 1: get max
-    if (threadIdx.x < xmax) {
-      s_rec[threadIdx.x] = src.Eval(y, base + threadIdx.x);
-    }
-    for (unsigned x = x_size; x < xmax; x += x_size) {
-      if (x + threadIdx.x < xmax) {
-        DType a = src.Eval(y, base + x + threadIdx.x);
-        s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]);
-      }
-    }
-    __syncthreads();
-    if (threadIdx.x >= xmax) {
-      s_rec[threadIdx.x] = s_rec[0];
+  // step 1: get max
+  if (threadIdx.x < xmax) {
+    s_rec[threadIdx.x] = src.Eval(y, threadIdx.x);
+  }
+  for (unsigned x = x_size; x < xmax; x += x_size) {
+    if (x + threadIdx.x < xmax) {
+      DType a = src.Eval(y, x + threadIdx.x);
+      s_rec[threadIdx.x] = max(a, s_rec[threadIdx.x]);
     }
-    __syncthreads();
-    Reduce1D<red::maximum, x_bits>(s_rec);
-    __syncthreads();
-    DType smax = s_rec[0];
-    __syncthreads();
-    s_rec[threadIdx.x] = 0.0f;
-    __syncthreads();
+  }
+  __syncthreads();
+  if (threadIdx.x >= xmax) {
+    s_rec[threadIdx.x] = s_rec[0];
+  }
+  __syncthreads();
+  Reduce1D<red::maximum, x_bits>(s_rec);
+  __syncthreads();
+  DType smax = s_rec[0];
+  __syncthreads();
+  s_rec[threadIdx.x] = 0.0f;
+  __syncthreads();
 
-    // calculate normalizer, with writeback
-    for (unsigned x = 0; x < xmax; x += x_size) {
-      if (x + threadIdx.x < xmax) {
-        DType p = expf(src.Eval(y, base + x + threadIdx.x) - smax);
-        s_rec[threadIdx.x] += p;
-        // write back first, will fetch later
-        dst.REval(y, base + x + threadIdx.x) = p;
-      }
+  // calculate normalizer, with writeback
+  for (unsigned x = 0; x < xmax; x += x_size) {
+    if (x + threadIdx.x < xmax) {
+      DType p = expf(src.Eval(y, x + threadIdx.x) - smax);
+      s_rec[threadIdx.x] += p;
+      // write back first, will fetch later
+      dst.REval(y, x + threadIdx.x) = p;
     }
-    // calculate normalizer
-    __syncthreads();
-    Reduce1D<red::sum, x_bits>(s_rec);
-    __syncthreads();
-    DType ssum = s_rec[0];
+  }
+  // calculate normalizer
+  __syncthreads();
+  Reduce1D<red::sum, x_bits>(s_rec);
+  __syncthreads();
+  DType ssum = s_rec[0];
 
-    for (unsigned x = 0; x < xmax; x += x_size) {
-      if (x + threadIdx.x < xmax) {
-        dst.REval(y, base + x + threadIdx.x) /= ssum;
-      }
+  for (unsigned x = 0; x < xmax; x += x_size) {
+    if (x + threadIdx.x < xmax) {
+      dst.REval(y, x + threadIdx.x) /= ssum;
     }
   }
 }
 template<typename DType>
 inline void Softmax(Tensor<gpu, 2, DType> &dst,
-                    const Tensor<gpu, 2, DType> &src, unsigned n_output = 1) {
+                    const Tensor<gpu, 2, DType> &src) {
   dim3 dimBlock(kBaseThreadNum);
   dim3 dimGrid(dst.size(0));
   CHECK_EQ(dst.shape_, src.shape_) << "Softmax: shape mismatch";
@@ -275,14 +268,13 @@ inline void Softmax(Tensor<gpu, 2, DType> &dst,
       <<<dimGrid, dimBlock, 0, stream>>>
       (expr::MakePlan(dst),
        expr::MakePlan(src),
-       dst.size(1), n_output);
+       dst.size(1));
 }
 
 template<typename DType>
 inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
                         const Tensor<gpu, 2, DType> &src,
-                        const Tensor<gpu, 1, DType> &label,
-                        unsigned n_output = 1) {
+                        const Tensor<gpu, 1, DType> &label) {
   dim3 dimBlock(kBaseThreadNum);
   dim3 dimGrid(dst.size(0));
   CHECK_EQ(dst.shape_, src.shape_) << "SoftmaxGrad: shape mismatch";
@@ -294,8 +286,79 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
       (expr::MakePlan(dst),
        expr::MakePlan(src),
        expr::MakePlan(label),
-       dst.size(1), n_output);
+       dst.size(1));
+}
+
+template<typename DType>
+__global__ void Softmax3DGradKernel(Tensor<gpu, 3, DType> &dst,
+                                    const Tensor<gpu, 3, DType> &src,
+                                    const Tensor<gpu, 2, DType> &label) {
+  const index_t xmax = dst.size(1);
+  const int y = blockIdx.x;
+  const int n = threadIdx.x;
+
+  if (n < dst.size(2)) {
+    const int k = static_cast<int>(label[y][n]);
+    for (index_t i = 0; i < xmax; ++i) {
+      if (i == k) {
+        dst[y][i][n] = src[y][i][n] - 1.0f;
+      } else {
+        dst[y][i][n] = src[y][i][n];
+      }
+    }
+  }
+}  
+
+template<typename DType>
+__global__ void Softmax3DKernel(Tensor<gpu, 3, DType> &dst,
+                    const Tensor<gpu, 3, DType> &src) {
+  const index_t xmax = dst.size(1);
+  const int y = blockIdx.x;
+  const int n = threadIdx.x;
+
+  if (n < dst.size(2)) {
+    DType smax = src[y][0][n];
+    for (index_t i = 1; i < xmax; ++i) {
+      smax = max(smax, src[y][i][n]);
+    }
+    DType ssum = 0.0f;
+    for (index_t i = 0; i < xmax; ++i) {
+      DType p = expf(src[y][i][n] - smax);
+      ssum += p;
+      dst[y][i][n] = p;
+    }
+    for (index_t i = 0; i < xmax; ++i) {
+      dst[y][i][n] /= ssum;
+    }
+  }
 }
+
+template<typename DType>
+inline void Softmax(Tensor<gpu, 3, DType> &dst,
+                    const Tensor<gpu, 3, DType> &src) {
+  dim3 dimBlock(kBaseThreadNum);
+  dim3 dimGrid(dst.size(0), dst.size(2));
+  CHECK_EQ(dst.shape_, src.shape_) << "Softmax: shape mismatch";
+  CheckLaunchParam(dimGrid, dimBlock, "Softmax");
+  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
+  Softmax3DKernel<DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src);
+}
+
+
+template<typename DType>
+inline void SoftmaxGrad(Tensor<gpu, 3, DType> &dst,
+                        const Tensor<gpu, 3, DType> &src,
+                        const Tensor<gpu, 2, DType> &label) {
+  dim3 dimBlock(kBaseThreadNum);
+  dim3 dimGrid(dst.size(0), dst.size(2));
+  CHECK_EQ(dst.shape_, src.shape_) << "SoftmaxGrad: shape mismatch";
+  CHECK_EQ(dst.size(0), label.size(0)) << "SoftmaxGrad: label shape mismatch";
+  CHECK_EQ(dst.size(2), label.size(1)) << "SoftmaxGrad: label shape mismatch";
+  CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
+  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
+  Softmax3DGradKernel<DType><<<dimGrid, dimBlock, 0, stream>>>(dst, src, label);
+}
+
 }  // namespace cuda
 }  // namespace mshadow
 #endif  // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_
diff --git a/mshadow/tensor_blob.h b/mshadow/tensor_blob.h
index 1d8ca52c0be6..d08156c4cad7 100644
--- a/mshadow/tensor_blob.h
+++ b/mshadow/tensor_blob.h
@@ -490,6 +490,14 @@ class TBlob {
   inline index_t size(index_t idx) const {
     return shape_[idx];
   }
+  /*! \brief total number of elements in the tensor */
+  inline index_t Size(void) const {
+    index_t size = 1;
+    for (index_t i = 0; i < shape_.ndim(); ++i) {
+      size *= shape_[i];
+    }
+    return size;
+  }
   /*!
    * \brief fetch the tensor, with respect to specific dimension
    * if dim do not match the stored dimension, an error will be issued
diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h
index 024d47926dd3..cdf51e84283a 100644
--- a/mshadow/tensor_cpu-inl.h
+++ b/mshadow/tensor_cpu-inl.h
@@ -299,6 +299,24 @@ inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
   }
 }
 
+template<typename DType>
+inline void SoftmaxGrad(Tensor<cpu, 3, DType> dst,
+                        const Tensor<cpu, 3, DType> &src,
+                        const Tensor<cpu, 2, DType> &label) {
+  for (index_t n = 0; n < dst.size(2); ++n) {
+    for (index_t y = 0; y < dst.size(0); ++y) {
+      const index_t k = static_cast<int>(label[y][n]);
+      for (index_t x = 0; x < dst.size(1); ++x) {
+        if (x == k) {
+          dst[y][k][n] = src[y][k][n] - 1.0f;
+        } else {
+          dst[y][x][n] = src[y][x][n];
+        }
+      }
+    }
+  }
+}
+
 template<typename DType>
 inline void Softmax(Tensor<cpu, 2, DType> dst,
                     const Tensor<cpu, 2, DType> &energy, unsigned n_output = 1) {
@@ -308,6 +326,28 @@ inline void Softmax(Tensor<cpu, 2, DType> dst,
   }
 }
 
+template<typename DType>
+inline void Softmax(Tensor<cpu, 3, DType> dst,
+                    const Tensor<cpu, 3, DType> &energy) {
+  CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch";
+  for (index_t y = 0; y < dst.size(0); ++y) {
+    for (index_t n = 0; n < dst.size(2); ++n) {
+      DType mmax = energy[y][0][n];
+      for (index_t x = 1; x < dst.size(1); ++x) {
+        if (mmax < energy[y][x][n]) mmax = energy[y][x][n];
+      }
+      DType sum = 0.0f;
+      for (index_t x = 0; x < dst.size(1); ++x) {
+        dst[y][x][n] = std::exp(energy[y][x][n] - mmax);
+        sum += dst[y][x][n];
+      }
+      for (index_t x = 0; x < dst.size(1); ++x) {
+        dst[y][x][n] /= sum;
+      }
+    }
+  }
+}
+
 template<typename DType>
 inline DType VDot(const Tensor<cpu, 1, DType> &lhs,
                   const Tensor<cpu, 1, DType> &rhs) {
diff --git a/mshadow/tensor_gpu-inl.h b/mshadow/tensor_gpu-inl.h
index 3bb90249e903..76774ada05ca 100644
--- a/mshadow/tensor_gpu-inl.h
+++ b/mshadow/tensor_gpu-inl.h
@@ -159,6 +159,12 @@ inline void Softmax(Tensor<gpu, 2, DType> dst,
   cuda::Softmax(dst, src);
 }
 
+template<typename DType>
+inline void Softmax(Tensor<gpu, 3, DType> dst,
+                    const Tensor<gpu, 3, DType>& src) {
+  cuda::Softmax(dst, src);
+}
+
 template<typename DType>
 inline void SoftmaxGrad(Tensor<gpu, 2, DType> dst,
                         const Tensor<gpu, 2, DType> &src,
@@ -166,6 +172,13 @@ inline void SoftmaxGrad(Tensor<gpu, 2, DType> dst,
   cuda::SoftmaxGrad(dst, src, label);
 }
 
+template<typename DType>
+inline void SoftmaxGrad(Tensor<gpu, 3, DType> dst,
+                        const Tensor<gpu, 3, DType> &src,
+                        const Tensor<gpu, 2, DType> &label) {
+  cuda::SoftmaxGrad(dst, src, label);
+}
+
 }  // namespace mshadow
 #endif  // __CUDACC__
 #endif  // MSHADOW_TENSOR_GPU_INL_H_

From d298f4fe6d22a05f13e766c4a74d9afa1442180e Mon Sep 17 00:00:00 2001
From: piiswrong <eric.jy.xie@gmail.com>
Date: Fri, 23 Oct 2015 23:40:58 -0700
Subject: [PATCH 6/6] fix

---
 mshadow/tensor_blob.h    |  6 +---
 mshadow/tensor_cpu-inl.h | 59 ++++++++++++++--------------------------
 2 files changed, 22 insertions(+), 43 deletions(-)

diff --git a/mshadow/tensor_blob.h b/mshadow/tensor_blob.h
index d08156c4cad7..265d80059ce7 100644
--- a/mshadow/tensor_blob.h
+++ b/mshadow/tensor_blob.h
@@ -492,11 +492,7 @@ class TBlob {
   }
   /*! \brief total number of elements in the tensor */
   inline index_t Size(void) const {
-    index_t size = 1;
-    for (index_t i = 0; i < shape_.ndim(); ++i) {
-      size *= shape_[i];
-    }
-    return size;
+    return shape_.Size();
   }
   /*!
    * \brief fetch the tensor, with respect to specific dimension
diff --git a/mshadow/tensor_cpu-inl.h b/mshadow/tensor_cpu-inl.h
index cdf51e84283a..14fcd9219f4c 100644
--- a/mshadow/tensor_cpu-inl.h
+++ b/mshadow/tensor_cpu-inl.h
@@ -251,49 +251,32 @@ inline void MapReduceKeepHighDim(TRValue<R, cpu, 1, DType> *dst,
 
 template<typename DType>
 inline void Softmax(Tensor<cpu, 1, DType> dst,
-                    const Tensor<cpu, 1, DType> &energy,
-                    unsigned n_output = 1) {
-  CHECK_EQ(dst.size(0)%n_output, 0)
-    << "Invalid input dimension for output number";
-  unsigned xmax = dst.size(0)/n_output;
-  for (unsigned base = 0; base < dst.size(0); base += xmax) {
-    DType mmax = energy[base];
-    for (index_t x = 1; x < xmax; ++x) {
-      if (mmax < energy[base + x]) mmax = energy[base + x];
-    }
-    DType sum = 0.0f;
-    for (index_t x = 0; x < xmax; ++x) {
-      dst[base + x] = std::exp(energy[base + x] - mmax);
-      sum += dst[base + x];
-    }
-    for (index_t x = 0; x < xmax; ++x) {
-      dst[base + x] /= sum;
-    }
+                    const Tensor<cpu, 1, DType> &energy) {
+  DType mmax = energy[0];
+  for (index_t x = 1; x < dst.size(0); ++x) {
+    if (mmax < energy[x]) mmax = energy[x];
+  }
+  DType sum = 0.0f;
+  for (index_t x = 0; x < dst.size(0); ++x) {
+    dst[x] = std::exp(energy[x] - mmax);
+    sum += dst[x];
+  }
+  for (index_t x = 0; x < dst.size(0); ++x) {
+    dst[x] /= sum;
   }
 }
 
 template<typename DType>
 inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
                         const Tensor<cpu, 2, DType> &src,
-                        const Tensor<cpu, 1, DType> &label,
-                        unsigned n_output = 1) {
-  CHECK_EQ(dst.size(1)%n_output, 0)
-    << "Invalid input dimension for n_output";
-  CHECK_EQ(dst.size(0), label.size(0)/n_output)
-    << "Label and input dimensions doesn't match";
-  CHECK_EQ(label.size(0)%n_output, 0)
-    << "Invalid label dimension for n_output";
-  const unsigned xmax = dst.size(1)/n_output;
+                        const Tensor<cpu, 1, DType> &label) {
   for (index_t y = 0; y < dst.size(0); ++y) {
-    for (unsigned n = 0; n < n_output; ++n) {
-      const base = n*xmax;
-      const index_t k = static_cast<int>(label[y*n_output+n]);
-      for (index_t x = 0; x < xmax; ++x) {
-        if (x == k) {
-          dst[y][base + k] = src[y][base + k] - 1.0f;
-        } else {
-          dst[y][base + x] = src[y][base + x];
-        }
+    const index_t k = static_cast<int>(label[y]);
+    for (index_t x = 0; x < dst.size(1); ++x) {
+      if (x == k) {
+        dst[y][k] = src[y][k] - 1.0f;
+      } else {
+        dst[y][x] = src[y][x];
       }
     }
   }
@@ -319,10 +302,10 @@ inline void SoftmaxGrad(Tensor<cpu, 3, DType> dst,
 
 template<typename DType>
 inline void Softmax(Tensor<cpu, 2, DType> dst,
-                    const Tensor<cpu, 2, DType> &energy, unsigned n_output = 1) {
+                    const Tensor<cpu, 2, DType> &energy) {
   CHECK_EQ(dst.shape_, energy.shape_) << "Softmax: shape mismatch";
   for (index_t y = 0; y < dst.size(0); ++y) {
-    Softmax(dst[y], energy[y], unsigned n_output = 1);
+    Softmax(dst[y], energy[y]);
   }
 }