diff --git a/docs/api/io.rst b/docs/api/io.rst
index e5cb3ffb32e5..8ec67cf6d073 100644
--- a/docs/api/io.rst
+++ b/docs/api/io.rst
@@ -351,7 +351,7 @@ libmxnet data providers
    
    
    :param prefetch_buffer: Backend Param: Number of prefetched parameters
-   :type prefetch_buffer: , optional, default=4
+   :type prefetch_buffer: long (non-negative), optional, default=4
    
    
    :param rand_crop: Augmentation Param: Whether to random crop on the image
@@ -514,7 +514,7 @@ libmxnet data providers
    
    
    :param prefetch_buffer: Backend Param: Number of prefetched parameters
-   :type prefetch_buffer: , optional, default=4
+   :type prefetch_buffer: long (non-negative), optional, default=4
    
    :return: the constructed :class:`MXDataProvider`.
 
diff --git a/docs/api/metric.rst b/docs/api/metric.rst
index db503d5e6849..966d682ba2b7 100644
--- a/docs/api/metric.rst
+++ b/docs/api/metric.rst
@@ -56,3 +56,12 @@ set.
 
 
 
+
+.. class:: ACE
+
+   Averaged cross-entropy for classification. This is also known as logloss.
+
+   Calculates the averaged cross-entropy for multi-dimensional output.
+
+
+
diff --git a/docs/api/ndarray.rst b/docs/api/ndarray.rst
index be1c74b80bea..b72f0faa5cfe 100644
--- a/docs/api/ndarray.rst
+++ b/docs/api/ndarray.rst
@@ -687,6 +687,28 @@ Public APIs
 
 
 
+
+.. function:: sum_mid_internal(...)
+
+   Take sum on middle dimension of the 3D src.
+   
+   :param src: Source input to the function
+   :type src: NDArray
+   
+
+
+
+
+.. function:: transpose(...)
+
+   Transpose the input matrix and return a new one
+   
+   :param src: Source input to the function
+   :type src: NDArray
+   
+
+
+
 Internal APIs
 ^^^^^^^^^^^^^
 
@@ -694,6 +716,25 @@ Internal APIs
 
    Document and signatures for internal API functions might be incomplete.
 
+.. function:: _broadcast(...)
+
+   Broadcast array in the given axis to the given size
+   
+   :param src: source ndarray
+   :type src: NDArray
+   
+   
+   :param axis: axis to broadcast
+   :type axis: int
+   
+   
+   :param size: size of broadcast
+   :type size: int
+   
+
+
+
+
 .. function:: _copyto(...)
 
    
diff --git a/docs/api/symbolic-node.rst b/docs/api/symbolic-node.rst
index a390b82dd09d..e303afba21dc 100644
--- a/docs/api/symbolic-node.rst
+++ b/docs/api/symbolic-node.rst
@@ -346,11 +346,15 @@ Public APIs
 
 .. function:: Crop(...)
 
-   Crop the 2nd and 3rd dim of input data, with the corresponding size of w_h or with width and height of the second input symbol
+   Crop the 2nd and 3rd dim of input data, with the corresponding size of h_w or with width and height of the second input symbol, i.e., with one input, we need h_w to specify the crop height and width, otherwise the second input symbol's size will be used
    
    This function support variable length positional :class:`SymbolicNode` inputs.
    
-   :param num_args: Number of inputs for crop, if equals one, then we will use the h_wfor crop heihgt and width, else if equals two, then we will use the heightand width of the second input symbol, we name crop_like here
+   :param data: Tensor or List of Tensors, the second input will be used as crop_like shape reference
+   :type data: SymbolicNode or SymbolicNode[]
+   
+   
+   :param num_args: Number of inputs for crop, if equals one, then we will use the h_w for crop height and width, else if equals two, then we will use the height and width of the second input symbol, we name crop_like here
    :type num_args: int, required
    
    
@@ -374,6 +378,34 @@ Public APIs
 
 
 
+.. function:: CuDNNBatchNorm(...)
+
+   Apply batch normalization to input.
+   
+   :param data: Input data to batch normalization
+   :type data: SymbolicNode
+   
+   
+   :param eps: Epsilon to prevent div 0
+   :type eps: float, optional, default=0.001
+   
+   
+   :param momentum: Momentum for moving average
+   :type momentum: float, optional, default=0.9
+   
+   
+   :param fix_gamma: Fix gamma while training
+   :type fix_gamma: boolean, optional, default=False
+   
+   :param Symbol name: The name of the :class:`SymbolicNode`. (e.g. `:my_symbol`), optional.
+   :param Dict{Symbol, AbstractString} attrs: The attributes associated with this :class:`SymbolicNode`.
+   
+   :return: SymbolicNode.
+   
+
+
+
+
 .. function:: Deconvolution(...)
 
    Apply deconvolution to input then add a bias.
@@ -759,11 +791,15 @@ Public APIs
 
 .. function:: SliceChannel(...)
 
-   Slice channel into many outputs with equally divided channel
+   Slice input equally along specified axis
    
    :param num_outputs: Number of outputs to be sliced.
    :type num_outputs: int, required
    
+   
+   :param axis: Dimension along which to slice.
+   :type axis: int, optional, default='1'
+   
    :param Symbol name: The name of the :class:`SymbolicNode`. (e.g. `:my_symbol`), optional.
    :param Dict{Symbol, AbstractString} attrs: The attributes associated with this :class:`SymbolicNode`.
    
diff --git a/src/metric.jl b/src/metric.jl
index 5bf14e52a840..a22794e9f158 100644
--- a/src/metric.jl
+++ b/src/metric.jl
@@ -51,47 +51,36 @@ type Accuracy <: AbstractEvalMetric
   Accuracy() = new(0.0, 0)
 end
 
-"""
-Implementation taken from findmax in Julia base.
-Searches for the maximum value in p_dim of a.
-I and n are values for the other dimensions.
-"""
-function _indmax(a, I, p_dim, n)
-  m = a[I..., 1, n]
-  mi = 1
-  for i in 2:size(a, p_dim)
-    ai = a[I..., i, n]
-    if ai > m || m!=m
-      m = ai
-      mi = i
-    end
-  end
-  return mi
-end
-
 function _update_single_output(metric :: Accuracy, label :: NDArray, pred :: NDArray)
   @nd_as_jl ro=(label,pred) begin
-    if ndims(pred) > 2 # Multidimensional case
-      # Construct cartesian index
-      p_dim = ndims(pred)-1
-      initial = tuple(fill(1,p_dim-1)...)
-      dims = size(pred, (1:p_dim-1)...)
-      crange = CartesianRange(CartesianIndex(initial), CartesianIndex(dims))
-
-      for sample in 1:size(label, ndims(label))
-        for i in crange
-          l_i = sub2ind(dims, i.I...)
-          klass = _indmax(pred, i.I, p_dim, sample)
-          metric.acc_sum += (klass-1) == label[l_i, sample]
-          metric.n_sample += 1
+    # Samples are stored in the last dimension
+    @assert size(label, ndims(label)) == size(pred, ndims(pred))
+
+    if ndims(pred) == 4 # Multidimensional case
+      # Reshape label to be of the same shape as pred.
+      # Except for the third dimension where the predictions are stored.
+      labels = reshape(label, size(pred, 1, 2)..., 1, size(pred, 4))
+
+      for sample in 1:size(labels, 4)
+        for j in 1:size(labels, 2)
+          for i in 1:size(labels, 1)
+            label = labels[i, j, 1, sample]
+            klasses = sub(pred, i, j, :, sample)
+            klass = indmax(klasses) - 1 # Classes start at 0...k-1
+
+            metric.acc_sum += klass == label
+            metric.n_sample += 1
+          end
         end
       end
-    else # 1-dimensional case
+    elseif ndims(pred) == 2 # 1-dimensional case
       for sample in 1:size(label, 1)
-        klass = indmax(pred[:, sample])
-        metric.acc_sum += (klass-1) == label[sample]
+        klass = indmax(sub(pred, :, sample)) - 1
+        metric.acc_sum += klass == label[sample]
         metric.n_sample += 1
       end
+    else
+      error("Can't handle prediction with dimensions $(ndims(pred)).")
     end
   end
 end
@@ -155,3 +144,59 @@ function reset!(metric :: MSE)
   metric.mse_sum  = 0.0
   metric.n_sample = 0
 end
+
+#=doc
+.. class:: ACE
+
+   Averaged cross-entropy for classification. This is also known as logloss.
+
+   Calculates the averaged cross-entropy for multi-dimensional output.
+=#
+type ACE <: AbstractEvalMetric
+  ace_sum  :: Float64
+  n_sample :: Int
+
+  ACE() = new(0.0, 0)
+end
+
+function get(metric :: ACE)
+  return [(:ACE, - metric.ace_sum / metric.n_sample)]
+end
+
+function reset!(metric :: ACE)
+  metric.ace_sum = 0.0
+  metric.n_sample = 0
+end
+
+function _update_single_output(metric :: ACE, label :: NDArray, pred :: NDArray)
+  @nd_as_jl ro=(label,pred) begin
+    # Samples are stored in the last dimension
+    @assert size(label, ndims(label)) == size(pred, ndims(pred))
+    @assert ndims(pred) == 4
+
+    labels = reshape(label, size(pred, 1, 2)..., 1, size(pred, 4))
+    for sample in 1:size(labels, 4)
+      for j in 1:size(labels, 2)
+        for i in 1:size(labels, 1)
+          label = labels[i, j, 1, sample]
+
+          # Cross-entropy reduces to -(ln(p_1)*0 + ln(p_2)*1) for classification
+          # Since we can only target labels right now this is the only thing we can do.
+          target = Int(label) + 1 # klasses are 0...k-1 => julia indexing
+          p_k = pred[i, j, target, sample]
+
+          metric.ace_sum += log(p_k)
+          metric.n_sample += 1
+        end
+      end
+    end
+  end
+end
+
+function update!(metric :: ACE, labels :: Vector{NDArray}, preds :: Vector{NDArray})
+  @assert length(labels) == length(preds)
+  for i = 1:length(labels)
+    _update_single_output(metric, labels[i], preds[i])
+  end
+end
+