diff --git a/cd/mxnet_lib/mxnet_lib_pipeline.groovy b/cd/mxnet_lib/mxnet_lib_pipeline.groovy index c35db6d3f25f..27064a3a21f6 100644 --- a/cd/mxnet_lib/mxnet_lib_pipeline.groovy +++ b/cd/mxnet_lib/mxnet_lib_pipeline.groovy @@ -42,16 +42,6 @@ def get_pipeline(mxnet_variant, build_fn) { } } - if (mxnet_variant.startsWith('cu')) { - tests["${mxnet_variant}: Quantization Python 3"] = { - stage("${mxnet_variant}: Quantization Python 3") { - timeout(time: max_time, unit: 'MINUTES') { - test_gpu_quantization_py3(mxnet_variant) - } - } - } - } - parallel tests } @@ -103,17 +93,6 @@ def unittest_py3(mxnet_variant) { } } -// Tests quantization in P3 instance using Python 3 -def test_gpu_quantization_py3(mxnet_variant) { - node(NODE_LINUX_GPU_P3) { - ws("workspace/mxnet_${libtype}/${mxnet_variant}/${env.BUILD_NUMBER}") { - def image = get_environment(mxnet_variant) - ci_utils.unpack_and_init("mxnet_${mxnet_variant}", get_stash(mxnet_variant), false) - ci_utils.docker_run(image, "unittest_ubuntu_python3_quantization_gpu", true) - } - } -} - // Pushes artifact to artifact repository def push(mxnet_variant) { node(NODE_LINUX_CPU) { diff --git a/ci/docker/Dockerfile.build.ubuntu b/ci/docker/Dockerfile.build.ubuntu index ce58839e940e..415e6ae881ae 100644 --- a/ci/docker/Dockerfile.build.ubuntu +++ b/ci/docker/Dockerfile.build.ubuntu @@ -126,10 +126,6 @@ RUN update-java-alternatives -s java-1.8.0-openjdk-amd64 COPY install/ubuntu_julia.sh /work/ RUN /work/ubuntu_julia.sh -# PDL::CCS missing on 18.04 -COPY install/ubuntu_perl.sh /work/ -RUN /work/ubuntu_perl.sh - # MXNetJS nightly needs emscripten for wasm COPY install/ubuntu_emscripten.sh /work/ RUN /work/ubuntu_emscripten.sh diff --git a/ci/docker/Dockerfile.build.ubuntu_cpu_julia b/ci/docker/Dockerfile.build.ubuntu_cpu_julia index eb8e30c99cd2..e100d4df09a8 100644 --- a/ci/docker/Dockerfile.build.ubuntu_cpu_julia +++ b/ci/docker/Dockerfile.build.ubuntu_cpu_julia @@ -39,9 +39,6 @@ RUN /work/ubuntu_scala.sh COPY install/ubuntu_clojure.sh /work/ RUN /work/ubuntu_clojure.sh -COPY install/ubuntu_perl.sh /work/ -RUN /work/ubuntu_perl.sh - COPY install/ubuntu_julia.sh /work/ RUN /work/ubuntu_julia.sh diff --git a/ci/docker/install/ubuntu_perl.sh b/ci/docker/install/ubuntu_perl.sh deleted file mode 100755 index e04141eee322..000000000000 --- a/ci/docker/install/ubuntu_perl.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-
-# build and install are separated so changes to build don't invalidate
-# the whole docker cache for the image
-
-set -ex
-# install libraries for mxnet's perl package on ubuntu
-apt-get update || true
-apt-get install -y libmouse-perl pdl cpanminus swig libgraphviz-perl
-cpanm -q Function::Parameters Hash::Ordered PDL::CCS
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 0ee0e4bffca4..3886d1134987 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -925,7 +925,6 @@ cd_unittest_ubuntu() {
 
     pytest -m 'not serial' -n 4 --durations=50 --verbose tests/python/unittest
     pytest -m 'serial' --durations=50 --verbose tests/python/unittest
-    pytest -n 4 --durations=50 --verbose tests/python/quantization
 
     # https://github.com/apache/incubator-mxnet/issues/11801
     # if [[ ${mxnet_variant} = "cpu" ]] || [[ ${mxnet_variant} = "mkl" ]]; then
@@ -963,7 +962,6 @@ unittest_ubuntu_python3_cpu() {
     MXNET_ENGINE_TYPE=NaiveEngine \
         pytest -m 'not serial' -k 'test_operator' -n 4 --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
     pytest -m 'serial' --durations=50 --cov-report xml:tests_unittest.xml --cov-append --verbose tests/python/unittest
-    pytest -n 4 --durations=50 --cov-report xml:tests_quantization.xml --verbose tests/python/quantization
 }
 
 unittest_ubuntu_python3_cpu_serial() {
@@ -976,7 +974,6 @@ unittest_ubuntu_python3_cpu_serial() {
     export MXNET_ENABLE_CYTHON=0
     export DMLC_LOG_STACK_TRACE_DEPTH=10
     pytest --durations=50 --cov-report xml:tests_unittest.xml --verbose tests/python/unittest
-    pytest --durations=50 --cov-report xml:tests_quantization.xml --verbose tests/python/quantization
 }
 
 unittest_ubuntu_python3_cpu_mkldnn() {
@@ -1044,38 +1041,6 @@ unittest_ubuntu_python3_gpu_nocudnn() {
     pytest -m 'serial' --durations=50 --cov-report xml:tests_gpu.xml --cov-append --verbose tests/python/gpu
 }
 
-unittest_ubuntu_tensorrt_gpu() {
-    set -ex
-    export PYTHONPATH=./python/
-    export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    export MXNET_SUBGRAPH_VERBOSE=0
-    export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
-    export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
-    export MXNET_ENABLE_CYTHON=0
-    export DMLC_LOG_STACK_TRACE_DEPTH=10
-    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
-        pytest -n 4 --durations=50 --cov-report xml:tests_trt_gpu.xml --verbose --capture=no tests/python/tensorrt/test_ops.py
-    pytest -k 'not test_ops' --durations=50 --cov-report xml:tests_trt_gpu.xml --cov-append --verbose --capture=no tests/python/tensorrt/
-}
-
-# quantization gpu currently only runs on P3 instances
-# need to separate it from unittest_ubuntu_python3_gpu()
-unittest_ubuntu_python3_quantization_gpu() {
-    set -ex
-    if [ -f /etc/redhat-release ]; then
-        source /opt/rh/rh-python36/enable
-    fi
-    export PYTHONPATH=./python/
-    export MXNET_MKLDNN_DEBUG=0 # Ignored if not present
-    export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
-    export MXNET_SUBGRAPH_VERBOSE=0
-    export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3}
-    export MXNET_ENABLE_CYTHON=0
-    export DMLC_LOG_STACK_TRACE_DEPTH=10
-    MXNET_GPU_MEM_POOL_TYPE=Unpooled \
-        pytest -n 4 --durations=50 --cov-report xml:tests_quantization_gpu.xml --verbose tests/python/quantization_gpu
-}
-
 unittest_centos7_cpu_scala() {
     set -ex
     source /opt/rh/devtoolset-7/enable
@@ -1104,11 +1069,6 @@ unittest_ubuntu_cpu_clojure_integration() {
 }
 
-unittest_ubuntu_cpugpu_perl() {
-    set -ex
-    ./perl-package/test.sh
-}
-
 unittest_cpp() {
     set -ex
     build/tests/mxnet_unit_tests
diff --git a/ci/jenkins/Jenkins_steps.groovy 
b/ci/jenkins/Jenkins_steps.groovy index a0b08aa77590..200ce9ca880e 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -839,24 +839,6 @@ def test_unix_python3_gpu(lib_name) { }] } -def test_unix_python3_quantize_gpu(lib_name) { - return ['Python3: Quantize GPU': { - node(NODE_LINUX_GPU_P3) { - ws('workspace/ut-python3-quantize-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - try { - utils.unpack_and_init(lib_name, mx_lib) - utils.docker_run('ubuntu_gpu_cu101', 'unittest_ubuntu_python3_quantization_gpu', true) - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('tests_quantization_gpu.xml', 'tests_python3_quantize_gpu.xml') - } - } - } - } - }] -} - def test_unix_python3_debug_cpu() { return ['Python3: CPU debug': { node(NODE_LINUX_CPU) { @@ -955,24 +937,6 @@ def test_unix_python3_mkldnn_nocudnn_gpu(lib_name) { }] } -def test_unix_python3_tensorrt_gpu(lib_name) { - return ['Python3: TensorRT GPU': { - node(NODE_LINUX_GPU_P3) { - ws('workspace/build-tensorrt') { - timeout(time: max_time, unit: 'MINUTES') { - try { - utils.unpack_and_init(lib_name, mx_tensorrt_lib) - utils.docker_run('ubuntu_gpu_tensorrt', 'unittest_ubuntu_tensorrt_gpu', true) - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('tests_tensorrt.xml', 'tests_python3_tensorrt_gpu.xml') - } - } - } - } - }] -} - def test_unix_cpp_package_gpu(lib_name) { return ['cpp-package GPU Makefile': { node(NODE_LINUX_GPU_G4) { @@ -1084,20 +1048,6 @@ def test_unix_r_mkldnn_cpu(lib_name) { }] } -def test_unix_perl_cpu(lib_name) { - return ['Perl: CPU Makefile': { - node(NODE_LINUX_CPU) { - ws('workspace/ut-perl-cpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init(lib_name, mx_lib_make) - utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpugpu_perl', false) - utils.publish_test_coverage() - } - } - } - }] -} - def test_unix_cpp_gpu(lib_name) { return ['Cpp: GPU': { node(NODE_LINUX_GPU_G4) { @@ -1126,20 +1076,6 @@ def test_unix_cpp_cpu(lib_name) { }] } -def test_unix_perl_gpu(lib_name) { - return ['Perl: GPU Makefile': { - node(NODE_LINUX_GPU_G4) { - ws('workspace/ut-perl-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init(lib_name, mx_lib_make) - utils.docker_run('ubuntu_gpu_cu101', 'unittest_ubuntu_cpugpu_perl', true) - utils.publish_test_coverage() - } - } - } - }] -} - def test_unix_r_gpu(lib_name) { return ['R: GPU': { node(NODE_LINUX_GPU_G4) { @@ -1312,19 +1248,6 @@ def test_centos7_python3_cd_gpu(lib_name) { }] } -def test_centos7_quantization_cd_gpu(lib_name) { - return ['Quantization Python3: CentOS 7 GPU CD': { - node(NODE_LINUX_GPU_P3) { - ws('workspace/test-cd-static/gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init(lib_name, mx_cd_lib) - utils.docker_run('centos7_gpu_cu102', 'unittest_ubuntu_python3_quantization_gpu', true) - } - } - } - }] -} - def test_centos7_pypi_package_cd_gpu(lib_name) { return ['PyPI package: CentOS 7 GPU CD': { node(NODE_LINUX_GPU) { diff --git a/ci/jenkins/Jenkinsfile_centos_gpu b/ci/jenkins/Jenkinsfile_centos_gpu index 62fec050a1b1..1eff794d5a0d 100644 --- a/ci/jenkins/Jenkinsfile_centos_gpu +++ b/ci/jenkins/Jenkinsfile_centos_gpu @@ -42,7 +42,6 @@ core_logic: { utils.parallel_stage('Tests', [ custom_steps.test_centos7_python3_gpu('centos7_gpu'), custom_steps.test_centos7_python3_cd_gpu('centos7_gpu_cd'), - custom_steps.test_centos7_quantization_cd_gpu('centos7_gpu_cd'), custom_steps.test_centos7_pypi_package_cd_gpu('centos7_gpu_cd') ]) 
}
diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu
index 9335b5b33302..86498f0c3d1f 100644
--- a/ci/jenkins/Jenkinsfile_unix_cpu
+++ b/ci/jenkins/Jenkinsfile_unix_cpu
@@ -54,7 +54,6 @@ core_logic: {
     custom_steps.test_unix_scala_mkldnn_cpu('mkldnn_cpu_make'),
     custom_steps.test_unix_clojure_cpu('cpu_make'),
     custom_steps.test_unix_clojure_integration_cpu('cpu_make'),
-    custom_steps.test_unix_perl_cpu('cpu_make'),
     custom_steps.test_unix_r_cpu('cpu'),
     custom_steps.test_unix_r_mkldnn_cpu('mkldnn_cpu'),
     custom_steps.test_unix_julia07_cpu('cpu'),
diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu
index d662a95e014f..8ea598e37882 100644
--- a/ci/jenkins/Jenkinsfile_unix_gpu
+++ b/ci/jenkins/Jenkinsfile_unix_gpu
@@ -48,11 +48,8 @@ core_logic: {
 
   utils.parallel_stage('Tests', [
     custom_steps.test_unix_python3_gpu('gpu'),
-    custom_steps.test_unix_python3_quantize_gpu('gpu'),
     custom_steps.test_unix_python3_mkldnn_gpu('mkldnn_gpu'),
     custom_steps.test_unix_python3_mkldnn_nocudnn_gpu('mkldnn_gpu_nocudnn'),
-    custom_steps.test_unix_python3_tensorrt_gpu('tensorrt'),
-    custom_steps.test_unix_perl_gpu('gpu_make'),
     custom_steps.test_unix_r_gpu('gpu'),
     custom_steps.test_unix_cpp_gpu('cmake_gpu'),
     custom_steps.test_unix_cpp_package_gpu('gpu_make'),
diff --git a/docs/python_docs/python/api/gluon/contrib/index.rst b/docs/python_docs/python/api/gluon/contrib/index.rst
index b4766a877dfa..b65c1f974f29 100644
--- a/docs/python_docs/python/api/gluon/contrib/index.rst
+++ b/docs/python_docs/python/api/gluon/contrib/index.rst
@@ -50,7 +50,6 @@ Neural Network
     Concurrent
     HybridConcurrent
     Identity
-    SparseEmbedding
     SyncBatchNorm
     PixelShuffle1D
     PixelShuffle2D
@@ -165,4 +164,4 @@ API Reference
 .. automodule:: mxnet.gluon.contrib.estimator
     :members:
-    :imported-members:
\ No newline at end of file
+    :imported-members:
diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md
index 8239f2b7de07..9479eb8d6a0c 100644
--- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md
+++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md
@@ -465,10 +465,6 @@ Memory Allocation for Weight Gradient: 0.000 MBs ( 0.050%) for fullyconnected3
 ```
 
-### Advanced: Sparse `weight`
-
-You can optimize this example further by setting the weight's `stype` to `'row_sparse'`, but whether `'row_sparse'` weights make sense or not will depend on your specific task. See [contrib.SparseEmbedding](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/contrib/nn/basic_layers.py#L118) for an example of this.
-
 ## Conclusion
 
 As part of this tutorial, we learned how to write sparse data to disk in LibSVM format and load it back in sparse batches with the [LibSVMIter](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter). We learned how to improve the performance of Gluon's [nn.Dense](/api/python/docs/api/gluon/nn/index.html#mxnet.gluon.nn.Dense) on sparse arrays using `mx.nd.sparse`. And lastly, we set `grad_stype` to `'row_sparse'` to reduce the size of the gradient and speed up the parameter update step. 
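Editor's note: with `contrib.SparseEmbedding` removed, the `grad_stype` idea the conclusion above refers to can still be illustrated directly with the plain `Embedding` operator. This is a minimal sketch, not part of the original tutorial; the shapes and values are made up for illustration:

```python
import mxnet as mx

# Sketch: request a row-sparse gradient for an embedding weight.
# With sparse_grad=True, the backward pass of Embedding only writes the
# rows that were actually looked up, so the gradient stays small.
x = mx.nd.array([[0, 2]])                    # two token ids (illustrative)
weight = mx.nd.random.uniform(shape=(4, 3))  # 4-word vocab, 3-dim embedding
weight.attach_grad(stype='row_sparse')       # store grad as a RowSparseNDArray
with mx.autograd.record():
    y = mx.nd.Embedding(x, weight, input_dim=4, output_dim=3, sparse_grad=True)
y.backward()
print(weight.grad.stype)  # 'row_sparse': only rows 0 and 2 carry data
```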
diff --git a/docs/python_docs/python/tutorials/performance/backend/tensorrt/tensorrt.md b/docs/python_docs/python/tutorials/performance/backend/tensorrt/tensorrt.md
deleted file mode 100644
index 44082f977eb7..000000000000
--- a/docs/python_docs/python/tutorials/performance/backend/tensorrt/tensorrt.md
+++ /dev/null
@@ -1,125 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-<!--- -->
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-<!--- -->
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# Optimizing Deep Learning Computation Graphs with TensorRT
-
-NVIDIA's TensorRT is a deep learning library that has been shown to provide large speedups when used for network inference. MXNet 1.5.0 and later versions ship with experimental integrated support for TensorRT. This means MXNet users can now make use of this acceleration library to efficiently run their networks. In this tutorial we'll see how to install, enable and run TensorRT with MXNet. We'll also give some insight into what is happening behind the scenes in MXNet to enable TensorRT graph execution.
-
-## Installation and Prerequisites
-To use MXNet with TensorRT integration you'll have to follow the MXNet build from source instructions, and have a few extra packages installed on your machine. First ensure that you are running Ubuntu 18.04, that you have updated your video drivers, and that you have installed CUDA 10.1 or newer. You'll need a Pascal or newer generation NVIDIA GPU. You'll also have to download and install TensorRT libraries [instructions here](https://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html). Once you have these prerequisites installed you can follow the [recommended instructions for building MXNet for NVIDIA GPUs](https://mxnet.apache.org/get_started/build_from_source#recommended-for-Systems-with-NVIDIA-GPUs-and-Intel-CPUs) but include the additional CMake flag -DUSE_TENSORRT=1.
-
-## Sample Models
-### Resnet 18
-TensorRT is an inference-only library, so for the purposes of this tutorial we will be using a pre-trained network, in this case a Resnet 18. Resnets are a computationally intensive model architecture that is often used as a backbone for various computer vision tasks. Resnets are also commonly used as a reference for benchmarking deep learning library performance. In this section we'll use a pretrained Resnet 18 from the [Gluon Model Zoo](/api/python/docs/api/gluon/model_zoo/index.html) and compare its inference speed with TensorRT, using MXNet with TensorRT integration turned off as a baseline.
-
-## Model Initialization
-```python
-import mxnet as mx
-from mxnet.gluon.model_zoo import vision
-import time
-import os
-
-batch_shape = (1, 3, 224, 224)
-resnet18 = vision.resnet18_v2(pretrained=True)
-resnet18.hybridize()
-resnet18.forward(mx.nd.zeros(batch_shape))
-resnet18.export('resnet18_v2')
-sym, arg_params, aux_params = mx.model.load_checkpoint('resnet18_v2', 0)
-```
-In our first section of code we import the modules needed to run MXNet and to time our benchmark runs. We then download a pretrained version of Resnet18, hybridize it, and load it symbolically. It's important to note that the experimental version of TensorRT integration will only work with the symbolic MXNet API. If you're using Gluon, you must [hybridize](https://gluon.mxnet.io/chapter07_distributed-learning/hybridize.html) your computation graph and export it as a symbol before running inference. This may be addressed in future releases of MXNet, but in general if you're concerned about getting the best inference performance possible from your models, it's a good practice to hybridize. 
-
-## MXNet Baseline Performance
-```python
-# Create sample input
-input = mx.nd.zeros(batch_shape)
-
-# Execute with MXNet
-executor = sym.simple_bind(ctx=mx.gpu(0), data=batch_shape, grad_req='null', force_rebind=True)
-executor.copy_params_from(arg_params, aux_params)
-
-# Warmup
-print('Warming up MXNet')
-for i in range(0, 10):
-    y_gen = executor.forward(is_train=False, data=input)
-    y_gen[0].wait_to_read()
-
-# Timing
-print('Starting MXNet timed run')
-start = time.process_time()
-for i in range(0, 10000):
-    y_gen = executor.forward(is_train=False, data=input)
-    y_gen[0].wait_to_read()
-print(time.process_time() - start)
-```
-
-We are interested in inference performance, so to simplify the benchmark we'll pass a tensor filled with zeros as an input. We bind a symbol as usual, returning an MXNet executor, and we run forward on this executor in a loop. To help improve the accuracy of our benchmarks we run a small number of predictions as a warmup before running our timed loop. On a modern PC with an RTX 2070 GPU the time taken for our MXNet baseline is **17.20s**. Next we'll run the same model with TensorRT enabled, and see how the performance compares.
-
-## MXNet with TensorRT Integration Performance
-```python
-# Execute with TensorRT
-print('Building TensorRT engine')
-trt_sym = sym.get_backend_symbol('TensorRT')
-arg_params, aux_params = mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params)
-mx.contrib.tensorrt.set_use_fp16(True)
-executor = trt_sym.simple_bind(ctx=mx.gpu(), data=batch_shape,
-                               grad_req='null', force_rebind=True)
-executor.copy_params_from(arg_params, aux_params)
-```
-
-We use a few TensorRT-specific API calls from the contrib package here to set up our parameters and indicate we'd like to run inference in fp16 mode. We then call simple_bind as normal and copy our parameter dictionaries to our executor.
-
-```python
-# Warmup
-print('Warming up TensorRT')
-for i in range(0, 10):
-    y_gen = executor.forward(is_train=False, data=input)
-    y_gen[0].wait_to_read()
-
-# Timing
-print('Starting TensorRT timed run')
-start = time.process_time()
-for i in range(0, 10000):
-    y_gen = executor.forward(is_train=False, data=input)
-    y_gen[0].wait_to_read()
-print(time.process_time() - start)
-```
-
-We run timing with a warmup once more; on the same machine the TensorRT-enabled run finishes in **9.83s**, a 1.75x speed improvement! Speed improvements when using libraries like TensorRT can come from a variety of optimizations, but in this case our speedups are coming from a technique known as [operator fusion](http://ziheng.org/2016/11/21/fusion-and-runtime-compilation-for-nnvm-and-tinyflow/).
-
-## Operators and Subgraph Fusion
-
-Behind the scenes a number of interesting things are happening to make these optimizations possible, and most revolve around subgraphs and operator fusion. As we can see in the images below, neural networks can be represented as computation graphs of operators (nodes in the graphs). Operators can perform a variety of functions, but most run simple mathematics and linear algebra on tensors. Often these operators run more efficiently if fused together into a large CUDA kernel that is executed on the GPU in a single call. What the MXNet TensorRT integration enables is the ability to scan the entire computation graph, identify interesting subgraphs and optimize them with TensorRT. 
-
-This means that when an MXNet computation graph is constructed, it will be parsed to determine if there are any sub-graphs that contain operator types that are supported by TensorRT. If MXNet determines that there are one (or many) compatible subgraphs during the graph-parse, it will extract these graphs and replace them with special TensorRT nodes (visible in the diagrams below). As the graph is executed, whenever a TensorRT node is reached the graph will make a library call to TensorRT. TensorRT will then run its own implementation of the subgraph, potentially with many operators fused together into a single CUDA kernel.
-
-During this process MXNet will take care of passing along the input to the node and fetching the results. MXNet will also attempt to remove any duplicated weights (parameters) during the graph initialization to keep memory usage low. That is, if there are graph weights that are used only in the TensorRT sections of the graph, they will be removed from the MXNet set of parameters, and their memory will be freed.
-
-The examples below show a Gluon implementation of a Wavenet before and after a TensorRT graph pass. You can see that for this network TensorRT supports a subset of the operators involved. This makes it an interesting example to visualize, as several subgraphs are extracted and replaced with special TensorRT nodes. The Resnet used as an example above would be less interesting to visualize. The entire Resnet graph is supported by TensorRT, and hence the optimized graph would be a single TensorRT node. If your browser is unable to render svg files you can view the graphs in png format: [unoptimized](wavenet_unoptimized.svg) and [optimized](wavenet_optimized.svg).
-
-## Before
-![before](wavenet_unoptimized.svg)
-
-## After
-![after](wavenet_optimized.svg)
-
-## Subgraph API
-As of MXNet 1.5, MXNet developers have integrated TensorRT with MXNet via a Subgraph API. Read more about the design of the API [here](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN).
-
-## Thanks
-Thanks to NVIDIA for contributing this feature, and specifically thanks to Marek Kolodziej and Clement Fuji-Tsang. Thanks to Junyuan Xie and Jun Wu for the code reviews and design feedback, and to Aaron Markham for the copy review. 
\ No newline at end of file
diff --git a/example/multi_threaded_inference/multi_threaded_inference.cc b/example/multi_threaded_inference/multi_threaded_inference.cc
index 8b1864feea93..f1d0d72ef774 100644
--- a/example/multi_threaded_inference/multi_threaded_inference.cc
+++ b/example/multi_threaded_inference/multi_threaded_inference.cc
@@ -257,7 +257,7 @@ void run_inference(const std::string& model_name, const std::vector
diff --git a/perl-package/AI-MXNet-Gluon-Contrib/META.json b/perl-package/AI-MXNet-Gluon-Contrib/META.json
deleted file mode 100644
--- a/perl-package/AI-MXNet-Gluon-Contrib/META.json
+++ /dev/null
@@ -1,42 +0,0 @@
-{
-   "abstract" : "Perl interface to MXNet Gluon Contrib",
-   "author" : [
-      "Sergey Kolychev "
-   ],
-   "dynamic_config" : 0,
-   "generated_by" : "ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240",
-   "license" : [
-      "apache_2_0"
-   ],
-   "meta-spec" : {
-      "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
-      "version" : "2"
-   },
-   "name" : "AI-MXNet-Gluon-Contrib",
-   "no_index" : {
-      "directory" : [
-         "t",
-         "inc"
-      ]
-   },
-   "prereqs" : {
-      "build" : {
-         "requires" : {}
-      },
-      "configure" : {
-         "requires" : {
-            "ExtUtils::MakeMaker" : "6.30"
-         }
-      },
-      "runtime" : {
-         "requires" : {
-            "AI::MXNet" : "1.33"
-         }
-      },
-      "test" : {
-         "requires" : {}
-      }
-   },
-   "release_status" : "stable",
-   "version" : "1.33"
-}
diff --git a/perl-package/AI-MXNet-Gluon-Contrib/META.yml b/perl-package/AI-MXNet-Gluon-Contrib/META.yml
deleted file mode 100644
index f56b10c939b1..000000000000
--- a/perl-package/AI-MXNet-Gluon-Contrib/META.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
----
-abstract: 'Perl interface to MXNet Gluon Contrib'
-author:
-  - 'Sergey Kolychev '
-build_requires: {}
-configure_requires:
-  ExtUtils::MakeMaker: '6.30'
-dynamic_config: 0
-generated_by: 'ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240'
-license: apache
-meta-spec:
-  url: http://module-build.sourceforge.net/META-spec-v1.4.html
-  version: '1.4'
-name: AI-MXNet-Gluon-Contrib
-no_index:
-  directory:
-    - t
-    - inc
-requires:
-  AI::MXNet: '1.31'
-version: '1.33'
diff --git a/perl-package/AI-MXNet-Gluon-Contrib/Makefile.PL b/perl-package/AI-MXNet-Gluon-Contrib/Makefile.PL
deleted file mode 100644
index a6ff95e8bcc6..000000000000
--- a/perl-package/AI-MXNet-Gluon-Contrib/Makefile.PL
+++ /dev/null
@@ -1,63 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. 
See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-
-use 5.014000;
-
-use ExtUtils::MakeMaker 6.30;
-
-
-
-my %WriteMakefileArgs = (
-  "ABSTRACT" => "Perl interface to MXNet Gluon Contrib",
-  "AUTHOR" => "Sergey Kolychev ",
-  "BUILD_REQUIRES" => {},
-  "CONFIGURE_REQUIRES" => {
-    "ExtUtils::MakeMaker" => "6.30"
-  },
-  "DISTNAME" => "AI-MXNet-Gluon-Contrib",
-  "EXE_FILES" => [],
-  "LICENSE" => "apache_2_0",
-  "NAME" => "AI::MXNet::Gluon::Contrib",
-  "PREREQ_PM" => {
-    "AI::MXNet" => "1.31",
-  },
-  "TEST_REQUIRES" => {},
-  "VERSION" => "1.33",
-  "test" => {
-    "TESTS" => "t/*.t"
-  }
-);
-
-
-my %FallbackPrereqs = (
-  "AI::MXNet" => "1.31"
-);
-
-
-unless ( eval { ExtUtils::MakeMaker->VERSION(6.63_03) } ) {
-  delete $WriteMakefileArgs{TEST_REQUIRES};
-  delete $WriteMakefileArgs{BUILD_REQUIRES};
-  $WriteMakefileArgs{PREREQ_PM} = \%FallbackPrereqs;
-}
-
-delete $WriteMakefileArgs{CONFIGURE_REQUIRES}
-  unless eval { ExtUtils::MakeMaker->VERSION(6.52) };
-
-WriteMakefile(%WriteMakefileArgs);
diff --git a/perl-package/AI-MXNet-Gluon-Contrib/README b/perl-package/AI-MXNet-Gluon-Contrib/README
deleted file mode 100644
index f0301d168f75..000000000000
--- a/perl-package/AI-MXNet-Gluon-Contrib/README
+++ /dev/null
@@ -1,7 +0,0 @@
-This archive contains the distribution AI-MXNet-Gluon-Contrib,
-version 1.33:
-
-  Perl interface to MXNet Gluon Contrib modules, a collection of supplemental Gluon blocks.
-
-This library is licensed under Apache 2.0 license https://www.apache.org/licenses/LICENSE-2.0
-
diff --git a/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib.pm b/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib.pm
deleted file mode 100644
index 807dfc87200d..000000000000
--- a/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib.pm
+++ /dev/null
@@ -1,39 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::Gluon::Contrib;
-use strict;
-use warnings;
-use AI::MXNet;
-use AI::MXNet::Gluon::Contrib::NN::BasicLayers;
-our $VERSION = '1.33';
-=head1 NAME
-
-    AI::MXNet::Gluon::Contrib - A collection of supplemental Gluon blocks.
-=cut
-
-1;
-
-=head1 AUTHOR
-
-    Sergey Kolychev, 
-
-=head1 COPYRIGHT & LICENSE
-
-    This library is licensed under Apache 2.0 license L<https://www.apache.org/licenses/LICENSE-2.0>
-
-=cut
diff --git a/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib/NN/BasicLayers.pm b/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib/NN/BasicLayers.pm
deleted file mode 100644
index 5f57e031032c..000000000000
--- a/perl-package/AI-MXNet-Gluon-Contrib/lib/AI/MXNet/Gluon/Contrib/NN/BasicLayers.pm
+++ /dev/null
@@ -1,208 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. 
See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-package AI::MXNet::Gluon::Contrib::NN::BasicLayers;
-
-=head1 NAME
-
-    AI::MXNet::Gluon::Contrib::NN::BasicLayers - An additional collection of Gluon's building blocks.
-=cut
-
-use AI::MXNet::Function::Parameters;
-package AI::MXNet::Gluon::NN::Concurrent;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::NN::Sequential';
-
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::Concurrent - Lays Blocks concurrently.
-=cut
-
-=head1 DESCRIPTION
-
-    Lays Blocks concurrently.
-
-    This block feeds its input to all children blocks, and
-    produces the output by concatenating all the children blocks' outputs
-    on the specified axis.
-
-    Example:
-
-    $net = nn->Concurrent();
-    # use net's name_scope to give children blocks appropriate names.
-    $net->name_scope(sub {
-        $net->add(nn->Dense(10, activation=>'relu'));
-        $net->add(nn->Dense(20));
-        $net->add(nn->Identity());
-    });
-
-    Parameters
-    ----------
-    axis : int, default -1
-        The axis on which to concatenate the outputs.
-=cut
-has 'axis' => (is => 'rw', isa => 'Int', default => -1);
-method python_constructor_arguments() { ['axis'] }
-
-method forward(GluonInput $x)
-{
-    return AI::MXNet::NDArray->concat((map { $_->($x) } $self->_children->values), dim=>$self->axis);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::HybridConcurrent;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::NN::HybridSequential';
-
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::HybridConcurrent - Lays HybridBlocks concurrently.
-=cut
-
-=head1 DESCRIPTION
-
-    Lays HybridBlocks concurrently.
-
-    This block feeds its input to all children blocks, and
-    produces the output by concatenating all the children blocks' outputs
-    on the specified axis.
-
-    Example:
-
-    $net = nn->HybridConcurrent();
-    # use net's name_scope to give children blocks appropriate names.
-    $net->name_scope(sub {
-        $net->add(nn->Dense(10, activation=>'relu'));
-        $net->add(nn->Dense(20));
-        $net->add(nn->Identity());
-    });
-
-    Parameters
-    ----------
-    axis : int, default -1
-        The axis on which to concatenate the outputs.
-=cut
-has 'axis' => (is => 'rw', isa => 'Int', default => -1);
-method python_constructor_arguments() { ['axis'] }
-
-method hybrid_forward(GluonClass $F, GluonInput $x)
-{
-    return $F->concat((map { $_->($x) } $self->_children->values), dim=>$self->axis);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::Identity;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::Identity - Block that passes through the input directly.
-=cut
-
-=head1 DESCRIPTION
-
-    Block that passes through the input directly.
-
-    This block can be used in conjunction with HybridConcurrent
-    block for residual connection.
-
-    Example:
-
-    $net = nn->HybridConcurrent();
-    # use net's name_scope to give child Blocks appropriate names.
-    $net->name_scope(sub {
-        $net->add(nn->Dense(10, activation=>'relu'));
-        $net->add(nn->Dense(20));
-        $net->add(nn->Identity());
-    });
-=cut
-
-method hybrid_forward(GluonClass $F, GluonInput $x)
-{
-    return $x;
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::SparseEmbedding;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Block';
-
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::SparseEmbedding - Turns non-negative integers (indexes/tokens) into dense vectors.
-=cut
-
-=head1 DESCRIPTION
-
-    Turns non-negative integers (indexes/tokens) into dense vectors
-    of fixed size. e.g. [4, 20] -> [[0.25, 0.1], [0.6, -0.2]]
-
-    This SparseBlock is designed for distributed training with extremely large
-    input dimension. Both weight and gradient w.r.t. weight are AI::MXNet::NDArray::RowSparse.
-
-    Parameters
-    ----------
-    input_dim : int
-        Size of the vocabulary, i.e. maximum integer index + 1.
-    output_dim : int
-        Dimension of the dense embedding.
-    dtype : Dtype, default 'float32'
-        Data type of output embeddings.
-    weight_initializer : Initializer
-        Initializer for the embeddings matrix.
-=cut
-
-has 'input_dim' => (is => 'ro', isa => 'Int', required => 1);
-has 'output_dim' => (is => 'ro', isa => 'Int', required => 1);
-has 'dtype' => (is => 'ro', isa => 'Dtype', default => 'float32');
-has 'weight_initializer' => (is => 'ro', isa => 'Maybe[Initializer]');
-method python_constructor_arguments() { [qw/input_dim output_dim dtype weight_initializer/] }
-
-sub BUILD
-{
-    my $self = shift;
-    $self->_kwargs({
-        input_dim => $self->input_dim,
-        output_dim => $self->output_dim,
-        dtype => $self->dtype,
-        sparse_grad => 1
-    });
-    $self->weight($self->params->get('weight', shape=>[$self->input_dim, $self->output_dim],
-                  init=>$self->weight_initializer, dtype=>$self->dtype,
-                  grad_stype=>'row_sparse', stype=>'row_sparse'));
-}
-
-method forward(GluonInput $x)
-{
-    my $weight = $self->weight->row_sparse_data($x);
-    return AI::MXNet::NDArray->Embedding($x, $weight, { name=>'fwd', %{ $self->_kwargs } });
-}
-
-use overload '""' => sub {
-    my $self = shift;
-    $self->_class_name.'('.$self->input_dim.' -> '.$self->output_dim.', '.$self->dtype.')';
-};
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-1;
diff --git a/perl-package/AI-MXNet-Gluon-Contrib/t/AI-MXNet-Gluon-Contrib.t b/perl-package/AI-MXNet-Gluon-Contrib/t/AI-MXNet-Gluon-Contrib.t
deleted file mode 100644
index 4af95036fe0c..000000000000
--- a/perl-package/AI-MXNet-Gluon-Contrib/t/AI-MXNet-Gluon-Contrib.t
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License. 
- -use strict; -use warnings; -use Test::More tests => 1; -BEGIN { use_ok('AI::MXNet::Gluon::Contrib') }; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/Changes b/perl-package/AI-MXNet-Gluon-ModelZoo/Changes deleted file mode 100644 index 61018181c9a5..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/Changes +++ /dev/null @@ -1,11 +0,0 @@ -Revision history for Perl extension AI::MXNet::Gluon::ModelZoo - -1.33 Thu Oct 4 13:25:56 PDT 2018 - - Fixed kwalitee issues. - -1.32 Sun Aug 5 14:25:31 PDT 2018 - - Updated vgg16/19 models - -1.3 Tue Jul 10 21:19:13 PDT 2018 - - Initial release - diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/MANIFEST b/perl-package/AI-MXNet-Gluon-ModelZoo/MANIFEST deleted file mode 100644 index d01d25e85b1c..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/MANIFEST +++ /dev/null @@ -1,19 +0,0 @@ -Changes -examples/image_classification.pl -lib/AI/MXNet/Gluon/ModelZoo.pm -lib/AI/MXNet/Gluon/ModelZoo/ModelStore.pm -lib/AI/MXNet/Gluon/ModelZoo/Vision.pm -lib/AI/MXNet/Gluon/ModelZoo/Vision/AlexNet.pm -lib/AI/MXNet/Gluon/ModelZoo/Vision/DenseNet.pm -lib/AI/MXNet/Gluon/ModelZoo/Vision/Inception.pm -lib/AI/MXNet/Gluon/ModelZoo/Vision/MobileNet.pm -lib/AI/MXNet/Gluon/ModelZoo/Vision/ResNet.pm -lib/AI/MXNet/Gluon/ModelZoo/Vision/SqueezeNet.pm -lib/AI/MXNet/Gluon/ModelZoo/Vision/VGG.pm -Makefile.PL -MANIFEST -META.json -META.yml -README -t/AI-MXNet-Gluon-ModelZoo.t -t/test_gluon_model_zoo.t diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/META.json b/perl-package/AI-MXNet-Gluon-ModelZoo/META.json deleted file mode 100644 index 2ce7dddba36c..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/META.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "abstract" : "Perl interface to MXNet Gluon ModelZoo", - "author" : [ - "Sergey Kolychev " - ], - "dynamic_config" : 0, - "generated_by" : "ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240", - "license" : [ - "apache_2_0" - ], - "meta-spec" : { - "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", - "version" : "2" - }, - "name" : "AI-MXNet-Gluon-ModelZoo", - "no_index" : { - "directory" : [ - "t", - "inc" - ] - }, - "prereqs" : { - "build" : { - "requires" : {} - }, - "configure" : { - "requires" : { - "ExtUtils::MakeMaker" : "6.30" - } - }, - "runtime" : { - "requires" : { - "AI::MXNet" : "1.31", - "AI::MXNet::Gluon::Contrib" : "1.3", - "IO::Uncompress::Unzip" : "0" - } - }, - "test" : { - "requires" : {} - } - }, - "release_status" : "stable", - "version" : "1.33" -} diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml b/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml deleted file mode 100644 index 35c93845d367..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml +++ /dev/null @@ -1,40 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- ---- -abstract: 'Perl interface to MXNet Gluon ModelZoo' -author: - - 'Sergey Kolychev ' -build_requires: {} -configure_requires: - ExtUtils::MakeMaker: '6.30' -dynamic_config: 0 -generated_by: 'ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240' -license: apache -meta-spec: - url: http://module-build.sourceforge.net/META-spec-v1.4.html - version: '1.4' -name: AI-MXNet-Gluon-ModelZoo -no_index: - directory: - - t - - inc -requires: - AI::MXNet: '1.31' - AI::MXNet::Gluon::Contrib: '1.3' - IO::Uncompress::Unzip: '0' -version: '1.33' diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/Makefile.PL b/perl-package/AI-MXNet-Gluon-ModelZoo/Makefile.PL deleted file mode 100644 index de8b1acc5e2f..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/Makefile.PL +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; - -use 5.014000; - -use ExtUtils::MakeMaker 6.30; - - - -my %WriteMakefileArgs = ( - "ABSTRACT" => "Perl interface to MXNet Gluon ModelZoo", - "AUTHOR" => "Sergey Kolychev ", - "BUILD_REQUIRES" => {}, - "CONFIGURE_REQUIRES" => { - "ExtUtils::MakeMaker" => "6.30" - }, - "DISTNAME" => "AI-MXNet-Gluon-ModelZoo", - "EXE_FILES" => [], - "LICENSE" => "apache_2_0", - "NAME" => "AI::MXNet::Gluon::ModelZoo", - "PREREQ_PM" => { - "AI::MXNet" => "1.31", - "AI::MXNet::Gluon::Contrib" => "1.3", - "IO::Uncompress::Unzip" => "0" - }, - "TEST_REQUIRES" => {}, - "VERSION" => "1.33", - "test" => { - "TESTS" => "t/*.t" - } -); - - -my %FallbackPrereqs = ( - "AI::MXNet" => "1.31", - "AI::MXNet::Gluon::Contrib" => "1.3", - "IO::Uncompress::Unzip" => "0" -); - - -unless ( eval { ExtUtils::MakeMaker->VERSION(6.63_03) } ) { - delete $WriteMakefileArgs{TEST_REQUIRES}; - delete $WriteMakefileArgs{BUILD_REQUIRES}; - $WriteMakefileArgs{PREREQ_PM} = \%FallbackPrereqs; -} - -delete $WriteMakefileArgs{CONFIGURE_REQUIRES} - unless eval { ExtUtils::MakeMaker->VERSION(6.52) }; - -WriteMakefile(%WriteMakefileArgs); diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/README b/perl-package/AI-MXNet-Gluon-ModelZoo/README deleted file mode 100644 index e39ae4b69be0..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/README +++ /dev/null @@ -1,7 +0,0 @@ -This archive contains the distribution AI-MXNet-Gluon-ModelZoo, -version 1.33: - - Perl interface to MXNet Gluon ModelZoo, a collection of pretrained machine learning models for computer vision. 
-
-This library is licensed under Apache 2.0 license https://www.apache.org/licenses/LICENSE-2.0
-
diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/examples/image_classification.pl b/perl-package/AI-MXNet-Gluon-ModelZoo/examples/image_classification.pl
deleted file mode 100755
index 4dbf890b4ff0..000000000000
--- a/perl-package/AI-MXNet-Gluon-ModelZoo/examples/image_classification.pl
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env perl
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use AI::MXNet::Gluon::ModelZoo 'get_model';
-use AI::MXNet::Gluon::Utils 'download';
-use Getopt::Long qw(HelpMessage);
-
-GetOptions(
-    ## my Pembroke Welsh Corgi Kyuubi, enjoying Solar eclipse of August 21, 2017
-    'image=s' => \(my $image = 'http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/'.
-                               'gluon/dataset/kyuubi.jpg'),
-    'model=s' => \(my $model = 'resnet152_v2'),
-    'help'    => sub { HelpMessage(0) },
-) or HelpMessage(1);
-
-## get a pretrained model (download parameters file if necessary)
-my $net = get_model($model, pretrained => 1);
-
-## ImageNet classes
-my $fname = download('http://data.mxnet.io/models/imagenet/synset.txt');
-my @text_labels = map { chomp; s/^\S+\s+//; $_ } IO::File->new($fname)->getlines;
-
-## get the image from the disk or net
-if($image =~ /^https/)
-{
-    eval { require IO::Socket::SSL; };
-    die "Need to have IO::Socket::SSL installed for https images" if $@;
-}
-$image = $image =~ /^https?/ ? download($image) : $image;
-
-# Following the conventional way of preprocessing ImageNet data:
-# Resize the short edge into 256 pixels,
-# And then perform a center crop to obtain a 224-by-224 image.
-# The following code uses the image processing functions provided
-# in the AI::MXNet::Image module.
-
-$image = mx->image->imread($image);
-$image = mx->image->resize_short($image, $model =~ /inception/ ? 330 : 256);
-($image) = mx->image->center_crop($image, [($model =~ /inception/ ? 299 : 224)x2]);
-
-## CV that is used to read image is column major (as PDL)
-$image = $image->transpose([2,0,1])->expand_dims(axis=>0);
-
-## normalizing the image
-my $rgb_mean = nd->array([0.485, 0.456, 0.406])->reshape([1,3,1,1]);
-my $rgb_std = nd->array([0.229, 0.224, 0.225])->reshape([1,3,1,1]);
-$image = ($image->astype('float32') / 255 - $rgb_mean) / $rgb_std;
-
-# Now we can recognize the object in the image.
-# We perform an additional softmax on the output to obtain probability scores.
-# And then print the top-5 recognized objects. 
-my $prob = $net->($image)->softmax; -for my $idx (@{ $prob->topk(k=>5)->at(0) }) -{ - my $i = $idx->asscalar; - printf( - "With prob = %.5f, it contains %s\n", - $prob->at(0)->at($i)->asscalar, $text_labels[$i] - ); -} diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo.pm deleted file mode 100644 index 1611ce7f8318..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo.pm +++ /dev/null @@ -1,142 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::ModelZoo; -use strict; -use warnings; -use AI::MXNet qw(mx); -use AI::MXNet::Gluon qw(gluon); -use AI::MXNet::Gluon::NN qw(nn); -use AI::MXNet::Gluon::Contrib; -use AI::MXNet::Gluon::ModelZoo::Vision; -use Exporter; -use base qw(Exporter); -@AI::MXNet::Gluon::ModelZoo::EXPORT_OK = qw(get_model); -our $VERSION = '1.33'; - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo - A collection of pretrained MXNet Gluon models -=cut - -=head1 SYNOPSIS - - ## run forward prediction on random data - use AI::MXNet::Gluon::ModelZoo qw(get_model); - my $alexnet = get_model('alexnet', pretrained => 1); - my $out = $alexnet->(mx->nd->random->uniform(shape=>[1, 3, 224, 224])); - print $out->aspdl; -=cut - -=head1 DESCRIPTION - - This module houses a collection of pretrained models (the parameters are hosted on public mxnet servers). - https://mxnet.apache.org/api/python/gluon/model_zoo.html - See examples/image_classification.pl for the example of real time image classification - using a pretrained model from the ModelZoo -=cut - -our %models = qw/ - resnet18_v1 resnet18_v1 - resnet34_v1 resnet34_v1 - resnet50_v1 resnet50_v1 - resnet101_v1 resnet101_v1 - resnet152_v1 resnet152_v1 - resnet18_v2 resnet18_v2 - resnet34_v2 resnet34_v2 - resnet50_v2 resnet50_v2 - resnet101_v2 resnet101_v2 - resnet152_v2 resnet152_v2 - vgg11 vgg11 - vgg13 vgg13 - vgg16 vgg16 - vgg19 vgg19 - vgg11_bn vgg11_bn - vgg13_bn vgg13_bn - vgg16_bn vgg16_bn - vgg19_bn vgg19_bn - alexnet alexnet - densenet121 densenet121 - densenet161 densenet161 - densenet169 densenet169 - densenet201 densenet201 - squeezenet1.0 squeezenet1_0 - squeezenet1.1 squeezenet1_1 - inceptionv3 inception_v3 - mobilenet1.0 mobilenet1_0 - mobilenet0.75 mobilenet0_75 - mobilenet0.5 mobilenet0_5 - mobilenet0.25 mobilenet0_25 - mobilenetv2_1.0 mobilenet_v2_1_0 - mobilenetv2_0.75 mobilenet_v2_0_75 - mobilenetv2_0.5 mobilenet_v2_0_5 - mobilenetv2_0.25 mobilenet_v2_0_25 -/; - - -=head2 get_model - - Returns a pre-defined model by name - - Parameters - ---------- - $name : Str - Name of the model. - :$pretrained : Bool - Whether to load the pretrained weights for model. - :$classes : Int - Number of classes for the output layer. 
:$ctx : AI::MXNet::Context, default CPU
-        The context in which to load the pretrained weights.
-    :$root : Str, default '~/.mxnet/models'
-        Location for keeping the model parameters.
-
-    Returns
-    -------
-    HybridBlock
-        The model.
-=cut
-
-sub get_model
-{
-    if(exists $models{lc $_[1]})
-    {
-        shift;
-    }
-    my ($name, %kwargs) = @_;
-    $name = lc $name;
-    Carp::confess(
-        "Model $name is not present in the zoo\nValid models are:\n".
-        join(', ', sort keys %models)."\n"
-    ) unless exists $models{$name};
-    my $sub = $models{$name};
-    AI::MXNet::Gluon::ModelZoo::Vision->$sub(%kwargs);
-}
-
-sub vision { 'AI::MXNet::Gluon::ModelZoo::Vision' }
-
-1;
-
-=head1 AUTHOR
-
-    Sergey Kolychev, 
-
-=head1 COPYRIGHT & LICENSE
-
-    This library is licensed under Apache 2.0 license L<https://www.apache.org/licenses/LICENSE-2.0>
-
-=cut
diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/ModelStore.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/ModelStore.pm
deleted file mode 100644
index bb258b4d9cdf..000000000000
--- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/ModelStore.pm
+++ /dev/null
@@ -1,164 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::Gluon::ModelZoo::ModelStore;
-use strict;
-use warnings;
-use AI::MXNet::Function::Parameters;
-
-=head1 NAME
-
-    AI::MXNet::Gluon::ModelZoo::ModelStore - Model zoo for pre-trained models. 
-=cut - -use AI::MXNet::Gluon::Utils qw(download check_sha1); -use IO::Uncompress::Unzip qw(unzip); -use File::Path qw(make_path); - -my %_model_sha1 = map { $_->[1] => $_->[0] } ( - ['44335d1f0046b328243b32a26a4fbd62d9057b45', 'alexnet'], - ['f27dbf2dbd5ce9a80b102d89c7483342cd33cb31', 'densenet121'], - ['b6c8a95717e3e761bd88d145f4d0a214aaa515dc', 'densenet161'], - ['2603f878403c6aa5a71a124c4a3307143d6820e9', 'densenet169'], - ['1cdbc116bc3a1b65832b18cf53e1cb8e7da017eb', 'densenet201'], - ['ed47ec45a937b656fcc94dabde85495bbef5ba1f', 'inceptionv3'], - ['9f83e440996887baf91a6aff1cccc1c903a64274', 'mobilenet0.25'], - ['8e9d539cc66aa5efa71c4b6af983b936ab8701c3', 'mobilenet0.5'], - ['529b2c7f4934e6cb851155b22c96c9ab0a7c4dc2', 'mobilenet0.75'], - ['6b8c5106c730e8750bcd82ceb75220a3351157cd', 'mobilenet1.0'], - ['36da4ff1867abccd32b29592d79fc753bca5a215', 'mobilenetv2_1.0'], - ['e2be7b72a79fe4a750d1dd415afedf01c3ea818d', 'mobilenetv2_0.75'], - ['aabd26cd335379fcb72ae6c8fac45a70eab11785', 'mobilenetv2_0.5'], - ['ae8f9392789b04822cbb1d98c27283fc5f8aa0a7', 'mobilenetv2_0.25'], - ['a0666292f0a30ff61f857b0b66efc0228eb6a54b', 'resnet18_v1'], - ['48216ba99a8b1005d75c0f3a0c422301a0473233', 'resnet34_v1'], - ['0aee57f96768c0a2d5b23a6ec91eb08dfb0a45ce', 'resnet50_v1'], - ['d988c13d6159779e907140a638c56f229634cb02', 'resnet101_v1'], - ['671c637a14387ab9e2654eafd0d493d86b1c8579', 'resnet152_v1'], - ['a81db45fd7b7a2d12ab97cd88ef0a5ac48b8f657', 'resnet18_v2'], - ['9d6b80bbc35169de6b6edecffdd6047c56fdd322', 'resnet34_v2'], - ['ecdde35339c1aadbec4f547857078e734a76fb49', 'resnet50_v2'], - ['18e93e4f48947e002547f50eabbcc9c83e516aa6', 'resnet101_v2'], - ['f2695542de38cf7e71ed58f02893d82bb409415e', 'resnet152_v2'], - ['264ba4970a0cc87a4f15c96e25246a1307caf523', 'squeezenet1.0'], - ['33ba0f93753c83d86e1eb397f38a667eaf2e9376', 'squeezenet1.1'], - ['dd221b160977f36a53f464cb54648d227c707a05', 'vgg11'], - ['ee79a8098a91fbe05b7a973fed2017a6117723a8', 'vgg11_bn'], - ['6bc5de58a05a5e2e7f493e2d75a580d83efde38c', 'vgg13'], - ['7d97a06c3c7a1aecc88b6e7385c2b373a249e95e', 'vgg13_bn'], - ['e660d4569ccb679ec68f1fd3cce07a387252a90a', 'vgg16'], - ['7f01cf050d357127a73826045c245041b0df7363', 'vgg16_bn'], - ['ad2f660d101905472b83590b59708b71ea22b2e5', 'vgg19'], - ['f360b758e856f1074a85abd5fd873ed1d98297c3', 'vgg19_bn'] -); - -my $apache_repo_url = 'http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/'; -my $_url_format = '%sgluon/models/%s.zip'; - -func short_hash($name) -{ - Carp::confess("model $name is not available in model zoo") unless exists $_model_sha1{$name}; - return substr($_model_sha1{$name}, 0, 8); -} - -=head2 get_model_file - - Return location for the pretrained on local file system. - - This function will download from online model zoo when model cannot be found or has mismatch. - The root directory will be created if it doesn't exist. - - Parameters - ---------- - $name : Str - Name of the model. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. - - Returns - ------- - $file_path - Path to the requested pretrained model file. -=cut - -method get_model_file(Str $name, Str :$root='~/.mxnet/models') -{ - my $file_name = "$name-".short_hash($name); - $root =~ s/~/$ENV{HOME}/; - my $file_path = "$root/$file_name.params"; - my $sha1_hash = $_model_sha1{$name}; - if(-f $file_path) - { - if(check_sha1($file_path, $sha1_hash)) - { - return $file_path; - } - else - { - warn("Mismatch in the content of model file detected. 
Downloading again.\n"); - } - } - else - { - warn("Model file is not found. Downloading.\n"); - } - - if(not -d $root) - { - make_path($root); - } - - my $zip_file_path = "$root/$file_name.zip"; - my $repo_url = $ENV{MXNET_GLUON_REPO}//$apache_repo_url; - if($repo_url !~ /\/$/) - { - $repo_url .= '/'; - } - download( - sprintf($_url_format, $repo_url, $file_name), - path=>$zip_file_path, - overwrite=>1 - ); - unzip($zip_file_path, $file_path); - unlink $zip_file_path; - if(check_sha1($file_path, $sha1_hash)) - { - return $file_path; - } - else - { - Carp::confess("Downloaded file $file_path has different hash. Please try again."); - } -} - -=head2 purge - - Purge all pretrained model files in local file store. - - Parameters - ---------- - root : str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method purge(Str $root='~/.mxnet/models') -{ - $root =~ s/~/$ENV{HOME}/; - map { unlink } glob("$root/*.params"); -} - -1; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision.pm deleted file mode 100644 index d306c2d44895..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision.pm +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::ModelZoo::Vision; -use strict; -use warnings; -use AI::MXNet::Gluon::ModelZoo::ModelStore; -use AI::MXNet::Gluon::ModelZoo::Vision::AlexNet; -use AI::MXNet::Gluon::ModelZoo::Vision::DenseNet; -use AI::MXNet::Gluon::ModelZoo::Vision::Inception; -use AI::MXNet::Gluon::ModelZoo::Vision::MobileNet; -use AI::MXNet::Gluon::ModelZoo::Vision::ResNet; -use AI::MXNet::Gluon::ModelZoo::Vision::SqueezeNet; -use AI::MXNet::Gluon::ModelZoo::Vision::VGG; - -sub import -{ - my ($class, $short_name) = @_; - if($short_name) - { - $short_name =~ s/[^\w:]//g; - if(length $short_name) - { - my $short_name_package =<<"EOP"; - package $short_name; - \@${short_name}::ISA = ('AI::MXNet::Gluon::ModelZoo::Vision'); - 1; -EOP - eval $short_name_package; - } - } -} - -1; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/AlexNet.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/AlexNet.pm deleted file mode 100644 index 48afe8b39f63..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/AlexNet.pm +++ /dev/null @@ -1,115 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::ModelZoo::Vision::AlexNet; -use strict; -use warnings; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::AlexNet - AlexNet model from the `"One weird trick..." -=cut - -=head1 DESCRIPTION - - AlexNet model from the "One weird trick..." paper. - - Parameters - ---------- - classes : Int, default 1000 - Number of classes for the output layer. -=cut -has 'classes' => (is => 'ro', isa => 'Int', default => 1000); -method python_constructor_arguments() { ['classes'] } - -sub BUILD -{ - my $self = shift; - $self->name_scope(sub { - $self->features(nn->HybridSequential(prefix=>'')); - $self->features->name_scope(sub { - $self->features->add(nn->Conv2D(64, kernel_size=>11, strides=>4, - padding=>2, activation=>'relu')); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2)); - $self->features->add(nn->Conv2D(192, kernel_size=>5, padding=>2, - activation=>'relu')); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2)); - $self->features->add(nn->Conv2D(384, kernel_size=>3, padding=>1, - activation=>'relu')); - $self->features->add(nn->Conv2D(256, kernel_size=>3, padding=>1, - activation=>'relu')); - $self->features->add(nn->Conv2D(256, kernel_size=>3, padding=>1, - activation=>'relu')); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2)); - $self->features->add(nn->Flatten()); - $self->features->add(nn->Dense(4096, activation=>'relu')); - $self->features->add(nn->Dropout(0.5)); - $self->features->add(nn->Dense(4096, activation=>'relu')); - $self->features->add(nn->Dropout(0.5)); - }); - $self->output(nn->Dense($self->classes)); - }); -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - $x = $self->features->($x); - $x = $self->output->($x); - return $x; -} - -package AI::MXNet::Gluon::ModelZoo::Vision; - -=head2 alexnet - - AlexNet model from the `"One weird trick..." paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. 
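For review context, the alexnet factory documented above (and defined just below) delegates weight fetching to the ModelStore code removed earlier in this diff: get_model_file caches "<name>-<short_hash>.params" under $root, re-downloading and SHA-1-verifying the zip when the file is missing or corrupt. A minimal usage sketch (hedged: assumes the removed AI-MXNet-Gluon-ModelZoo distribution and its AI::MXNet dependencies are installed):

    use AI::MXNet::Gluon::ModelZoo::Vision;
    use AI::MXNet::Gluon::ModelZoo::ModelStore;

    # Pretrained AlexNet on CPU; the weights resolve to
    # ~/.mxnet/models/alexnet-44335d1f.params via the SHA-1 table above.
    my $net = AI::MXNet::Gluon::ModelZoo::Vision->alexnet(pretrained => 1);

    # The same file can be fetched (or re-verified) directly:
    my $path = AI::MXNet::Gluon::ModelZoo::ModelStore->get_model_file('alexnet');

    # Drop every cached *.params file under the default root.
    AI::MXNet::Gluon::ModelZoo::ModelStore->purge;
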
-=cut - -method alexnet( - Bool :$pretrained=0, - AI::MXNet::Context :$ctx=AI::MXNet::Context->cpu(), - Str :$root='~/.mxnet/models', - Int :$classes=1000 -) -{ - my $net = AI::MXNet::Gluon::ModelZoo::Vision::AlexNet->new($classes); - if($pretrained) - { - $net->load_parameters( - AI::MXNet::Gluon::ModelZoo::ModelStore->get_model_file( - 'alexnet', - root=>$root - ), - ctx=>$ctx - ); - } - return $net; -} - -1; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/DenseNet.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/DenseNet.pm deleted file mode 100644 index cc69e7b5f40e..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/DenseNet.pm +++ /dev/null @@ -1,277 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::ModelZoo::Vision::DenseNet; -use strict; -use warnings; -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -func _make_dense_block($num_layers, $bn_size, $growth_rate, $dropout, $stage_index) -{ - my $out = nn->HybridSequential(prefix=>"stage${stage_index}_"); - $out->name_scope(sub { - for(1..$num_layers) - { - $out->add(_make_dense_layer($growth_rate, $bn_size, $dropout)); - } - }); - return $out; -} - -func _make_dense_layer($growth_rate, $bn_size, $dropout) -{ - my $new_features = nn->HybridSequential(prefix=>''); - $new_features->add(nn->BatchNorm()); - $new_features->add(nn->Activation('relu')); - $new_features->add(nn->Conv2D($bn_size * $growth_rate, kernel_size=>1, use_bias=>0)); - $new_features->add(nn->BatchNorm()); - $new_features->add(nn->Activation('relu')); - $new_features->add(nn->Conv2D($growth_rate, kernel_size=>3, padding=>1, use_bias=>0)); - if($dropout) - { - $new_features->add(nn->Dropout($dropout)); - } - - my $out = nn->HybridConcurrent(axis=>1, prefix=>''); - $out->add(nn->Identity()); - $out->add($new_features); - - return $out; -} - -func _make_transition($num_output_features) -{ - my $out = nn->HybridSequential(prefix=>''); - $out->add(nn->BatchNorm()); - $out->add(nn->Activation('relu')); - $out->add(nn->Conv2D($num_output_features, kernel_size=>1, use_bias=>0)); - $out->add(nn->AvgPool2D(pool_size=>2, strides=>2)); - return $out; -} - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::DenseNet - Densenet-BC model from the "Densely Connected Convolutional Networks" -=cut - -=head1 DESCRIPTION - - Densenet-BC model from the "Densely Connected Convolutional Networks" paper. - - Parameters - ---------- - num_init_features : Int - Number of filters to learn in the first convolution layer. - growth_rate : Int - Number of filters to add each layer (`k` in the paper). 
- block_config : array ref of Int - List of integers for numbers of layers in each pooling block. - bn_size : Int, default 4 - Multiplicative factor for number of bottle neck layers. - (i.e. bn_size * k features in the bottleneck layer) - dropout : float, default 0 - Rate of dropout after each dense layer. - classes : int, default 1000 - Number of classification classes. -=cut -has [qw/num_init_features - growth_rate/] => (is => 'ro', isa => 'Int', required => 1); -has 'block_config' => (is => 'ro', isa => 'ArrayRef[Int]', required => 1); -has 'bn_size' => (is => 'ro', isa => 'Int', default => 4); -has 'dropout' => (is => 'ro', isa => 'Num', default => 0); -has 'classes' => (is => 'ro', isa => 'Int', default => 1000); -method python_constructor_arguments(){ [qw/num_init_features growth_rate block_config bn_size dropout classes/] } - -sub BUILD -{ - my $self = shift; - $self->name_scope(sub { - $self->features(nn->HybridSequential(prefix=>'')); - $self->features->add( - nn->Conv2D( - $self->num_init_features, kernel_size=>7, - strides=>2, padding=>3, use_bias=>0 - ) - ); - $self->features->add(nn->BatchNorm()); - $self->features->add(nn->Activation('relu')); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2, padding=>1)); - # Add dense blocks - my $num_features = $self->num_init_features; - for(enumerate($self->block_config)) - { - my ($i, $num_layers) = @$_; - $self->features->add(_make_dense_block($num_layers, $self->bn_size, $self->growth_rate, $self->dropout, $i+1)); - $num_features += $num_layers * $self->growth_rate; - if($i != @{ $self->block_config } - 1) - { - $self->features->add(_make_transition(int($num_features/2))); - $num_features = int($num_features/2); - } - } - $self->features->add(nn->BatchNorm()); - $self->features->add(nn->Activation('relu')); - $self->features->add(nn->AvgPool2D(pool_size=>7)); - $self->features->add(nn->Flatten()); - - $self->output(nn->Dense($self->classes)); - }); -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - $x = $self->features->($x); - $x = $self->output->($x); - return $x; -} - -package AI::MXNet::Gluon::ModelZoo::Vision; - -my %densenet_spec = ( - 121 => [64, 32, [6, 12, 24, 16]], - 161 => [96, 48, [6, 12, 36, 24]], - 169 => [64, 32, [6, 12, 32, 32]], - 201 => [64, 32, [6, 12, 48, 32]] -); - -=head2 get_densenet - - Densenet-BC model from the - "Densely Connected Convolutional Networks" paper. - - Parameters - ---------- - $num_layers : Int - Number of layers for the variant of densenet. Options are 121, 161, 169, 201. - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method get_densenet( - Int $num_layers, Bool :$pretrained=0, :$ctx=AI::MXNet::Context->cpu(), - :$root='~/.mxnet/models', - Int :$bn_size=4, - Num :$dropout=0, - Int :$classes=1000 -) -{ - my ($num_init_features, $growth_rate, $block_config) = @{ $densenet_spec{$num_layers} }; - my $net = AI::MXNet::Gluon::ModelZoo::Vision::DenseNet->new( - $num_init_features, $growth_rate, $block_config, - $bn_size, $dropout, $classes - ); - if($pretrained) - { - $net->load_parameters( - AI::MXNet::Gluon::ModelZoo::ModelStore->get_model_file( - "densenet$num_layers", - root=>$root - ), - ctx=>$ctx - ); - } - return $net; -} - -=head2 densenet121 - - Densenet-BC 121-layer model from the - "Densely Connected Convolutional Networks" paper. 
- - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method densenet121(%kwargs) -{ - return __PACKAGE__->get_densenet(121, %kwargs) -} - -=head2 densenet161 - - Densenet-BC 161-layer model from the - "Densely Connected Convolutional Networks" paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method densenet161(%kwargs) -{ - return __PACKAGE__->get_densenet(161, %kwargs) -} - -=head2 densenet169 - - Densenet-BC 169-layer model from the - "Densely Connected Convolutional Networks" paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method densenet169(%kwargs) -{ - return __PACKAGE__->get_densenet(169, %kwargs) -} - -=head2 densenet201 - - Densenet-BC 201-layer model from the - "Densely Connected Convolutional Networks" paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method densenet201(%kwargs) -{ - return __PACKAGE__->get_densenet(201, %kwargs) -} - -1; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/Inception.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/Inception.pm deleted file mode 100644 index 19bd8e1bfb45..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/Inception.pm +++ /dev/null @@ -1,287 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
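The densenet121/densenet161/densenet169/densenet201 wrappers removed above are thin delegates: each forwards its %kwargs to get_densenet with the layer count fixed, and %densenet_spec supplies (num_init_features, growth_rate, block_config) for that count. A hypothetical sketch, under the same assumptions as earlier, before the Inception sources below:

    use AI::MXNet::Gluon::ModelZoo::Vision;

    # These two calls construct the same DenseNet-161:
    my $a = AI::MXNet::Gluon::ModelZoo::Vision->densenet161(pretrained => 1);
    my $b = AI::MXNet::Gluon::ModelZoo::Vision->get_densenet(161, pretrained => 1);
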
- -package AI::MXNet::Gluon::ModelZoo::Vision::Inception::V3; -use strict; -use warnings; -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -func _make_basic_conv(%kwargs) -{ - my $out = nn->HybridSequential(prefix=>''); - $out->add(nn->Conv2D(use_bias=>0, %kwargs)); - $out->add(nn->BatchNorm(epsilon=>0.001)); - $out->add(nn->Activation('relu')); - return $out; -} - -func _make_branch($use_pool, @conv_settings) -{ - my $out = nn->HybridSequential(prefix=>''); - if($use_pool eq 'avg') - { - $out->add(nn->AvgPool2D(pool_size=>3, strides=>1, padding=>1)); - } - elsif($use_pool eq 'max') - { - $out->add(nn->MaxPool2D(pool_size=>3, strides=>2)); - } - my @setting_names = ('channels', 'kernel_size', 'strides', 'padding'); - for my $setting (@conv_settings) - { - my %kwargs; - for(enumerate($setting)) - { - my ($i, $value) = @$_; - if(defined $value) - { - $kwargs{ $setting_names[$i] } = $value; - } - } - $out->add(_make_basic_conv(%kwargs)); - } - return $out; -} - -func _make_A($pool_features, $prefix) -{ - my $out = nn->HybridConcurrent(axis=>1, prefix=>$prefix); - $out->name_scope(sub { - $out->add(_make_branch('', [64, 1, undef, undef])); - $out->add(_make_branch( - '', - [48, 1, undef, undef], - [64, 5, undef, 2] - )); - $out->add(_make_branch( - '', - [64, 1, undef, undef], - [96, 3, undef, 1], - [96, 3, undef, 1] - )); - $out->add(_make_branch('avg', [$pool_features, 1, undef, undef])); - }); - return $out; -} - -func _make_B($prefix) -{ - my $out = nn->HybridConcurrent(axis=>1, prefix=>$prefix); - $out->name_scope(sub { - $out->add(_make_branch('', [384, 3, 2, undef])); - $out->add(_make_branch( - '', - [64, 1, undef, undef], - [96, 3, undef, 1], - [96, 3, 2, undef] - )); - $out->add(_make_branch('max')); - }); - return $out; -} - -func _make_C($channels_7x7, $prefix) -{ - my $out = nn->HybridConcurrent(axis=>1, prefix=>$prefix); - $out->name_scope(sub { - $out->add(_make_branch('', [192, 1, undef, undef])); - $out->add(_make_branch( - '', - [$channels_7x7, 1, undef, undef], - [$channels_7x7, [1, 7], undef, [0, 3]], - [192, [7, 1], undef, [3, 0]] - )); - $out->add(_make_branch( - '', - [$channels_7x7, 1, undef, undef], - [$channels_7x7, [7, 1], undef, [3, 0]], - [$channels_7x7, [1, 7], undef, [0, 3]], - [$channels_7x7, [7, 1], undef, [3, 0]], - [192, [1, 7], undef, [0, 3]] - )); - $out->add(_make_branch( - 'avg', - [192, 1, undef, undef] - )); - }); - return $out; -} - -func _make_D($prefix) -{ - my $out = nn->HybridConcurrent(axis=>1, prefix=>$prefix); - $out->name_scope(sub { - $out->add(_make_branch( - '', - [192, 1, undef, undef], - [320, 3, 2, undef] - )); - $out->add(_make_branch( - '', - [192, 1, undef, undef], - [192, [1, 7], undef, [0, 3]], - [192, [7, 1], undef, [3, 0]], - [192, 3, 2, undef] - )); - $out->add(_make_branch('max')); - }); - return $out; -} - -func _make_E($prefix) -{ - my $out = nn->HybridConcurrent(axis=>1, prefix=>$prefix); - $out->name_scope(sub { - $out->add(_make_branch('', [320, 1, undef, undef])); - - my $branch_3x3 = nn->HybridSequential(prefix=>''); - $out->add($branch_3x3); - $branch_3x3->add(_make_branch( - '', - [384, 1, undef, undef] - )); - my $branch_3x3_split = nn->HybridConcurrent(axis=>1, prefix=>''); - $branch_3x3_split->add(_make_branch('', [384, [1, 3], undef, [0, 1]])); - $branch_3x3_split->add(_make_branch('', [384, [3, 1], undef, [1, 0]])); - $branch_3x3->add($branch_3x3_split); - - my $branch_3x3dbl = nn->HybridSequential(prefix=>''); - 
$out->add($branch_3x3dbl); - $branch_3x3dbl->add(_make_branch( - '', - [448, 1, undef, undef], - [384, 3, undef, 1] - )); - my $branch_3x3dbl_split = nn->HybridConcurrent(axis=>1, prefix=>''); - $branch_3x3dbl->add($branch_3x3dbl_split); - $branch_3x3dbl_split->add(_make_branch('', [384, [1, 3], undef, [0, 1]])); - $branch_3x3dbl_split->add(_make_branch('', [384, [3, 1], undef, [1, 0]])); - - $out->add(_make_branch('avg', [192, 1, undef, undef])); - }); - return $out; -} - -func make_aux($classes) -{ - my $out = nn->HybridSequential(prefix=>''); - $out->add(nn->AvgPool2D(pool_size=>5, strides=>3)); - $out->add(_make_basic_conv(channels=>128, kernel_size=>1)); - $out->add(_make_basic_conv(channels=>768, kernel_size=>5)); - $out->add(nn->Flatten()); - $out->add(nn->Dense($classes)); - return $out; -} - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::Inception::V3 - Inception v3 model. -=cut - -=head1 DESCRIPTION - - Inception v3 model from - "Rethinking the Inception Architecture for Computer Vision" - paper. - - Parameters - ---------- - classes : Int, default 1000 - Number of classification classes. -=cut - -has 'classes' => (is => 'ro', isa => 'Int', default => 1000); -method python_constructor_arguments(){ ['classes'] } - -sub BUILD -{ - my $self = shift; - $self->name_scope(sub { - $self->features(nn->HybridSequential(prefix=>'')); - $self->features->add(_make_basic_conv(channels=>32, kernel_size=>3, strides=>2)); - $self->features->add(_make_basic_conv(channels=>32, kernel_size=>3)); - $self->features->add(_make_basic_conv(channels=>64, kernel_size=>3, padding=>1)); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2)); - $self->features->add(_make_basic_conv(channels=>80, kernel_size=>1)); - $self->features->add(_make_basic_conv(channels=>192, kernel_size=>3)); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2)); - $self->features->add(_make_A(32, 'A1_')); - $self->features->add(_make_A(64, 'A2_')); - $self->features->add(_make_A(64, 'A3_')); - $self->features->add(_make_B('B_')); - $self->features->add(_make_C(128, 'C1_')); - $self->features->add(_make_C(160, 'C2_')); - $self->features->add(_make_C(160, 'C3_')); - $self->features->add(_make_C(192, 'C4_')); - $self->features->add(_make_D('D_')); - $self->features->add(_make_E('E1_')); - $self->features->add(_make_E('E2_')); - $self->features->add(nn->AvgPool2D(pool_size=>8)); - $self->features->add(nn->Dropout(0.5)); - - $self->output(nn->Dense($self->classes)); - }); -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - $x = $self->features->($x); - $x = $self->output->($x); - return $x; -} - -package AI::MXNet::Gluon::ModelZoo::Vision; - -=head2 inception_v3 - - Inception v3 model from - "Rethinking the Inception Architecture for Computer Vision" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. 
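The inception_v3 factory this POD documents is defined immediately below. Worth noting from the class code above: the feature stack ends in nn->AvgPool2D(pool_size=>8), so the pretrained weights assume the 299x299 inputs conventional for Inception v3 rather than the 224x224 used by the other families (an architectural observation, not something this diff states). A sketch:

    # Cache the weights under ./models instead of ~/.mxnet/models.
    my $net = AI::MXNet::Gluon::ModelZoo::Vision->inception_v3(
        pretrained => 1,
        root       => './models',
    );
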
-=cut - -method inception_v3( - Bool :$pretrained=0, AI::MXNet::Context :$ctx=AI::MXNet::Context->cpu(), - Str :$root='~/.mxnet/models', Int :$classes=1000 -) -{ - my $net = AI::MXNet::Gluon::ModelZoo::Vision::Inception::V3->new($classes); - if($pretrained) - { - $net->load_parameters( - AI::MXNet::Gluon::ModelZoo::ModelStore->get_model_file( - "inceptionv3", - root=>$root - ), - ctx=>$ctx - ); - } - return $net; -} - -1; \ No newline at end of file diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/MobileNet.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/MobileNet.pm deleted file mode 100644 index a0891c5d51da..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/MobileNet.pm +++ /dev/null @@ -1,518 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use AI::MXNet::Function::Parameters; -package AI::MXNet::Gluon::ModelZoo::Vision::MobileNet::RELU6; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - return $F->clip($x, a_min => 0, a_max => 6, name=>"relu6"); -} - -package AI::MXNet::Gluon::ModelZoo::Vision::MobileNet::LinearBottleneck; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; -has [qw/in_channels channels t stride/] => (is => 'ro', isa => 'Int', required => 1); -method python_constructor_arguments(){ [qw/in_channels channels t stride/] } - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::MobileNet::LinearBottleneck - LinearBottleneck used in MobileNetV2 model -=cut - -=head1 DESCRIPTION - - LinearBottleneck used in MobileNetV2 model from the - "Inverted Residuals and Linear Bottlenecks: - Mobile Networks for Classification, Detection and Segmentation" - paper. - - Parameters - ---------- - in_channels : Int - Number of input channels. - channels : Int - Number of output channels. - t : Int - Layer expansion ratio. - stride : Int - stride -=cut - -func _add_conv( - $out, $channels, :$kernel=1, :$stride=1, :$pad=0, - :$num_group=1, :$active=1, :$relu6=0 -) -{ - $out->add(nn->Conv2D($channels, $kernel, $stride, $pad, groups=>$num_group, use_bias=>0)); - $out->add(nn->BatchNorm(scale=>1)); - if($active) - { - $out->add($relu6 ? 
AI::MXNet::Gluon::ModelZoo::Vision::MobileNet::RELU6->new : nn->Activation('relu')); - } -} - -sub BUILD -{ - my $self = shift; - $self->use_shortcut($self->stride == 1 and $self->in_channels == $self->channels); - $self->name_scope(sub { - $self->out(nn->HybridSequential()); - _add_conv($self->out, $self->in_channels * $self->t, relu6=>1); - _add_conv( - $self->out, $self->in_channels * $self->t, kernel=>3, stride=>$self->stride, - pad=>1, num_group=>$self->in_channels * $self->t, relu6=>1 - ); - _add_conv($self->out, $self->channels, active=>0, relu6=>1); - }); -} - -method hybrid_forward($F, $x) -{ - my $out = $self->out->($x); - if($self->use_shortcut) - { - $out = $F->elemwise_add($out, $x); - } - return $out; -} - -package AI::MXNet::Gluon::ModelZoo::Vision::MobileNet; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::HybridBlock'; -has 'multiplier' => (is => 'ro', isa => 'Num', default => 1); -has 'classes' => (is => 'ro', isa => 'Int', default => 1000); -method python_constructor_arguments(){ [qw/multiplier classes/] } - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::MobileNet - MobileNet model from the - "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" -=cut - -=head1 DESCRIPTION - - MobileNet model from the - "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" - paper. - - Parameters - ---------- - multiplier : Num, default 1.0 - The width multiplier for controling the model size. Only multipliers that are no - less than 0.25 are supported. The actual number of channels is equal to the original - channel size multiplied by this multiplier. - classes : Int, default 1000 - Number of classes for the output layer. -=cut - -func _add_conv( - $out, :$channels=1, :$kernel=1, :$stride=1, :$pad=0, - :$num_group=1, :$active=1, :$relu6=0 -) -{ - $out->add(nn->Conv2D($channels, $kernel, $stride, $pad, groups=>$num_group, use_bias=>0)); - $out->add(nn->BatchNorm(scale=>1)); - if($active) - { - $out->add($relu6 ? 
AI::MXNet::Gluon::ModelZoo::Vision::MobileNet::RELU6->new : nn->Activation('relu')); - } -} - - -func _add_conv_dw($out, :$dw_channels=, :$channels=, :$stride=, :$relu6=0) -{ - _add_conv($out, channels=>$dw_channels, kernel=>3, stride=>$stride, - pad=>1, num_group=>$dw_channels, relu6=>$relu6); - _add_conv($out, channels=>$channels, relu6=>$relu6); -} - -sub BUILD -{ - my $self = shift; - $self->name_scope(sub { - $self->features(nn->HybridSequential(prefix=>'')); - $self->features->name_scope(sub { - _add_conv($self->features, channels=>int(32 * $self->multiplier), kernel=>3, pad=>1, stride=>2); - my $dw_channels = [map { int($_ * $self->multiplier) } (32, 64, (128)x2, (256)x2, (512)x6, 1024)]; - my $channels = [map { int($_ * $self->multiplier) } (64, (128)x2, (256)x2, (512)x6, (1024)x2)]; - my $strides = [(1, 2)x3, (1)x5, 2, 1]; - for(zip($dw_channels, $channels, $strides)) - { - my ($dwc, $c, $s) = @$_; - _add_conv_dw($self->features, dw_channels=>$dwc, channels=>$c, stride=>$s); - } - $self->features->add(nn->GlobalAvgPool2D()); - $self->features->add(nn->Flatten()); - }); - $self->output(nn->Dense($self->classes)); - }); -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - $x = $self->features->($x); - $x = $self->output->($x); - return $x; -} - -package AI::MXNet::Gluon::ModelZoo::Vision::MobileNetV2; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::HybridBlock'; -has 'multiplier' => (is => 'ro', isa => 'Num', default => 1); -has 'classes' => (is => 'ro', isa => 'Int', default => 1000); -method python_constructor_arguments(){ [qw/multiplier classes/] } - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::MobileNetV2 - MobileNet model from the - "Inverted Residuals and Linear Bottlenecks: Mobile Networks for Classification, Detection and Segmentation" -=cut - -=head1 DESCRIPTION - - MobileNetV2 model from the - "Inverted Residuals and Linear Bottlenecks: - Mobile Networks for Classification, Detection and Segmentation" - paper. - - Parameters - ---------- - multiplier : Num, default 1.0 - The width multiplier for controling the model size. Only multipliers that are no - less than 0.25 are supported. The actual number of channels is equal to the original - channel size multiplied by this multiplier. - classes : Int, default 1000 - Number of classes for the output layer. -=cut - -func _add_conv( - $out, $channels, :$kernel=1, :$stride=1, :$pad=0, - :$num_group=1, :$active=1, :$relu6=0 -) -{ - $out->add(nn->Conv2D($channels, $kernel, $stride, $pad, groups=>$num_group, use_bias=>0)); - $out->add(nn->BatchNorm(scale=>1)); - if($active) - { - $out->add($relu6 ? 
AI::MXNet::Gluon::ModelZoo::Vision::MobileNet::RELU6->new : nn->Activation('relu')); - } -} - -sub BUILD -{ - my $self = shift; - $self->name_scope(sub { - $self->features(nn->HybridSequential(prefix=>'features_')); - $self->features->name_scope(sub { - _add_conv( - $self->features, int(32 * $self->multiplier), kernel=>3, - stride=>2, pad=>1, relu6=>1 - ); - - my $in_channels_group = [map { int($_ * $self->multiplier) } (32, 16, (24)x2, (32)x3, (64)x4, (96)x3, (160)x3)]; - my $channels_group = [map { int($_ * $self->multiplier) } (16, (24)x2, (32)x3, (64)x4, (96)x3, (160)x3, 320)]; - my $ts = [1, (6)x16]; - my $strides = [(1, 2)x2, 1, 1, 2, (1)x6, 2, (1)x3]; - - for(zip($in_channels_group, $channels_group, $ts, $strides)) - { - my ($in_c, $c, $t, $s) = @$_; - $self->features->add( - AI::MXNet::Gluon::ModelZoo::Vision::MobileNet::LinearBottleneck->new( - in_channels=>$in_c, channels=>$c, - t=>$t, stride=>$s - ) - ); - } - - my $last_channels = $self->multiplier > 1 ? int(1280 * $self->multiplier) : 1280; - _add_conv($self->features, $last_channels, relu6=>1); - $self->features->add(nn->GlobalAvgPool2D()); - }); - - $self->output(nn->HybridSequential(prefix=>'output_')); - $self->output->name_scope(sub { - $self->output->add( - nn->Conv2D($self->classes, 1, use_bias=>0, prefix=>'pred_'), - nn->Flatten() - ); - }); - }); -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - $x = $self->features->($x); - $x = $self->output->($x); - return $x; -} - -package AI::MXNet::Gluon::ModelZoo::Vision; - -=head2 get_mobilenet - - MobileNet model from the - "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" - paper. - - Parameters - ---------- - $multiplier : Num - The width multiplier for controling the model size. Only multipliers that are no - less than 0.25 are supported. The actual number of channels is equal to the original - channel size multiplied by this multiplier. - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method get_mobilenet( - Num $multiplier, Bool :$pretrained=0, AI::MXNet::Context :$ctx=AI::MXNet::Context->cpu(), - Str :$root='~/.mxnet/models' -) -{ - my $net = AI::MXNet::Gluon::ModelZoo::Vision::MobileNet->new($multiplier); - if($pretrained) - { - my $version_suffix = sprintf("%.2f", $multiplier); - if($version_suffix eq '1.00' or $version_suffix eq '0.50') - { - $version_suffix =~ s/.$//; - } - $net->load_parameters( - AI::MXNet::Gluon::ModelZoo::ModelStore->get_model_file( - "mobilenet$version_suffix", - root=>$root - ), - ctx=>$ctx - ); - } - return $net; -} - -=head2 get_mobilenet_v2 - - MobileNetV2 model from the - "Inverted Residuals and Linear Bottlenecks: - Mobile Networks for Classification, Detection and Segmentation" - paper. - - Parameters - ---------- - $multiplier : Num - The width multiplier for controling the model size. Only multipliers that are no - less than 0.25 are supported. The actual number of channels is equal to the original - channel size multiplied by this multiplier. - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. 
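get_mobilenet above and get_mobilenet_v2 just below share the same naming step: the multiplier is rendered with sprintf("%.2f", ...) and the trailing zero is stripped for 1.00 and 0.50, producing model-store keys such as 'mobilenet0.75' and 'mobilenetv2_1.0' that match the SHA-1 table in ModelStore. A sketch:

    # Fetches 'mobilenet0.75' when pretrained.
    my $v1 = AI::MXNet::Gluon::ModelZoo::Vision->get_mobilenet(0.75, pretrained => 1);

    # Fetches 'mobilenetv2_1.0' ('1.00' is trimmed to '1.0').
    my $v2 = AI::MXNet::Gluon::ModelZoo::Vision->get_mobilenet_v2(1.0, pretrained => 1);
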
-=cut - -method get_mobilenet_v2( - Num $multiplier, Bool :$pretrained=0, AI::MXNet::Context :$ctx=AI::MXNet::Context->cpu(), - Str :$root='~/.mxnet/models' -) -{ - my $net = AI::MXNet::Gluon::ModelZoo::Vision::MobileNetV2->new($multiplier); - if($pretrained) - { - my $version_suffix = sprintf("%.2f", $multiplier); - if($version_suffix eq '1.00' or $version_suffix eq '0.50') - { - $version_suffix =~ s/.$//; - } - $net->load_parameters( - AI::MXNet::Gluon::ModelZoo::ModelStore->get_model_file( - "mobilenetv2_$version_suffix", - root=>$root - ), - ctx=>$ctx - ); - } - return $net; -} - -=head2 mobilenet1_0 - - MobileNet model from the - "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" - paper, with width multiplier 1.0. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. -=cut - -method mobilenet1_0(%kwargs) -{ - return __PACKAGE__->get_mobilenet(1.0, %kwargs); -} - -=head2 mobilenet_v2_1_0 - - MobileNetV2 model from the - "Inverted Residuals and Linear Bottlenecks: - Mobile Networks for Classification, Detection and Segmentation" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. -=cut - -method mobilenet_v2_1_0(%kwargs) -{ - return __PACKAGE__->get_mobilenet_v2(1.0, %kwargs); -} - -=head2 mobilenet0_75 - - MobileNet model from the - "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" - paper, with width multiplier 0.75. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. -=cut - -method mobilenet0_75(%kwargs) -{ - return __PACKAGE__->get_mobilenet(0.75, %kwargs); -} - -=head2 mobilenet_v2_0_75 - - MobileNetV2 model from the - "Inverted Residuals and Linear Bottlenecks: - Mobile Networks for Classification, Detection and Segmentation" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. -=cut - -method mobilenet_v2_0_75(%kwargs) -{ - return __PACKAGE__->get_mobilenet_v2(0.75, %kwargs); -} - -=head2 mobilenet0_5 - - MobileNet model from the - "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" - paper, with width multiplier 0.5. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. -=cut - -method mobilenet0_5(%kwargs) -{ - return __PACKAGE__->get_mobilenet(0.5, %kwargs); -} - -=head2 mobilenet_v2_0_5 - - MobileNetV2 model from the - "Inverted Residuals and Linear Bottlenecks: - Mobile Networks for Classification, Detection and Segmentation" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. 
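As elsewhere in this file, the named wrappers encode the width multiplier in the method name with underscores standing in for dots, so mobilenet_v2_0_5 (defined just below) is interchangeable with the get_mobilenet_v2(0.5, ...) form:

    my $net = AI::MXNet::Gluon::ModelZoo::Vision->mobilenet_v2_0_5(pretrained => 1);
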
-=cut - -method mobilenet_v2_0_5(%kwargs) -{ - return __PACKAGE__->get_mobilenet_v2(0.5, %kwargs); -} - -=head2 mobilenet0_25 - - MobileNet model from the - "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" - paper, with width multiplier 0.25. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. -=cut - -method mobilenet0_25(%kwargs) -{ - return __PACKAGE__->get_mobilenet(0.25, %kwargs); -} - -=head2 mobilenet_v2_0_25 - - MobileNetV2 model from the - "Inverted Residuals and Linear Bottlenecks: - Mobile Networks for Classification, Detection and Segmentation" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. -=cut - -method mobilenet_v2_0_25(%kwargs) -{ - return __PACKAGE__->get_mobilenet_v2(0.25, %kwargs); -} - -1; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/ResNet.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/ResNet.pm deleted file mode 100644 index adf6e8aaaa8f..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/ResNet.pm +++ /dev/null @@ -1,828 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use AI::MXNet::Function::Parameters; - -package AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BasicBlockV1; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BasicBlockV1 - BasicBlock V1 from `"Deep Residual Learning for Image Recognition" -=cut - -=head1 DESCRIPTION - - BasicBlock V1 from `"Deep Residual Learning for Image Recognition" - `_ paper. - This is used for ResNet V1 for 18, 34 layers. - - Parameters - ---------- - channels : Int - Number of output channels. - stride : Int - Stride size. - downsample : Bool, default 0 - Whether to downsample the input. - in_channels : Int, default 0 - Number of input channels. Default is 0, to infer from the graph. 
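One subtlety in the residual block classes that follow: downsample is declared 'rw' because it does double duty; it arrives as a boolean flag and BUILD then overwrites it with either a 1x1-conv-plus-BatchNorm shortcut or undef. Per python_constructor_arguments the positional order is (channels, stride, downsample), so a hypothetical direct instantiation looks like:

    # 64-channel BasicBlock V1, stride 2, with a downsampling shortcut.
    my $block = AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BasicBlockV1->new(64, 2, 1);
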
-=cut - -has ['channels', - 'stride'] => (is => 'ro', isa => 'Int', required => 1); -has 'downsample' => (is => 'rw', default => 0); -has 'in_channels' => (is => 'ro', isa => 'Int', default => 0); -method python_constructor_arguments() { [qw/channels stride downsample/] } -func _conv3x3($channels, $stride, $in_channels) -{ - return nn->Conv2D( - $channels, kernel_size=>3, strides=>$stride, padding=>1, - use_bias=>0, in_channels=>$in_channels - ); -} - -sub BUILD -{ - my $self = shift; - $self->body(nn->HybridSequential(prefix=>'')); - $self->body->add(_conv3x3($self->channels, $self->stride, $self->in_channels)); - $self->body->add(nn->BatchNorm()); - $self->body->add(nn->Activation('relu')); - $self->body->add(_conv3x3($self->channels, 1, $self->channels)); - $self->body->add(nn->BatchNorm()); - if($self->downsample) - { - $self->downsample(nn->HybridSequential(prefix=>'')); - $self->downsample->add( - nn->Conv2D($self->channels, kernel_size=>1, strides=>$self->stride, - use_bias=>0, in_channels=>$self->in_channels) - ); - $self->downsample->add(nn->BatchNorm()); - } - else - { - $self->downsample(undef); - } -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - my $residual = $x; - $x = $self->body->($x); - if(defined $self->downsample) - { - $residual = $self->downsample->($residual); - } - $x = $F->Activation($residual+$x, act_type=>'relu'); - return $x; -} - -package AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BottleneckV1; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BottleneckV1 - Bottleneck V1 from "Deep Residual Learning for Image Recognition" -=cut - -=head1 DESCRIPTION - - Bottleneck V1 from "Deep Residual Learning for Image Recognition" - paper. - This is used for ResNet V1 for 50, 101, 152 layers. - - Parameters - ---------- - channels : int - Number of output channels. - stride : int - Stride size. - downsample : bool, default False - Whether to downsample the input. - in_channels : int, default 0 - Number of input channels. Default is 0, to infer from the graph. 
-=cut - -has ['channels', - 'stride'] => (is => 'ro', isa => 'Int', required => 1); -has 'downsample' => (is => 'rw', default => 0); -has 'in_channels' => (is => 'ro', isa => 'Int', default => 0); -method python_constructor_arguments() { [qw/channels stride downsample/] } -func _conv3x3($channels, $stride, $in_channels) -{ - return nn->Conv2D( - $channels, kernel_size=>3, strides=>$stride, padding=>1, - use_bias=>0, in_channels=>$in_channels - ); -} - -sub BUILD -{ - my $self = shift; - $self->body(nn->HybridSequential(prefix=>'')); - $self->body->add(nn->Conv2D(int($self->channels/4), kernel_size=>1, strides=>$self->stride)); - $self->body->add(nn->BatchNorm()); - $self->body->add(nn->Activation('relu')); - $self->body->add(_conv3x3(int($self->channels/4), 1, int($self->channels/4))); - $self->body->add(nn->BatchNorm()); - $self->body->add(nn->Activation('relu')); - $self->body->add(nn->Conv2D($self->channels, kernel_size=>1, strides=>1)); - $self->body->add(nn->BatchNorm()); - if($self->downsample) - { - $self->downsample(nn->HybridSequential(prefix=>'')); - $self->downsample->add( - nn->Conv2D($self->channels, kernel_size=>1, strides=>$self->stride, - use_bias=>0, in_channels=>$self->in_channels) - ); - $self->downsample->add(nn->BatchNorm()); - } - else - { - $self->downsample(undef); - } -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - my $residual = $x; - $x = $self->body->($x); - if(defined $self->downsample) - { - $residual = $self->downsample->($residual); - } - $x = $F->Activation($residual+$x, act_type=>'relu'); - return $x; -} - -package AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BasicBlockV2; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BasicBlockV2 - BasicBlock V2 from "Identity Mappings in Deep Residual Networks" -=cut - -=head1 DESCRIPTION - - Bottleneck V2 from "Identity Mappings in Deep Residual Networks" - paper. - This is used for ResNet V2 for 18, 34 layers. - - Parameters - ---------- - channels : Int - Number of output channels. - stride : Int - Stride size. - downsample : Bool, default 0 - Whether to downsample the input. - in_channels : Int, default 0 - Number of input channels. Default is 0, to infer from the graph. 
-=cut - -has ['channels', - 'stride'] => (is => 'ro', isa => 'Int', required => 1); -has 'downsample' => (is => 'rw', default => 0); -has 'in_channels' => (is => 'ro', isa => 'Int', default => 0); -method python_constructor_arguments() { [qw/channels stride downsample/] } -func _conv3x3($channels, $stride, $in_channels) -{ - return nn->Conv2D( - $channels, kernel_size=>3, strides=>$stride, padding=>1, - use_bias=>0, in_channels=>$in_channels - ); -} - -sub BUILD -{ - my $self = shift; - $self->bn1(nn->BatchNorm()); - $self->conv1(_conv3x3($self->channels, $self->stride, $self->in_channels)); - $self->bn2(nn->BatchNorm()); - $self->conv2(_conv3x3($self->channels, 1, $self->channels)); - if($self->downsample) - { - $self->downsample( - nn->Conv2D($self->channels, kernel_size=>1, strides=>$self->stride, - use_bias=>0, in_channels=>$self->in_channels) - ); - } - else - { - $self->downsample(undef); - } -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - my $residual = $x; - $x = $self->bn1->($x); - $x = $F->Activation($x, act_type=>'relu'); - if(defined $self->downsample) - { - $residual = $self->downsample->($x); - } - $x = $self->conv1->($x); - - $x = $self->bn2->($x); - $x = $F->Activation($x, act_type=>'relu'); - $x = $self->conv2->($x); - - return $x + $residual; -} - - -package AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BottleneckV2; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BottleneckV2 - Bottleneck V2 from "Identity Mappings in Deep Residual Networks" -=cut - -=head1 DESCRIPTION - - Bottleneck V2 from "Identity Mappings in Deep Residual Networks" - paper. - This is used for ResNet V2 for 50, 101, 152 layers. - - Parameters - ---------- - channels : int - Number of output channels. - stride : int - Stride size. - downsample : bool, default False - Whether to downsample the input. - in_channels : int, default 0 - Number of input channels. Default is 0, to infer from the graph. 
-=cut - -has ['channels', - 'stride'] => (is => 'ro', isa => 'Int', required => 1); -has 'downsample' => (is => 'rw', default => 0); -has 'in_channels' => (is => 'ro', isa => 'Int', default => 0); -method python_constructor_arguments() { [qw/channels stride downsample/] } -func _conv3x3($channels, $stride, $in_channels) -{ - return nn->Conv2D( - $channels, kernel_size=>3, strides=>$stride, padding=>1, - use_bias=>0, in_channels=>$in_channels - ); -} - -sub BUILD -{ - my $self = shift; - $self->bn1(nn->BatchNorm()); - $self->conv1(nn->Conv2D(int($self->channels/4), kernel_size=>1, strides=>1, use_bias=>0)); - $self->bn2(nn->BatchNorm()); - $self->conv2(_conv3x3(int($self->channels/4), $self->stride, int($self->channels/4))); - $self->bn3(nn->BatchNorm()); - $self->conv3(nn->Conv2D($self->channels, kernel_size=>1, strides=>1, use_bias=>0)); - if($self->downsample) - { - $self->downsample( - nn->Conv2D($self->channels, kernel_size=>1, strides=>$self->stride, - use_bias=>0, in_channels=>$self->in_channels) - ); - } - else - { - $self->downsample(undef); - } -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - my $residual = $x; - $x = $self->bn1->($x); - $x = $F->Activation($x, act_type=>'relu'); - if(defined $self->downsample) - { - $residual = $self->downsample->($x); - } - $x = $self->conv1->($x); - - $x = $self->bn2->($x); - $x = $F->Activation($x, act_type=>'relu'); - $x = $self->conv2->($x); - - $x = $self->bn3->($x); - $x = $F->Activation($x, act_type=>'relu'); - $x = $self->conv3->($x); - - return $x + $residual; -} - - -# Nets -package AI::MXNet::Gluon::ModelZoo::Vision::ResNet::V1; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; -use AI::MXNet::Base; - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::ResNet::V1 - ResNet V1 model from "Deep Residual Learning for Image Recognition" -=cut - -=head1 DESCRIPTION - - ResNet V1 model from from "Deep Residual Learning for Image Recognition" - paper. - - Parameters - ---------- - block : AI::MXNet::Gluon::HybridBlock - Class for the residual block. Options are AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BasicBlockV1, - AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BottleneckV1. - layers : array ref of Int - Numbers of layers in each block - channels : array ref of Int - Numbers of channels in each block. Length should be one larger than layers list. - classes : int, default 1000 - Number of classification classes. - thumbnail : bool, default 0 - Enable thumbnail. 
-=cut - -has 'block' => (is => 'ro', isa => 'Str', required => 1); -has ['layers', - 'channels'] => (is => 'ro', isa => 'ArrayRef[Int]', required => 1); -has 'classes' => (is => 'ro', isa => 'Int', default => 1000); -has 'thumbnail' => (is => 'ro', isa => 'Bool', default => 0); -method python_constructor_arguments() { [qw/block layers channels classes thumbnail/] } -func _conv3x3($channels, $stride, $in_channels) -{ - return nn->Conv2D( - $channels, kernel_size=>3, strides=>$stride, padding=>1, - use_bias=>0, in_channels=>$in_channels - ); -} - -sub BUILD -{ - my $self = shift; - assert(@{ $self->layers } == (@{ $self->channels } - 1)); - $self->name_scope(sub { - $self->features(nn->HybridSequential(prefix=>'')); - if($self->thumbnail) - { - $self->features->add(_conv3x3($self->channels->[0], 1, 0)); - } - else - { - $self->features->add(nn->Conv2D($self->channels->[0], 7, 2, 3, use_bias=>0)); - $self->features->add(nn->BatchNorm()); - $self->features->add(nn->Activation('relu')); - $self->features->add(nn->MaxPool2D(3, 2, 1)); - } - for(enumerate($self->layers)) - { - my ($i, $num_layer) = @$_; - my $stride = $i == 0 ? 1 : 2; - $self->features->add( - $self->_make_layer( - $self->block, $num_layer, $self->channels->[$i+1], - $stride, $i+1, in_channels=>$self->channels->[$i] - ) - ); - } - $self->features->add(nn->GlobalAvgPool2D()); - $self->output(nn->Dense($self->classes, in_units=>$self->channels->[-1])); - }); -} - -method _make_layer($block, $layers, $channels, $stride, $stage_index, :$in_channels=0) -{ - my $layer = nn->HybridSequential(prefix=>"stage${stage_index}_"); - $layer->name_scope(sub { - $layer->add( - $block->new( - $channels, $stride, $channels != $in_channels, in_channels=>$in_channels, - prefix=>'' - ) - ); - for(1..$layers-1) - { - $layer->add($block->new($channels, 1, 0, in_channels=>$channels, prefix=>'')); - } - }); - return $layer; -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - $x = $self->features->($x); - $x = $self->output->($x); - return $x; -} - - -package AI::MXNet::Gluon::ModelZoo::Vision::ResNet::V2; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; -use AI::MXNet::Base; - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::ResNet::V2 - ResNet V2 model from "Identity Mappings in Deep Residual Networks" -=cut - -=head1 DESCRIPTION - - ResNet V2 model from "Identity Mappings in Deep Residual Networks" - paper. - - Parameters - ---------- - block : AI::MXNet::Gluon::HybridBlock - Class for the residual block. Options are AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BasicBlockV2, - AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BottleneckV2. - layers : array ref of Int - Numbers of layers in each block - channels : array ref of Int - Numbers of channels in each block. Length should be one larger than layers list. - classes : int, default 1000 - Number of classification classes. - thumbnail : bool, default 0 - Enable thumbnail. 
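The get_resnet constructor further below ties these classes together: %resnet_spec maps $num_layers to a block type, per-stage layer counts, and channel widths, while $version (1 or 2) indexes the net and block class tables, with invalid versions rejected via confess. The factory and its named wrappers are equivalent (sketch, same installation assumptions as above):

    # ResNet-50 V2, pretrained ('resnet50_v2' in the model store):
    my $a = AI::MXNet::Gluon::ModelZoo::Vision->get_resnet(2, 50, pretrained => 1);
    my $b = AI::MXNet::Gluon::ModelZoo::Vision->resnet50_v2(pretrained => 1);
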
-=cut - -has 'block' => (is => 'ro', isa => 'Str', required => 1); -has ['layers', - 'channels'] => (is => 'ro', isa => 'ArrayRef[Int]', required => 1); -has 'classes' => (is => 'ro', isa => 'Int', default => 1000); -has 'thumbnail' => (is => 'ro', isa => 'Bool', default => 0); -method python_constructor_arguments() { [qw/block layers channels classes thumbnail/] } -func _conv3x3($channels, $stride, $in_channels) -{ - return nn->Conv2D( - $channels, kernel_size=>3, strides=>$stride, padding=>1, - use_bias=>0, in_channels=>$in_channels - ); -} - -sub BUILD -{ - my $self = shift; - assert(@{ $self->layers } == (@{ $self->channels } - 1)); - $self->name_scope(sub { - $self->features(nn->HybridSequential(prefix=>'')); - $self->features->add(nn->BatchNorm(scale=>0, center=>0)); - if($self->thumbnail) - { - $self->features->add(_conv3x3($self->channels->[0], 1, 0)); - } - else - { - $self->features->add(nn->Conv2D($self->channels->[0], 7, 2, 3, use_bias=>0)); - $self->features->add(nn->BatchNorm()); - $self->features->add(nn->Activation('relu')); - $self->features->add(nn->MaxPool2D(3, 2, 1)); - } - my $in_channels = $self->channels->[0]; - for(enumerate($self->layers)) - { - my ($i, $num_layer) = @$_; - my $stride = $i == 0 ? 1 : 2; - $self->features->add( - $self->_make_layer( - $self->block, $num_layer, $self->channels->[$i+1], - $stride, $i+1, in_channels=>$in_channels - ) - ); - $in_channels = $self->channels->[$i+1]; - } - $self->features->add(nn->BatchNorm()); - $self->features->add(nn->Activation('relu')); - $self->features->add(nn->GlobalAvgPool2D()); - $self->features->add(nn->Flatten()); - $self->output(nn->Dense($self->classes, in_units=>$in_channels)); - }); -} - -method _make_layer($block, $layers, $channels, $stride, $stage_index, :$in_channels=0) -{ - my $layer = nn->HybridSequential(prefix=>"stage${stage_index}_"); - $layer->name_scope(sub { - $layer->add( - $block->new( - $channels, $stride, $channels != $in_channels, in_channels=>$in_channels, - prefix=>'' - ) - ); - for(1..$layers-1) - { - $layer->add($block->new($channels, 1, 0, in_channels=>$channels, prefix=>'')); - } - }); - return $layer; -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - $x = $self->features->($x); - $x = $self->output->($x); - return $x; -} - -package AI::MXNet::Gluon::ModelZoo::Vision; - -# Specification -my %resnet_spec = ( - 18 => ['basic_block', [2, 2, 2, 2], [64, 64, 128, 256, 512]], - 34 => ['basic_block', [3, 4, 6, 3], [64, 64, 128, 256, 512]], - 50 => ['bottle_neck', [3, 4, 6, 3], [64, 256, 512, 1024, 2048]], - 101 => ['bottle_neck', [3, 4, 23, 3], [64, 256, 512, 1024, 2048]], - 152 => ['bottle_neck', [3, 8, 36, 3], [64, 256, 512, 1024, 2048]] -); - -my @resnet_net_versions = qw(AI::MXNet::Gluon::ModelZoo::Vision::ResNet::V1 AI::MXNet::Gluon::ModelZoo::Vision::ResNet::V2); -my @resnet_block_versions = ( - { - basic_block => 'AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BasicBlockV1', - bottle_neck => 'AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BottleneckV1' - }, - { - basic_block => 'AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BasicBlockV2', - bottle_neck => 'AI::MXNet::Gluon::ModelZoo::Vision::ResNet::BottleneckV2' - }, -); - -=head2 get_resnet - - ResNet V1 model from "Deep Residual Learning for Image Recognition" - paper. - ResNet V2 model from "Identity Mappings in Deep Residual Networks" - paper. - - Parameters - ---------- - $version : Int - Version of ResNet. Options are 1, 2. - $num_layers : Int - Numbers of layers. Options are 18, 34, 50, 101, 152. 
- :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -# Constructor -method get_resnet( - Int $version, Int $num_layers, Bool :$pretrained=0, - AI::MXNet::Context :$ctx=AI::MXNet::Context->cpu(), - Str :$root='~/.mxnet/models', - Maybe[Int] :$classes=, - Maybe[Bool] :$thumbnail= -) -{ - my ($block_type, $layers, $channels) = @{ $resnet_spec{$num_layers} }; - my $resnet_class = $resnet_net_versions[$version-1]; - confess("invalid resnet $version [$version], can be 1,2") unless $resnet_class; - my $block_class = $resnet_block_versions[$version-1]{$block_type}; - my $net = $resnet_class->new( - $block_class, $layers, $channels, - (defined($classes) ? (classes => $classes) : ()), - (defined($thumbnail) ? (thumbnail => $thumbnail) : ()) - ); - if($pretrained) - { - $net->load_parameters( - AI::MXNet::Gluon::ModelZoo::ModelStore->get_model_file( - "resnet${num_layers}_v$version", - root=>$root - ), - ctx=>$ctx - ); - } - return $net; -} - -=head2 resnet18_v1 - - ResNet-18 V1 model from "Deep Residual Learning for Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet18_v1(%kwargs) -{ - return __PACKAGE__->get_resnet(1, 18, %kwargs); -} - -=head2 resnet34_v1 - - ResNet-34 V1 model from "Deep Residual Learning for Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet34_v1(%kwargs) -{ - return __PACKAGE__->get_resnet(1, 34, %kwargs); -} - -=head2 resnet50_v1 - - ResNet-50 V1 model from "Deep Residual Learning for Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet50_v1(%kwargs) -{ - return __PACKAGE__->get_resnet(1, 50, %kwargs); -} - -=head2 resnet101_v1 - - ResNet-101 V1 model from "Deep Residual Learning for Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet101_v1(%kwargs) -{ - return __PACKAGE__->get_resnet(1, 101, %kwargs); -} - -=head2 resnet152_v1 - - ResNet-152 V1 model from "Deep Residual Learning for Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. 
- :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet152_v1(%kwargs) -{ - return __PACKAGE__->get_resnet(1, 152, %kwargs); -} - -=head2 resnet18_v2 - - ResNet-18 V2 model from "Identity Mappings in Deep Residual Networks" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet18_v2(%kwargs) -{ - return __PACKAGE__->get_resnet(2, 18, %kwargs); -} - -=head2 resnet34_v2 - - ResNet-34 V2 model from "Identity Mappings in Deep Residual Networks" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet34_v2(%kwargs) -{ - return __PACKAGE__->get_resnet(2, 34, %kwargs); -} - -=head2 resnet50_v2 - - ResNet-50 V2 model from "Identity Mappings in Deep Residual Networks" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet50_v2(%kwargs) -{ - return __PACKAGE__->get_resnet(2, 50, %kwargs); -} - -=head2 resnet101_v2 - - ResNet-101 V2 model from "Identity Mappings in Deep Residual Networks" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet101_v2(%kwargs) -{ - return __PACKAGE__->get_resnet(2, 101, %kwargs); -} - -=head2 resnet152_v2 - - ResNet-152 V2 model from "Identity Mappings in Deep Residual Networks" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method resnet152_v2(%kwargs) -{ - return __PACKAGE__->get_resnet(2, 152, %kwargs); -} - -1; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/SqueezeNet.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/SqueezeNet.pm deleted file mode 100644 index 3cbe8dd9e8f3..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/SqueezeNet.pm +++ /dev/null @@ -1,212 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::ModelZoo::Vision::SqueezeNet; -use strict; -use warnings; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Types; -extends 'AI::MXNet::Gluon::HybridBlock'; - -func _make_fire($squeeze_channels, $expand1x1_channels, $expand3x3_channels) -{ - my $out = nn->HybridSequential(prefix=>''); - $out->add(_make_fire_conv($squeeze_channels, 1)); - - my $paths = nn->HybridConcurrent(axis=>1, prefix=>''); - $paths->add(_make_fire_conv($expand1x1_channels, 1)); - $paths->add(_make_fire_conv($expand3x3_channels, 3, 1)); - $out->add($paths); - - return $out; -} - -func _make_fire_conv($channels, $kernel_size, $padding=0) -{ - my $out = nn->HybridSequential(prefix=>''); - $out->add(nn->Conv2D($channels, $kernel_size, padding=>$padding)); - $out->add(nn->Activation('relu')); - return $out; -} - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::SqueezeNet - SqueezeNet model from the "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size" -=cut - -=head1 DESCRIPTION - - SqueezeNet model from the "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters - and <0.5MB model size" paper. - SqueezeNet 1.1 model from the official SqueezeNet repo - <https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1>. - SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters - than SqueezeNet 1.0, without sacrificing accuracy. - - Parameters - ---------- - version : Str - Version of squeezenet. Options are '1.0', '1.1'. - classes : Int, default 1000 - Number of classification classes.
-=cut - -has 'version' => (is => 'ro', isa => enum([qw[1.0 1.1]]), required => 1); -has 'classes' => (is => 'ro', isa => 'Int', default => 1000); -method python_constructor_arguments() { [qw/version classes/] } - -sub BUILD -{ - my $self = shift; - $self->name_scope(sub { - $self->features(nn->HybridSequential(prefix=>'')); - if($self->version eq '1.0') - { - $self->features->add(nn->Conv2D(96, kernel_size=>7, strides=>2)); - $self->features->add(nn->Activation('relu')); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2, ceil_mode=>1)); - $self->features->add(_make_fire(16, 64, 64)); - $self->features->add(_make_fire(16, 64, 64)); - $self->features->add(_make_fire(32, 128, 128)); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2, ceil_mode=>1)); - $self->features->add(_make_fire(32, 128, 128)); - $self->features->add(_make_fire(48, 192, 192)); - $self->features->add(_make_fire(48, 192, 192)); - $self->features->add(_make_fire(64, 256, 256)); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2, ceil_mode=>1)); - $self->features->add(_make_fire(64, 256, 256)); - } - else - { - $self->features->add(nn->Conv2D(64, kernel_size=>3, strides=>2)); - $self->features->add(nn->Activation('relu')); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2, ceil_mode=>1)); - $self->features->add(_make_fire(16, 64, 64)); - $self->features->add(_make_fire(16, 64, 64)); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2, ceil_mode=>1)); - $self->features->add(_make_fire(32, 128, 128)); - $self->features->add(_make_fire(32, 128, 128)); - $self->features->add(nn->MaxPool2D(pool_size=>3, strides=>2, ceil_mode=>1)); - $self->features->add(_make_fire(48, 192, 192)); - $self->features->add(_make_fire(48, 192, 192)); - $self->features->add(_make_fire(64, 256, 256)); - $self->features->add(_make_fire(64, 256, 256)); - } - $self->features->add(nn->Dropout(0.5)); - - $self->output(nn->HybridSequential(prefix=>'')); - $self->output->add(nn->Conv2D($self->classes, kernel_size=>1)); - $self->output->add(nn->Activation('relu')); - $self->output->add(nn->AvgPool2D(13)); - $self->output->add(nn->Flatten()); - }); -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - $x = $self->features->($x); - $x = $self->output->($x); - return $x; -} - - -package AI::MXNet::Gluon::ModelZoo::Vision; - -=head2 get_squeezenet - - SqueezeNet model from the "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters - and <0.5MB model size" paper. - SqueezeNet 1.1 model from the official SqueezeNet repo - <https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1>. - SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters - than SqueezeNet 1.0, without sacrificing accuracy. - - Parameters - ---------- - $version : Str - Version of squeezenet. Options are '1.0', '1.1'. - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters.
-=cut - -method get_squeezenet( - Str $version, Bool :$pretrained=0, AI::MXNet::Context :$ctx=AI::MXNet::Context->cpu(), - Str :$root='~/.mxnet/models', Int :$classes=1000 -) -{ - my $net = AI::MXNet::Gluon::ModelZoo::Vision::SqueezeNet->new($version, $classes); - if($pretrained) - { - $net->load_parameters( - AI::MXNet::Gluon::ModelZoo::ModelStore->get_model_file( - "squeezenet$version", - root=>$root - ), - ctx=>$ctx - ); - } - return $net; -} - -=head2 squeezenet1_0 - - SqueezeNet 1.0 model from the "SqueezeNet: AlexNet-level accuracy with 50x fewer parameters - and <0.5MB model size" paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method squeezenet1_0(%kwargs) -{ - return __PACKAGE__->get_squeezenet('1.0', %kwargs); -} - -=head2 squeezenet1_1 - - SqueezeNet 1.1 model from the official SqueezeNet repo - <https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1>. - SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters - than SqueezeNet 1.0, without sacrificing accuracy. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default CPU - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method squeezenet1_1(%kwargs) -{ - return __PACKAGE__->get_squeezenet('1.1', %kwargs); -} - -1; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/VGG.pm b/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/VGG.pm deleted file mode 100644 index 03443a78b02e..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/lib/AI/MXNet/Gluon/ModelZoo/Vision/VGG.pm +++ /dev/null @@ -1,321 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements.  See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership.  The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License.  You may obtain a copy of the License at -# -#   http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied.  See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use AI::MXNet::Function::Parameters; -package AI::MXNet::Gluon::ModelZoo::Vision::VGG; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; -use AI::MXNet::Base; - -=head1 NAME - - AI::MXNet::Gluon::ModelZoo::Vision::VGG - VGG model from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" -=cut - -=head1 DESCRIPTION - - VGG model from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. - - Parameters - ---------- - layers : array ref of Int - Numbers of layers in each feature block. - filters : array ref of Int - Numbers of filters in each feature block. List length should match the layers. - classes : Int, default 1000 - Number of classification classes.
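As an aside on the SqueezeNet wrappers just above: the deleted test suite (test_gluon_model_zoo.t, further down in this diff) exercised the pretrained path for exactly one model, squeezenet1.1. A minimal sketch of that call (root is where the downloaded parameters are cached):

    use AI::MXNet qw(mx);
    use AI::MXNet::Gluon::ModelZoo qw(get_model);

    my $net = get_model('squeezenet1.1', pretrained => 1, root => 'model/');
    my $out = $net->(mx->nd->random->uniform(shape => [2, 3, 224, 224]));
    $out->wait_to_read;    # block until the forward pass completes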
- batch_norm : Bool, default 0 - Use batch normalization. -=cut -method python_constructor_arguments() { [qw/layers filters classes batch_norm/] } -has ['layers', - 'filters'] => (is => 'ro', isa => 'ArrayRef[Int]', required => 1); -has 'classes' => (is => 'ro', isa => 'Int', default => 1000); -has 'batch_norm' => (is => 'ro', isa => 'Bool', default => 0); - -sub BUILD -{ - my $self = shift; - assert(@{ $self->layers } == @{ $self->filters }); - $self->name_scope(sub { - $self->features($self->_make_features()); - $self->features->add(nn->Dense(4096, activation=>'relu', - weight_initializer=>'normal', - bias_initializer=>'zeros')); - $self->features->add(nn->Dropout(rate=>0.5)); - $self->features->add(nn->Dense(4096, activation=>'relu', - weight_initializer=>'normal', - bias_initializer=>'zeros')); - $self->features->add(nn->Dropout(rate=>0.5)); - $self->output(nn->Dense($self->classes, - weight_initializer=>'normal', - bias_initializer=>'zeros')); - }); -} - -method _make_features() -{ - my $featurizer = nn->HybridSequential(prefix=>''); - for(enumerate($self->layers)) - { - my ($i, $num) = @$_; - for(0..$num-1) - { - $featurizer->add( - nn->Conv2D( - $self->filters->[$i], kernel_size => 3, padding => 1, - weight_initializer => mx->init->Xavier( - rnd_type => 'gaussian', - factor_type => 'out', - magnitude => 2 - ), - bias_initializer=>'zeros' - ) - ); - if($self->batch_norm) - { - $featurizer->add(nn->BatchNorm()); - } - $featurizer->add(nn->Activation('relu')); - } - $featurizer->add(nn->MaxPool2D(strides=>2)); - } - return $featurizer; -} - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - $x = $self->features->($x); - $x = $self->output->($x); - return $x; -} - -package AI::MXNet::Gluon::ModelZoo::Vision; - -# Specification -my %vgg_spec = ( - 11 => [[1, 1, 2, 2, 2], [64, 128, 256, 512, 512]], - 13 => [[2, 2, 2, 2, 2], [64, 128, 256, 512, 512]], - 16 => [[2, 2, 3, 3, 3], [64, 128, 256, 512, 512]], - 19 => [[2, 2, 4, 4, 4], [64, 128, 256, 512, 512]] -); - -=head2 get_vgg - - VGG model from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. - - Parameters - ---------- - $num_layers : Int - Number of layers for the variant of VGG. Options are 11, 13, 16, 19. - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method get_vgg( - Int $num_layers, Bool :$pretrained=0, AI::MXNet::Context :$ctx=AI::MXNet::Context->cpu(), - Str :$root='~/.mxnet/models', Int :$classes=1000, Bool :$batch_norm=0 -) -{ - my ($layers, $filters) = @{ $vgg_spec{$num_layers} }; - my $net = AI::MXNet::Gluon::ModelZoo::Vision::VGG->new($layers, $filters, $classes, $batch_norm); - if($pretrained) - { - $net->load_parameters( - AI::MXNet::Gluon::ModelZoo::ModelStore->get_model_file( - "vgg$num_layers".($batch_norm ? '_bn' : ''), - root=>$root - ), - ctx=>$ctx - ); - } - return $net; -} - -=head2 vgg11 - - VGG-11 model from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters.
-=cut - -method vgg11(%kwargs) -{ - return __PACKAGE__->get_vgg(11, %kwargs); -} - -=head2 vgg13 - - VGG-13 model from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method vgg13(%kwargs) -{ - return __PACKAGE__->get_vgg(13, %kwargs); -} - -=head2 vgg16 - - VGG-16 model from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method vgg16(%kwargs) -{ - return __PACKAGE__->get_vgg(16, %kwargs); -} - -=head2 vgg19 - - VGG-19 model from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method vgg19(%kwargs) -{ - return __PACKAGE__->get_vgg(19, %kwargs); -} - -=head2 vgg11_bn - - VGG-11 model with batch normalization from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method vgg11_bn(%kwargs) -{ - $kwargs{batch_norm} = 1; - return __PACKAGE__->get_vgg(11, %kwargs); -} - -=head2 vgg13_bn - - VGG-13 model with batch normalization from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method vgg13_bn(%kwargs) -{ - $kwargs{batch_norm} = 1; - return __PACKAGE__->get_vgg(13, %kwargs); -} - -=head2 vgg16_bn - - VGG-16 model with batch normalization from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. - - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method vgg16_bn(%kwargs) -{ - $kwargs{batch_norm} = 1; - return __PACKAGE__->get_vgg(16, %kwargs); -} - -=head2 vgg19_bn - - VGG-19 model with batch normalization from the "Very Deep Convolutional Networks for Large-Scale Image Recognition" - paper. 
- - Parameters - ---------- - :$pretrained : Bool, default 0 - Whether to load the pretrained weights for model. - :$ctx : AI::MXNet::Context, default AI::MXNet::Context->cpu - The context in which to load the pretrained weights. - :$root : Str, default '~/.mxnet/models' - Location for keeping the model parameters. -=cut - -method vgg19_bn(%kwargs) -{ - $kwargs{batch_norm} = 1; - return __PACKAGE__->get_vgg(19, %kwargs); -} - -1; \ No newline at end of file diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/t/AI-MXNet-Gluon-ModelZoo.t b/perl-package/AI-MXNet-Gluon-ModelZoo/t/AI-MXNet-Gluon-ModelZoo.t deleted file mode 100644 index 30014547759e..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/t/AI-MXNet-Gluon-ModelZoo.t +++ /dev/null @@ -1,21 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 1; -BEGIN { use_ok('AI::MXNet::Gluon::ModelZoo') }; diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/t/test_gluon_model_zoo.t b/perl-package/AI-MXNet-Gluon-ModelZoo/t/test_gluon_model_zoo.t deleted file mode 100644 index d782c2027583..000000000000 --- a/perl-package/AI-MXNet-Gluon-ModelZoo/t/test_gluon_model_zoo.t +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
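Rounding out the VGG section above: the *_bn wrappers differ from the plain ones only by forcing batch_norm => 1 before delegating to get_vgg, which then also selects the "_bn" pretrained parameter file. A sketch of the two equivalent spellings (untrained nets still need initialize() before a forward pass):

    use AI::MXNet::Gluon::ModelZoo qw(get_model);

    my $net_a = get_model('vgg16_bn');
    my $net_b = AI::MXNet::Gluon::ModelZoo::Vision->vgg16(batch_norm => 1);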
- -use strict; -use warnings; -use AI::MXNet::Gluon::ModelZoo qw(get_model); -use Test::More tests => 34; - -sub test_models -{ - my @all_models = ('resnet34_v1', 'resnet18_v1', 'resnet50_v1', 'resnet101_v1', 'resnet152_v1', - 'resnet18_v2', 'resnet34_v2', 'resnet50_v2', 'resnet101_v2', 'resnet152_v2', - 'vgg11', 'vgg13', 'vgg16', 'vgg19', - 'vgg11_bn', 'vgg13_bn', 'vgg16_bn', 'vgg19_bn', - 'alexnet', 'inceptionv3', - 'densenet121', 'densenet161', 'densenet169', 'densenet201', - 'squeezenet1.0', 'squeezenet1.1', - 'mobilenet1.0', 'mobilenet0.75', 'mobilenet0.5', 'mobilenet0.25', - 'mobilenetv2_1.0', 'mobilenetv2_0.75', 'mobilenetv2_0.5', 'mobilenetv2_0.25'); - my %pretrained_to_test = ('squeezenet1.1' => 1); - - for my $model_name (@all_models) - { - my $test_pretrain = exists $pretrained_to_test{ $model_name }; - my $model = get_model($model_name, pretrained=>$test_pretrain, root=>'model/'); - my $data_shape = $model_name !~ /inception/ ? [2, 3, 224, 224] : [2, 3, 299, 299]; - if(not $test_pretrain) - { - $model->collect_params()->initialize(); - } - $model->hybridize(); - $model->(mx->nd->random->uniform(shape=>$data_shape))->wait_to_read; - ok(1, "forward for $model_name"); - } -} - -test_models(); diff --git a/perl-package/AI-MXNet/Changes b/perl-package/AI-MXNet/Changes deleted file mode 100644 index 6807d79d088a..000000000000 --- a/perl-package/AI-MXNet/Changes +++ /dev/null @@ -1,100 +0,0 @@ -Revision history for Perl extension AI::MXNet -1.5 Sun Feb 16 19:56:17 PST 2020 - - Runtime features - - INT64 Tensor support - -1.4 Mon Feb 18 11:54:07 PST 2019 - - Two more gluon loss classes - - Visualization fixes - - Gluon RNN rework, including hybridization - - Exposed GPU memory info to the Perl level. - -1.33 Thu Oct 4 13:25:56 PDT 2018 - - Added randn function. - - Internal SELU function on C++ layer. - - Predict now accepts ndarray as well. - - Gluon: Only warn when the blocks are unregistered. - - Gluon: Better sparse support. - - Gluon: Improved block summary. - - Added validation docs for MXNet installation for Perl. - - Flexible Perl env for examples. - - Gluon: Custom dtypes for the symbol block - - Separate eval metric for the epoch level. - -1.32 Sun Aug 5 14:25:31 PDT 2018 - - Several new metric classes - - Expanded documentation - - Bugfixes. - -1.31 Tue Jul 10 21:19:13 PDT 2018 - - Memory leak fix for Gluon API - - Added summary function for Gluon models - - Improved graphviz visualizations - - Added artistic style transfer example for Gluon API - -1.3 Tue Jun 26 20:57:40 PDT 2018 - - Major Gluon update towards parity with Python's API. - - Miscellaneous bugfixes and improvements. - - New Engine API. - - Module::reshape moved to C++ backend. - - Examples were updated to work on multi-gpu boxes - -1.23 Thu Apr 19 15:38:10 PDT 2018 - - Support for image operations on symbols and ndarrays. - -1.22 Sat Apr 14 17:51:55 PDT 2018 - - Parity with Python Gluon Loss classes - -1.21 Sun Apr 8 12:08:44 PDT 2018 - - Support for linear algebra operations on symbols and ndarrays. - -1.2 Sun Mar 4 16:29:19 PST 2018 - - Support for sparse tensors - -1.1 Sun Oct 1 10:19:08 PDT 2017 - - Major update, added support for new imperative MXNet interface Gluon and realtime GPU kernels callable from Perl space. - - Bugfixes for distributed training. - - Miscellaneous fixes and performance improvements. - -1.0102 Sun Aug 6 16:55:08 PDT 2017 - - bugfixes in Image.pm, updated tests, added PearsonCorrelation metric, added Convolutional RNN modules.
- -1.0101 Sun Jul 2 17:16:01 PDT 2017 - - reworked CachedOp, two new optimizers, auto module reshape, using strings to index the kvstore. - -1.01 Sat Jun 10 23:57:27 PDT 2017 - - sync with Python. - -0.9507 Thu May 11 17:04:44 PDT 2017 - - added AutoGrad, bugfixes. - -0.9506 Sat Apr 29 20:26:50 PDT 2017 - - Ftrl optimizer, new tests, bugfixes. - -0.9505 Sun Apr 23 21:26:04 PDT 2017 - - Perplexity bugfix, two new examples. - -0.9504 Wed Apr 19 18:59:45 PDT 2017 - - LR Scheduler bugfix. - -0.9503 Wed Apr 19 13:33:57 PDT 2017 - - added an example of generation of inferred text via pre-trained RNN. - - bugfixes/tests. - -0.9502 Sat Apr 15 17:18:21 PDT 2017 - - optimizations/bugfixes. - -0.9501 Sat Apr 8 13:01:00 PDT 2017 - - ZoneoutCell, nd inferred reshape and moveaxis, cosmetic changes to Image iter, - pod reworked to be readable via metacpan. - -0.95 Sun Mar 26 17:42:02 PDT 2017 - - docs, bugfixes, tests in order to be visible on https://mxnet.io - -0.03 Tue Feb 14 07:28:11 PST 2017 - - sync up with current state of the Python interface. - - high level RNN support. - -0.02 Tue Feb 14 07:28:11 PST 2017 - - prepared for inclusion into the mxnet code repository. - diff --git a/perl-package/AI-MXNet/MANIFEST b/perl-package/AI-MXNet/MANIFEST deleted file mode 100644 index fef158689f2a..000000000000 --- a/perl-package/AI-MXNet/MANIFEST +++ /dev/null @@ -1,118 +0,0 @@ -Changes -examples/gluon/dcgan.pl -examples/gluon/mnist.pl -examples/gluon/style_transfer/get_data.sh -examples/gluon/style_transfer/net.pl -examples/gluon/style_transfer/README.md -examples/gluon/style_transfer/style_transfer.pl -examples/gluon/style_transfer/utils.pl -lib/AI/MXNet.pm -lib/AI/MXNet/AutoGrad.pm -lib/AI/MXNet/AutoLoad.pm -lib/AI/MXNet/Base.pm -lib/AI/MXNet/CachedOp.pm -lib/AI/MXNet/Callback.pm -lib/AI/MXNet/Context.pm -lib/AI/MXNet/Contrib.pm -lib/AI/MXNet/Contrib/NDArray.pm -lib/AI/MXNet/Contrib/Symbol.pm -lib/AI/MXNet/CudaModule.pm -lib/AI/MXNet/Engine.pm -lib/AI/MXNet/Executor.pm -lib/AI/MXNet/Executor/Group.pm -lib/AI/MXNet/Function/Parameters.pm -lib/AI/MXNet/Gluon.pm -lib/AI/MXNet/Gluon/Block.pm -lib/AI/MXNet/Gluon/Data.pm -lib/AI/MXNet/Gluon/Data/Loader.pm -lib/AI/MXNet/Gluon/Data/Sampler.pm -lib/AI/MXNet/Gluon/Data/Set.pm -lib/AI/MXNet/Gluon/Data/Vision.pm -lib/AI/MXNet/Gluon/Loss.pm -lib/AI/MXNet/Gluon/Mouse.pm -lib/AI/MXNet/Gluon/NN.pm -lib/AI/MXNet/Gluon/NN/Activation.pm -lib/AI/MXNet/Gluon/NN/BasicLayers.pm -lib/AI/MXNet/Gluon/NN/ConvLayers.pm -lib/AI/MXNet/Gluon/Parameter.pm -lib/AI/MXNet/Gluon/RNN.pm -lib/AI/MXNet/Gluon/RNN/Cell.pm -lib/AI/MXNet/Gluon/RNN/Layer.pm -lib/AI/MXNet/Gluon/Trainer.pm -lib/AI/MXNet/Gluon/Utils.pm -lib/AI/MXNet/Image.pm -lib/AI/MXNet/Image/NDArray.pm -lib/AI/MXNet/Image/Symbol.pm -lib/AI/MXNet/Initializer.pm -lib/AI/MXNet/IO.pm -lib/AI/MXNet/KVStore.pm -lib/AI/MXNet/KVStoreServer.pm -lib/AI/MXNet/LinAlg.pm -lib/AI/MXNet/LinAlg/NDArray.pm -lib/AI/MXNet/LinAlg/Symbol.pm -lib/AI/MXNet/Logging.pm -lib/AI/MXNet/LRScheduler.pm -lib/AI/MXNet/Metric.pm -lib/AI/MXNet/Module.pm -lib/AI/MXNet/Module/Base.pm -lib/AI/MXNet/Module/Bucketing.pm -lib/AI/MXNet/Monitor.pm -lib/AI/MXNet/NDArray.pm -lib/AI/MXNet/NDArray/Base.pm -lib/AI/MXNet/NDArray/Doc.pm -lib/AI/MXNet/NDArray/Slice.pm -lib/AI/MXNet/NDArray/Sparse.pm -lib/AI/MXNet/NS.pm -lib/AI/MXNet/Optimizer.pm -lib/AI/MXNet/Profiler.pm -lib/AI/MXNet/Random.pm -lib/AI/MXNet/RecordIO.pm -lib/AI/MXNet/RNN.pm -lib/AI/MXNet/RNN/Cell.pm -lib/AI/MXNet/RNN/IO.pm -lib/AI/MXNet/RunTime.pm -lib/AI/MXNet/Symbol.pm -lib/AI/MXNet/Symbol/AttrScope.pm
-lib/AI/MXNet/Symbol/Base.pm -lib/AI/MXNet/Symbol/Doc.pm -lib/AI/MXNet/Symbol/NameManager.pm -lib/AI/MXNet/Symbol/Random.pm -lib/AI/MXNet/Symbol/Sparse.pm -lib/AI/MXNet/TestUtils.pm -lib/AI/MXNet/Types.pm -lib/AI/MXNet/Util/Printable.pm -lib/AI/MXNet/Visualization.pm -Makefile.PL -MANIFEST This list of files -META.json -META.yml -README -t/AI-MXNet.t -t/test_attr.t -t/test_autograd.t -t/test_base.t -t/test_cuda_module.t -t/test_engine.t -t/test_executor.t -t/test_gluon.t -t/test_gluon_data.t -t/test_gluon_rnn.t -t/test_gluon_trainer.t -t/test_infer_shape.t -t/test_init.t -t/test_io.t -t/test_io_image.t -t/test_kvstore.t -t/test_loss.t -t/test_metric.t -t/test_model_parallel.t -t/test_module.t -t/test_ndarray.t -t/test_optimizers.t -t/test_random.t -t/test_recordio.t -t/test_rnn.t -t/test_runtime.t -t/test_sparse_ndarray.t -t/test_symbol.t -t/test_viz.t diff --git a/perl-package/AI-MXNet/META.json b/perl-package/AI-MXNet/META.json deleted file mode 100644 index 02357d478095..000000000000 --- a/perl-package/AI-MXNet/META.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "abstract" : "Perl interface to MXNet machine learning library", - "author" : [ - "Sergey Kolychev " - ], - "dynamic_config" : 0, - "generated_by" : "ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240", - "license" : [ - "apache_2_0" - ], - "meta-spec" : { - "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", - "version" : "2" - }, - "name" : "AI-MXNet", - "no_index" : { - "directory" : [ - "t", - "inc" - ] - }, - "prereqs" : { - "build" : { - "requires" : {} - }, - "configure" : { - "requires" : { - "ExtUtils::MakeMaker" : "6.30" - } - }, - "runtime" : { - "requires" : { - "AI::MXNetCAPI" : "1.5", - "AI::NNVMCAPI" : "1.4", - "Function::Parameters" : "1.0705", - "Hash::Ordered" : "0.012", - "GraphViz" : "2.14", - "Mouse" : "v2.1.0", - "PDL" : "2.007", - "PDL::CCS" : "1.23.4", - "Archive::Tar": "0", - "Digest::SHA": "0", - "HTTP::Tiny": "0", - "IO::Zlib": "0", - "JSON::PP": "0", - "parent": "0" - } - }, - "test" : { - "requires" : {} - } - }, - "release_status" : "stable", - "version" : "1.5" -} diff --git a/perl-package/AI-MXNet/META.yml b/perl-package/AI-MXNet/META.yml deleted file mode 100644 index b06b331ec4ae..000000000000 --- a/perl-package/AI-MXNet/META.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- ---- -abstract: 'Perl interface to MXNet machine learning library' -author: - - 'Sergey Kolychev ' -build_requires: {} -configure_requires: - ExtUtils::MakeMaker: '6.30' -dynamic_config: 0 -generated_by: 'ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240' -license: apache -meta-spec: - url: http://module-build.sourceforge.net/META-spec-v1.4.html - version: '1.4' -name: AI-MXNet -no_index: - directory: - - t - - inc -requires: - AI::MXNetCAPI: '1.5' - AI::NNVMCAPI: '1.4' - Function::Parameters: '1.0705' - Hash::Ordered: '0.012' - GraphViz: '2.14' - Mouse: v2.1.0 - PDL: '2.007' - PDL::CCS: '1.23.4' - Archive::Tar: '0' - Digest::SHA: '0' - HTTP::Tiny: '0' - IO::Zlib: '0' - JSON::PP: '0' - parent: '0' -version: '1.5' diff --git a/perl-package/AI-MXNet/Makefile.PL b/perl-package/AI-MXNet/Makefile.PL deleted file mode 100644 index 4a9016234ad6..000000000000 --- a/perl-package/AI-MXNet/Makefile.PL +++ /dev/null @@ -1,80 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; - -use 5.014000; - -use ExtUtils::MakeMaker 6.30; - - - -my %WriteMakefileArgs = ( - "ABSTRACT" => "Perl interface to MXNet machine learning library", - "AUTHOR" => "Sergey Kolychev ", - "BUILD_REQUIRES" => {}, - "CONFIGURE_REQUIRES" => { - "ExtUtils::MakeMaker" => "6.30" - }, - "DISTNAME" => "AI-MXNet", - "EXE_FILES" => [], - "LICENSE" => "apache_2_0", - "NAME" => "AI::MXNet", - "PREREQ_PM" => { - "AI::MXNetCAPI" => "1.5", - "AI::NNVMCAPI" => "1.3", - "Function::Parameters" => "1.0705", - "Hash::Ordered" => "0.012", - "Mouse" => "v2.1.0", - "PDL" => "2.007", - "PDL::CCS" => "1.23.4", - "GraphViz" => "2.14" - }, - "TEST_REQUIRES" => {}, - "VERSION" => "1.5", - "test" => { - "TESTS" => "t/*.t" - } -); - - -my %FallbackPrereqs = ( - "AI::MXNetCAPI" => "1.33", - "AI::NNVMCAPI" => "1.3", - "Function::Parameters" => "1.0705", - "Hash::Ordered" => "0.012", - "Mouse" => "v2.1.0", - "PDL" => "2.007", - "PDL::CCS" => "1.23.4", - "GraphViz" => "2.14" -); - - -unless ( eval { ExtUtils::MakeMaker->VERSION(6.63_03) } ) { - delete $WriteMakefileArgs{TEST_REQUIRES}; - delete $WriteMakefileArgs{BUILD_REQUIRES}; - $WriteMakefileArgs{PREREQ_PM} = \%FallbackPrereqs; -} - -delete $WriteMakefileArgs{CONFIGURE_REQUIRES} - unless eval { ExtUtils::MakeMaker->VERSION(6.52) }; - -WriteMakefile(%WriteMakefileArgs); - - - diff --git a/perl-package/AI-MXNet/README b/perl-package/AI-MXNet/README deleted file mode 100644 index 57d9e4a7b7e8..000000000000 --- a/perl-package/AI-MXNet/README +++ /dev/null @@ -1,7 +0,0 @@ -This archive contains the distribution AI-MXNet, -version 1.5: - - Perl interface to MXNet machine learning library - -This library is licensed under Apache 2.0 license https://www.apache.org/licenses/LICENSE-2.0 - diff --git 
a/perl-package/AI-MXNet/examples/get_sherlockholmes_data.sh b/perl-package/AI-MXNet/examples/get_sherlockholmes_data.sh deleted file mode 100755 index 43c8669e003c..000000000000 --- a/perl-package/AI-MXNet/examples/get_sherlockholmes_data.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements.  See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership.  The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License.  You may obtain a copy of the License at -# -#   http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied.  See the License for the -# specific language governing permissions and limitations -# under the License. - -echo -echo "NOTE: To continue, you need to review the licensing of the data sets used by this script" -echo "See https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License for the licensing" -read -p "Please confirm you have reviewed the licensing [Y/n]:" -n 1 -r -echo - -if [ "$REPLY" != "Y" ] -then - echo "License was not reviewed, aborting script." - exit 1 -fi - -RNN_DIR=$(cd `dirname $0`; pwd) -DATA_DIR="${RNN_DIR}/data/" - -if [[ ! -d "${DATA_DIR}" ]]; then - echo "${DATA_DIR} doesn't exist, will create one"; - mkdir -p ${DATA_DIR} -fi - -wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.train.txt; -wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.valid.txt; -wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.test.txt; -wget -P ${DATA_DIR} https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt; diff --git a/perl-package/AI-MXNet/examples/gluon/dcgan.pl b/perl-package/AI-MXNet/examples/gluon/dcgan.pl deleted file mode 100755 index dd5294763cb2..000000000000 --- a/perl-package/AI-MXNet/examples/gluon/dcgan.pl +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/env perl -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements.  See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership.  The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License.  You may obtain a copy of the License at -# -#   http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied.  See the License for the -# specific language governing permissions and limitations -# under the License.
- -use strict; -use warnings; -use AI::MXNet qw(mx); -use AI::MXNet::Gluon qw(gluon); -use AI::MXNet::AutoGrad qw(autograd); -use AI::MXNet::Gluon::NN qw(nn); -use AI::MXNet::Base; -use Getopt::Long qw(HelpMessage); -use Time::HiRes qw(time); -use PDL::IO::Pic; - -my $batch_size = 64; -my $nz = 100; -my $ngf = 64; -my $ndf = 64; -my $nepoch = 25; -my $lr = 0.0002; -my $beta1 = 0.5; -my $nc = 3; -## change to my $ctx = mx->cpu(); if needed -my $ctx = mx->gpu(); - -my $train_data = gluon->data->DataLoader( - gluon->data->vision->MNIST('./data', train=>1, transform => \&transformer), - batch_size=>$batch_size, shuffle=>1, last_batch=>'discard' -); - -my $val_data = gluon->data->DataLoader( - gluon->data->vision->MNIST('./data', train=>0, transform=> \&transformer), - batch_size=>$batch_size, shuffle=>0 -); - -sub transformer -{ - my ($data, $label) = @_; - # resize to 64x64 - $data = mx->image->imresize($data, 64, 64); - $data = $data->reshape([1, 64, 64]); - # normalize to [-1, 1] - $data = $data->astype('float32')/128 - 1; - # if image is greyscale, repeat 3 times to get RGB image. - if($data->shape->[0] == 1) - { - $data = mx->nd->tile($data, [3, 1, 1]); - } - return ($data, $label); -} - -sub visualize -{ - my ($data, $fake, $iter) = @_; - mkdir "data_images"; - mkdir "data_images/$iter"; - mkdir "fake_images"; - mkdir "fake_images/$iter"; - for my $i (0..$batch_size-1) - { - my $d = ((pdl_shuffle($data->at($i)->at(0)->aspdl, [reverse(0..63)]) + 1)*128)->byte; - my $f = ((pdl_shuffle($fake->at($i)->at(0)->aspdl, [reverse(0..63)]) + 1)*128)->byte; - $d->wpic("data_images/$iter/$i.jpg"); - $f->wpic("fake_images/$iter/$i.jpg"); - } -} - -# build the generator -my $netG = nn->Sequential(); -$netG->name_scope(sub { - # input is Z, going into a convolution - $netG->add(nn->Conv2DTranspose($ngf * 8, 4, 1, 0, use_bias=>0)); - $netG->add(nn->BatchNorm()); - $netG->add(nn->Activation('relu')); - # state size-> ($ngf*8) x 4 x 4 - $netG->add(nn->Conv2DTranspose($ngf * 4, 4, 2, 1, use_bias=>0)); - $netG->add(nn->BatchNorm()); - $netG->add(nn->Activation('relu')); - # state size-> ($ngf*4) x 8 x 8 - $netG->add(nn->Conv2DTranspose($ngf * 2, 4, 2, 1, use_bias=>0)); - $netG->add(nn->BatchNorm()); - $netG->add(nn->Activation('relu')); - # state size-> ($ngf*2) x 16 x 16 - $netG->add(nn->Conv2DTranspose($ngf, 4, 2, 1, use_bias=>0)); - $netG->add(nn->BatchNorm()); - $netG->add(nn->Activation('relu')); - # state size-> ($ngf) x 32 x 32 - $netG->add(nn->Conv2DTranspose($nc, 4, 2, 1, use_bias=>0)); - $netG->add(nn->Activation('tanh')); - # state size-> (nc) x 64 x 64 -}); - -# build the discriminator -my $netD = nn->Sequential(); -$netD->name_scope(sub { - # input is (nc) x 64 x 64 - $netD->add(nn->Conv2D($ndf, 4, 2, 1, use_bias=>0)); - $netD->add(nn->LeakyReLU(0.2)); - # state size-> ($ndf) x 32 x 32 - $netD->add(nn->Conv2D($ndf * 2, 4, 2, 1, use_bias=>0)); - $netD->add(nn->BatchNorm()); - $netD->add(nn->LeakyReLU(0.2)); - # state size-> ($ndf*2) x 16 x 16 - $netD->add(nn->Conv2D($ndf * 4, 4, 2, 1, use_bias=>0)); - $netD->add(nn->BatchNorm()); - $netD->add(nn->LeakyReLU(0.2)); - # state size-> ($ndf*4) x 8 x 8 - $netD->add(nn->Conv2D($ndf * 8, 4, 2, 1, use_bias=>0)); - $netD->add(nn->BatchNorm()); - $netD->add(nn->LeakyReLU(0.2)); - # state size-> ($ndf*8) x 4 x 4 - $netD->add(nn->Conv2D(2, 4, 1, 0, use_bias=>0)); -}); - -# loss -my $loss = gluon->loss->SoftmaxCrossEntropyLoss(); - -# initialize the generator and the discriminator -$netG->initialize(mx->init->Normal(0.02), ctx=>$ctx);
-$netD->initialize(mx->init->Normal(0.02), ctx=>$ctx); - -# trainer for the generator and the discriminator -my $trainerG = gluon->Trainer($netG->collect_params(), 'adam', {learning_rate => $lr, beta1 => $beta1}); -my $trainerD = gluon->Trainer($netD->collect_params(), 'adam', {learning_rate => $lr, beta1 => $beta1}); -# ============printing============== -my $real_label = mx->nd->ones([$batch_size], ctx=>$ctx); -my $fake_label = mx->nd->zeros([$batch_size], ctx=>$ctx); - -my $metric = mx->metric->Accuracy(); -print "Training...\n"; - -my $iter = 0; -for my $epoch (0..$nepoch-1) -{ - my $tic = time; - my $btic = time; - my $fake; my $data; - while(defined(my $d = <$train_data>)) - { - $data = $d->[0]; - ############################ - # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z))) - ########################### - # train with real_t - $data = $data->as_in_context($ctx); - my $noise = mx->nd->random->normal(0, 1, shape=>[$batch_size, $nz, 1, 1], ctx=>$ctx); - - my ($output, $errD, $errG); - autograd->record(sub { - $output = $netD->($data); - $output = $output->reshape([$batch_size, 2]); - my $errD_real = $loss->($output, $real_label); - $metric->update([$real_label], [$output]); - - $fake = $netG->($noise); - $output = $netD->($fake->detach()); - $output = $output->reshape([$batch_size, 2]); - my $errD_fake = $loss->($output, $fake_label); - $errD = $errD_real + $errD_fake; - $errD->backward(); - $metric->update([$fake_label], [$output]); - }); - $trainerD->step($batch_size); - - ############################ - # (2) Update G network: maximize log(D(G(z))) - ########################### - autograd->record(sub { - $output = $netD->($fake); - $output = $output->reshape([-1, 2]); - $errG = $loss->($output, $real_label); - $errG->backward(); - }); - - $trainerG->step($batch_size); - my ($name, $acc) = $metric->get(); - if(not $iter%100) - { - AI::MXNet::Logging->info("speed: %.2f samples/s", $batch_size / (time-$btic)); - AI::MXNet::Logging->info("discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d", - mx->nd->mean($errD)->asscalar(), mx->nd->mean($errG)->asscalar(), $acc, $iter, $epoch); - } - $iter++; - $btic = time; - } - my ($name, $acc) = $metric->get(); - $metric->reset(); - visualize($data, $fake, $epoch); - AI::MXNet::Logging->info("\nbinary training acc at epoch %d: %s=%f", $epoch, $name, $acc); - AI::MXNet::Logging->info("time: %f", time - $tic); -} diff --git a/perl-package/AI-MXNet/examples/gluon/mnist.pl b/perl-package/AI-MXNet/examples/gluon/mnist.pl deleted file mode 100755 index 1fb2d897250f..000000000000 --- a/perl-package/AI-MXNet/examples/gluon/mnist.pl +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env perl - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
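A note on the DCGAN training loop above: the discriminator step scores $fake->detach() so that no gradient flows back into the generator, while the generator step reuses $fake undetached with real labels, implementing the usual maximize-log(D(G(z))) objective. The detach() semantics in isolation (a minimal sketch):

    use AI::MXNet qw(mx);
    use AI::MXNet::AutoGrad qw(autograd);

    my $w = mx->nd->ones([1]);
    $w->attach_grad;
    autograd->record(sub {
        my $y = $w * 2;
        my $z = $y->detach * 3;   # detach cuts the recorded graph at $y
        $z->backward;
    });
    print $w->grad->asscalar, "\n";   # 0: no gradient reached $w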
- -use strict; -use warnings; -use AI::MXNet qw(mx); -use AI::MXNet::Gluon qw(gluon); -use AI::MXNet::AutoGrad qw(autograd); -use AI::MXNet::Gluon::NN qw(nn); -use AI::MXNet::Base; -use Getopt::Long qw(HelpMessage); - -GetOptions( - 'lr=f' => \(my $lr = 0.1), - 'log-interval=i' => \(my $log_interval = 100), - 'momentum=f' => \(my $momentum = 0.9), - 'hybridize=i' => \(my $hybridize = 0 ), - 'cuda=i' => \(my $cuda = 0 ), - 'load_params=i' => \(my $load_params = 0 ), - 'batch-size=i' => \(my $batch_size = 100), - 'epochs=i' => \(my $epochs = 1 ), - 'help' => sub { HelpMessage(0) }, -) or HelpMessage(1); - - -# define network - -my $net = nn->Sequential(); -$net->name_scope(sub { - $net->add(nn->Dense(128, activation=>'relu')); - $net->add(nn->Dense(64, activation=>'relu')); - $net->add(nn->Dense(10)); -}); -$net->hybridize() if $hybridize; -$net->load_parameters('mnist.params') if $load_params; -# data - -sub transformer -{ - my ($data, $label) = @_; - $data = $data->reshape([-1])->astype('float32')/255; - return ($data, $label); -} - -my $train_data = gluon->data->DataLoader( - gluon->data->vision->MNIST('./data', train=>1, transform => \&transformer), - batch_size=>$batch_size, shuffle=>1, last_batch=>'discard' -); - -my $val_data = gluon->data->DataLoader( - gluon->data->vision->MNIST('./data', train=>0, transform=> \&transformer), - batch_size=>$batch_size, shuffle=>0 -); - -# train - -sub test -{ - my $ctx = shift; - my $metric = mx->metric->Accuracy(); - while(defined(my $d = <$val_data>)) - { - my ($data, $label) = @$d; - $data = $data->as_in_context($ctx); - $label = $label->as_in_context($ctx); - my $output = $net->($data); - $metric->update([$label], [$output]); - } - return $metric->get; -} - -sub train -{ - my ($epochs, $ctx) = @_; - # Collect all parameters from net and its children, then initialize them. - $net->initialize(mx->init->Xavier(magnitude=>2.24), ctx=>$ctx); - # Trainer is for updating parameters with gradient. - my $trainer = gluon->Trainer($net->collect_params(), 'sgd', { learning_rate => $lr, momentum => $momentum }); - my $metric = mx->metric->Accuracy(); - my $loss = gluon->loss->SoftmaxCrossEntropyLoss(); - - for my $epoch (0..$epochs-1) - { - # reset data iterator and metric at beginning of epoch. - $metric->reset(); - enumerate(sub { - my ($i, $d) = @_; - my ($data, $label) = @$d; - $data = $data->as_in_context($ctx); - $label = $label->as_in_context($ctx); - # Start recording computation graph with record() section. - # Recorded graphs can then be differentiated with backward. - my $output; - autograd->record(sub { - $output = $net->($data); - my $L = $loss->($output, $label); - $L->backward; - }); - # take a gradient step with batch_size equal to $data->shape->[0] - $trainer->step($data->shape->[0]); - # update the metric last. - $metric->update([$label], [$output]); - - if($i % $log_interval == 0 and $i > 0) - { - my ($name, $acc) = $metric->get(); - print "[Epoch $epoch Batch $i] Training: $name=$acc\n"; - } - }, \@{ $train_data }); - - my ($name, $acc) = $metric->get(); - print "[Epoch $epoch] Training: $name=$acc\n"; - - my ($val_name, $val_acc) = test($ctx); - print "[Epoch $epoch] Validation: $val_name=$val_acc\n" - } - $net->save_parameters('mnist.params'); -} - -train($epochs, $cuda ?
mx->gpu(0) : mx->cpu); diff --git a/perl-package/AI-MXNet/examples/gluon/style_transfer/README.md b/perl-package/AI-MXNet/examples/gluon/style_transfer/README.md deleted file mode 100644 index 9be9b8ad023e..000000000000 --- a/perl-package/AI-MXNet/examples/gluon/style_transfer/README.md +++ /dev/null @@ -1,52 +0,0 @@ - - - - - - - - - - - - - - - - - -This directory provides an AI::MXNet implementation of MSG-Net real-time style transfer, https://arxiv.org/abs/1703.06953 - -### Stylize Images Using Pre-trained MSG-Net -Download the pre-trained model: - - ./get_data.sh - -Test the model: - - ./style_transfer.pl --content-image <path or url> --style-image <path or url> --content-size 512 - -More options: - - * --content-image: path or url to content image you want to stylize. - * --style-image: path or url to style image. - * --model: path to the pre-trained model to be used for stylizing the image if you use your custom model - * --output-image: path for saving the output image, default is 'out.jpg' - * --content-size: the output image size, default is 512 pixels for the shorter side, - decrease the size if your computer is low on RAM and the script fails. - -Pembroke Welsh Corgi Kyuubi is enjoying Total Solar Eclipse of Aug 2017 in Salem, OR -Style image: Kazimir Malevich, Black Square -Style image: random ornate stone wall image -Style image: Salvador Dali, The Enigma of Desire -Style image: Vincent van Gogh, The Starry Night diff --git a/perl-package/AI-MXNet/examples/gluon/style_transfer/get_data.sh b/perl-package/AI-MXNet/examples/gluon/style_transfer/get_data.sh deleted file mode 100755 index 73465f1a3e7f..000000000000 --- a/perl-package/AI-MXNet/examples/gluon/style_transfer/get_data.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements.  See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership.  The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License.  You may obtain a copy of the License at -# -#   http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied.  See the License for the -# specific language governing permissions and limitations -# under the License. - -DIR=$(cd `dirname $0`; pwd) -DATA_DIR="${DIR}/data/" - -if [[ ! -d "${DATA_DIR}" ]]; then - echo "${DATA_DIR} doesn't exist, will create one"; - mkdir -p ${DATA_DIR} -fi - -wget -P ${DATA_DIR} https://apache-mxnet.s3-accelerate.amazonaws.com/gluon/models/msgnet_21styles-2cb88353.zip -cd ${DATA_DIR} -unzip msgnet_21styles-2cb88353.zip -rm msgnet_21styles-2cb88353.zip diff --git a/perl-package/AI-MXNet/examples/gluon/style_transfer/net.pl b/perl-package/AI-MXNet/examples/gluon/style_transfer/net.pl deleted file mode 100644 index 1e98502ab126..000000000000 --- a/perl-package/AI-MXNet/examples/gluon/style_transfer/net.pl +++ /dev/null @@ -1,325 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements.  See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership.
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License.  You may obtain a copy of the License at -# -#   http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied.  See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use AI::MXNet::Function::Parameters; - -package Bottleneck { - # Pre-activation residual block - # Identity Mappings in Deep Residual Networks - # ref https://arxiv.org/abs/1603.05027 - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - has ['inplanes', - 'planes'] => (is => 'rw', required => 1); - has 'stride' => (is => 'rw', default => 1); - has 'downsample' => (is => 'rw'); - has 'norm_layer' => (is => 'rw', default => 'AI::MXNet::Gluon::NN::InstanceNorm'); - method python_constructor_arguments(){ [qw/inplanes planes stride downsample norm_layer/] } - sub BUILD - { - my $self = shift; - $self->expansion(4); - if(defined $self->downsample) - { - $self->residual_layer( - nn->Conv2D( - in_channels=>$self->inplanes, - channels=>$self->planes * $self->expansion, - kernel_size=>1, strides=>[$self->stride, $self->stride] - ) - ); - } - $self->conv_block(nn->Sequential()); - $self->conv_block->name_scope(sub { - $self->conv_block->add( - $self->norm_layer->new(in_channels=>$self->inplanes) - ); - $self->conv_block->add(nn->Activation('relu')); - $self->conv_block->add( - nn->Conv2D(in_channels=>$self->inplanes, - channels=>$self->planes, - kernel_size=>1 - ) - ); - $self->conv_block->add($self->norm_layer->new(in_channels=>$self->planes)); - $self->conv_block->add(nn->Activation('relu')); - $self->conv_block->add( - ConvLayer->new( - $self->planes, $self->planes, kernel_size=>3, - stride=>$self->stride - ) - ); - $self->conv_block->add($self->norm_layer->new(in_channels=>$self->planes)); - $self->conv_block->add(nn->Activation('relu')); - $self->conv_block->add( - nn->Conv2D( - in_channels=>$self->planes, - channels=>$self->planes * $self->expansion, - kernel_size=>1 - ) - ); - }); - } - - method forward($x) - { - my $residual; - if(defined $self->downsample) - { - $residual = $self->residual_layer->($x); - } - else - { - $residual = $x; - } - return $residual + $self->conv_block->($x); - } -} - -package UpBottleneck { - # Up-sample residual block (from MSG-Net paper) - # Enables passing identity all the way through the generator - # ref https://arxiv.org/abs/1703.06953 - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - has ['inplanes', - 'planes'] => (is => 'rw', required => 1); - has 'stride' => (is => 'rw', default => 2); - has 'norm_layer' => (is => 'rw', default => 'AI::MXNet::Gluon::NN::InstanceNorm'); - method python_constructor_arguments(){ [qw/inplanes planes stride norm_layer/] } - sub BUILD - { - my $self = shift; - $self->expansion(4); - $self->residual_layer( - UpsampleConvLayer->new( - $self->inplanes, - $self->planes * $self->expansion, - kernel_size=>1, stride=>1, - upsample=>$self->stride - ) - ); - $self->conv_block(nn->Sequential()); - $self->conv_block->name_scope(sub { - $self->conv_block->add($self->norm_layer->new(in_channels=>$self->inplanes)); - $self->conv_block->add(nn->Activation('relu')); - $self->conv_block->add( - nn->Conv2D(
in_channels=>$self->inplanes, - channels=>$self->planes, - kernel_size=>1 - ) - ); - $self->conv_block->add($self->norm_layer->new(in_channels=>$self->planes)); - $self->conv_block->add(nn->Activation('relu')); - $self->conv_block->add( - UpsampleConvLayer->new( - $self->planes, $self->planes, - kernel_size=>3, stride=>1, - upsample=>$self->stride - ) - ); - $self->conv_block->add($self->norm_layer->new(in_channels=>$self->planes)); - $self->conv_block->add(nn->Activation('relu')); - $self->conv_block->add( - nn->Conv2D( - in_channels=>$self->planes, - channels=>$self->planes * $self->expansion, - kernel_size=>1 - ) - ); - }); - } - - method forward($x) - { - return $self->residual_layer->($x) + $self->conv_block->($x); - } -} - -package ConvLayer { - use AI::MXNet::Gluon::Mouse; - use POSIX qw(floor); - extends 'AI::MXNet::Gluon::Block'; - has [qw/in_channels out_channels kernel_size stride/] => (is => 'rw'); - method python_constructor_arguments(){ [qw/in_channels out_channels kernel_size stride/] } - sub BUILD - { - my $self = shift; - $self->pad(nn->ReflectionPad2D(floor($self->kernel_size/2))); - $self->conv2d( - nn->Conv2D( - in_channels=>$self->in_channels, - channels=>$self->out_channels, - kernel_size=>$self->kernel_size, - strides=>[$self->stride, $self->stride], - padding=>0 - ) - ); - } - - method forward($x) - { - $x = $self->pad->($x); - my $out = $self->conv2d->($x); - return $out; - } -} - - -package UpsampleConvLayer { - # UpsampleConvLayer - # Upsamples the input and then does a convolution. This method gives better results - # compared to ConvTranspose2d. - # ref: http://distill.pub/2016/deconv-checkerboard/ - use AI::MXNet::Gluon::Mouse; - use POSIX qw(floor); - extends 'AI::MXNet::Gluon::Block'; - has [qw/in_channels out_channels kernel_size stride upsample/] => (is => 'rw'); - method python_constructor_arguments(){ [qw/in_channels out_channels kernel_size stride upsample/] } - sub BUILD - { - my $self = shift; - $self->conv2d( - nn->Conv2D( - in_channels=>$self->in_channels, - channels=>$self->out_channels, - kernel_size=>$self->kernel_size, - strides=>[$self->stride, $self->stride], - padding=>floor($self->kernel_size/2) - ) - ); - } - - method forward($x) - { - if($self->upsample) - { - $x = nd->UpSampling($x, scale=>$self->upsample, sample_type=>'nearest'); - } - my $out = $self->conv2d->($x); - return $out; - } -} - -package GramMatrix { - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - method forward($x) - { - my ($b, $ch, $h, $w) = @{ $x->shape }; - my $features = $x->reshape([$b, $ch, $w * $h]); - my $gram = nd->batch_dot($features, $features, transpose_b=>1) / ($ch * $h * $w); - return $gram; - } -}; - -package Inspiration { - # Inspiration Layer (from MSG-Net paper) - # tuning the featuremap with target Gram Matrix - # ref https://arxiv.org/abs/1703.06953 - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - has 'C' => (is => 'rw', required => 1); - has 'B' => (is => 'rw', default => 1); - method python_constructor_arguments(){ [qw/C B/] } - sub BUILD - { - my $self = shift; - $self->weight($self->params->get('weight', shape=>[1,$self->C,$self->C], - init=>mx->initializer->Uniform(), - allow_deferred_init=>1)); - $self->gram(nd->random->uniform(shape=>[$self->B, $self->C, $self->C])); - } - - method set_target($target) - { - $self->gram($target); - } - - method forward($x) - { - $self->P(nd->batch_dot($self->weight->data->broadcast_to($self->gram->shape), $self->gram)); - return nd->batch_dot( - 
nd->SwapAxis($self->P,1,2)->broadcast_to([$x->shape->[0], $self->C, $self->C]), - $x->reshape([0, 0, $x->shape->[2]*$x->shape->[3]]) - )->reshape($x->shape); - } -} - -package Net { - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - has 'input_nc' => (is => 'rw', default => 3); - has 'output_nc' => (is => 'rw', default => 3); - has 'ngf' => (is => 'rw', default => 64); - has 'norm_layer' => (is => 'rw', default => 'AI::MXNet::Gluon::NN::InstanceNorm'); - has 'n_blocks' => (is => 'rw', default => 6); - has 'gpu_ids' => (is => 'rw', default => sub { [] }); - method python_constructor_arguments(){ [qw/input_nc output_nc ngf norm_layer n_blocks gpu_ids/] } - sub BUILD - { - my $self = shift; - $self->gram(GramMatrix->new); - - my $block = 'Bottleneck'; - my $upblock = 'UpBottleneck'; - my $expansion = 4; - - $self->name_scope(sub { - $self->model1(nn->Sequential()); - $self->ins(Inspiration->new($self->ngf*$expansion)); - $self->model(nn->Sequential()); - - $self->model1->add(ConvLayer->new($self->input_nc, 64, kernel_size=>7, stride=>1)); - $self->model1->add($self->norm_layer->new(in_channels=>64)); - $self->model1->add(nn->Activation('relu')); - $self->model1->add($block->new(64, 32, 2, 1, $self->norm_layer)); - $self->model1->add($block->new(32*$expansion, $self->ngf, 2, 1, $self->norm_layer)); - - $self->model->add($self->model1); - $self->model->add($self->ins); - - for(1..$self->n_blocks) - { - $self->model->add($block->new($self->ngf*$expansion, $self->ngf, 1, undef, $self->norm_layer)); - } - - $self->model->add($upblock->new($self->ngf*$expansion, 32, 2, $self->norm_layer)); - $self->model->add($upblock->new(32*$expansion, 16, 2, $self->norm_layer)); - $self->model->add($self->norm_layer->new(in_channels=>16*$expansion)); - $self->model->add(nn->Activation('relu')); - $self->model->add(ConvLayer->new(16*$expansion, $self->output_nc, kernel_size=>7, stride=>1)); - }); - } - - method set_target($x) - { - my $F = $self->model1->($x); - my $G = $self->gram->($F); - $self->ins->set_target($G); - } - - method forward($input) - { - return $self->model->($input); - } -} - -1; diff --git a/perl-package/AI-MXNet/examples/gluon/style_transfer/style_transfer.pl b/perl-package/AI-MXNet/examples/gluon/style_transfer/style_transfer.pl deleted file mode 100755 index d85db2652f9a..000000000000 --- a/perl-package/AI-MXNet/examples/gluon/style_transfer/style_transfer.pl +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env perl - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -use strict; -use warnings; -use AI::MXNet::Gluon::Utils qw(download); -use AI::MXNet 'mx'; -use AI::MXNet::Gluon::NN 'nn'; -use PDL::IO::Pic; -require './net.pl'; -require './utils.pl'; -use Getopt::Long qw(HelpMessage); - -GetOptions( - 'content-image=s' => \(my $content_image), - 'style-image=s' => \(my $style_image), - 'model=s' => \(my $model = './data/msgnet_21styles-2cb88353.params'), - 'output-image=s' => \(my $output_image = 'out.jpg'), - 'content-size=i' => \(my $content_size = 512), - 'ngf' => \(my $ngf = 128), ## number of convolutional filters for the model - 'help' => sub { HelpMessage(0) }, -) or HelpMessage(1); - -die "Please supply --content-image and --style-image " - unless (defined $content_image and defined $style_image); -if($content_image =~ /^https:/ or $style_image =~ /^https:/) -{ - eval { require IO::Socket::SSL; }; - die "You need to have IO::Socket::SSL installed for https images" if $@; -} -$content_image = download($content_image) if $content_image =~ /^https?:/; -$style_image = download($style_image) if $style_image =~ /^https?:/; - -evaluate( - content_image => $content_image, - style_image => $style_image, - content_size => $content_size, - style_size => $content_size, - output_image => $output_image, - ngf => $ngf, - model => $model -); diff --git a/perl-package/AI-MXNet/examples/gluon/style_transfer/utils.pl b/perl-package/AI-MXNet/examples/gluon/style_transfer/utils.pl deleted file mode 100644 index f17cd623e017..000000000000 --- a/perl-package/AI-MXNet/examples/gluon/style_transfer/utils.pl +++ /dev/null @@ -1,73 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
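-
-# The helpers below are consumed by style_transfer.pl; a rough usage sketch
-# (file names are placeholders):
-#
-#   my $img = tensor_load_rgbimage('content.jpg', 512); # 1 x 3 x H x W float32 NDArray
-#   tensor_save_bgrimage($output->[0], 'out.jpg');      # reorders BGR to RGB and writes the file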
- -use strict; -use warnings; -use AI::MXNet::Function::Parameters; - -func tensor_load_rgbimage($filename, $size=) -{ - my $img = mx->image->imread($filename); - if($size) - { - $img = mx->image->resize_short($img, $size); - } - return $img->transpose([2,0,1])->expand_dims(axis=>0)->astype('float32'); -} - -func tensor_save_rgbimage($img, $filename) -{ - $img = nd->clip($img, a_min => 0, a_max => 255)->transpose([1,2,0])->aspdl; - $img->slice('X', 'X', '-1:0')->byte->wpic($filename); -} - -func tensor_save_bgrimage($tensor, $filename) -{ - $tensor = $tensor->at(0); - my ($b, $g, $r) = @{ nd->split($tensor, num_outputs=>3, axis=>0) }; - $tensor = nd->concat($r, $g, $b, dim=>0); - tensor_save_rgbimage($tensor, $filename); -} - - -func preprocess_batch($batch) -{ - $batch = nd->swapaxes($batch, 0, 1); - my ($r, $g, $b) = @{ nd->split($batch, num_outputs=>3, axis=>0) }; - $batch = nd->concat($b, $g, $r, dim=>0); - $batch = nd->swapaxes($batch, 0, 1); - return $batch; -} - -func evaluate(%args) -{ - my $ctx = mx->cpu; - # images - my $content_image = tensor_load_rgbimage($args{content_image}, $args{content_size}); - my $style_image = tensor_load_rgbimage($args{style_image}, $args{style_size}); - $style_image = preprocess_batch($style_image); - # model - my $style_model = Net->new(ngf=>$args{ngf}); - $style_model->load_parameters($args{model}, ctx=>$ctx); - - # forward - $style_model->set_target($style_image); - my $output = $style_model->($content_image); - tensor_save_bgrimage($output->[0], $args{output_image}); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet.pm b/perl-package/AI-MXNet/lib/AI/MXNet.pm deleted file mode 100644 index b453cc54c6ac..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet.pm +++ /dev/null @@ -1,125 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-
-package AI::MXNet;
-use v5.14.0;
-use strict;
-use warnings;
-use AI::MXNet::NS 'global';
-use AI::MXNet::Base;
-use AI::MXNet::Callback 'callback';
-use AI::MXNet::NDArray qw(nd ndarray);
-use AI::MXNet::Context 'context';
-use AI::MXNet::Symbol qw(sym symbol);
-use AI::MXNet::Executor;
-use AI::MXNet::Executor::Group;
-use AI::MXNet::CudaModule;
-use AI::MXNet::Random qw(rnd random);
-use AI::MXNet::Initializer qw(init initializer);
-use AI::MXNet::Optimizer qw(optimizer opt);
-use AI::MXNet::KVStore 'kv';
-use AI::MXNet::KVStoreServer;
-use AI::MXNet::IO 'io';
-use AI::MXNet::Metric 'metric';
-use AI::MXNet::LRScheduler;
-use AI::MXNet::Monitor 'mon';
-use AI::MXNet::Profiler;
-use AI::MXNet::Module::Base;
-use AI::MXNet::Module qw(mod module);
-use AI::MXNet::Module::Bucketing;
-use AI::MXNet::RNN 'rnn';
-use AI::MXNet::RunTime 'runtime';
-use AI::MXNet::Visualization 'viz';
-use AI::MXNet::RecordIO 'recordio';
-use AI::MXNet::Image qw(img image);
-use AI::MXNet::Contrib 'contrib';
-use AI::MXNet::LinAlg 'linalg';
-use AI::MXNet::CachedOp;
-use AI::MXNet::AutoGrad 'autograd';
-use AI::MXNet::Gluon 'gluon';
-use AI::MXNet::NDArray::Sparse;
-use AI::MXNet::Symbol::Sparse;
-use AI::MXNet::Engine 'engine';
-our $VERSION = '1.5';
-
-sub cpu { AI::MXNet::Context->cpu($_[1]//0) }
-sub cpu_pinned { AI::MXNet::Context->cpu_pinned($_[1]//0) }
-sub gpu { AI::MXNet::Context->gpu($_[1]//0) }
-sub name { __PACKAGE__ }
-sub rtc { __PACKAGE__ }
-sub Prefix { AI::MXNet::Symbol::Prefix->new(prefix => $_[1]) }
-our $AttrScope = AI::MXNet::Symbol::AttrScope->new;
-our $NameManager = AI::MXNet::Symbol::NameManager->new;
-our $Context = AI::MXNet::Context->new(device_type => 'cpu', device_id => 0);
-
-1;
-__END__
-
-=encoding UTF-8
-
-=head1 NAME
-
-AI::MXNet - Perl interface to MXNet machine learning library
-
-=head1 SYNOPSIS
-
-=head1 DESCRIPTION
-
-    Perl interface to the MXNet machine learning library.
-    MXNet supports the Perl programming language.
-    The MXNet Perl package brings flexible and efficient GPU computing and
-    state-of-the-art deep learning to Perl.
-    It enables you to write seamless tensor/matrix computation with multiple GPUs in Perl.
-    It also lets you construct and customize state-of-the-art deep learning models in Perl,
-    and apply them to tasks such as image classification and data science challenges.
-
-    One important thing to internalize is that the Perl interface is written to be as close
-    as possible to the Python API, so most, if not all, of Python's documentation and examples
-    should just work in Perl after a few changes that make the code a bit more Perlish.
-    In a nutshell, just add $ sigils and replace . = \n with -> => ;
-    and in 99% of cases that's all that is needed.
-    In addition, please refer to the very detailed MXNet Python API documentation.
-
-    AI::MXNet supports the new imperative PyTorch-like Gluon MXNet interface.
-    Please get acquainted with this new interface in the MXNet Gluon documentation.
-
-    For specific Perl Gluon usage please refer to the Perl examples and tests directories on github,
-    but be assured that the Python and Perl usage are extremely close in order to make the use
-    of the Python Gluon docs and examples as easy as possible.
-
-    AI::MXNet is seamlessly glued with L<PDL>; the C++ level state can be easily initialized from PDL
-    and the results can be transferred to PDL objects in order to allow you to use all the glory and power of the PDL!
-
-=head1 BUGS AND INCOMPATIBILITIES
-
-    Parity with the Python interface is mostly achieved; a few deprecated
-    and not often used features are left unported for now.
-
-=head1 SEE ALSO
-
-    L<https://mxnet.io>
-    L<https://github.com/apache/incubator-mxnet/tree/master/perl-package/AI-MXNet>
-    L<Function::Parameters>, L<Mouse>
-
-=head1 AUTHOR
-
-    Sergey Kolychev, <sergeykolychev.github@gmail.com>
-
-=head1 COPYRIGHT & LICENSE
-
-    This library is licensed under the Apache 2.0 license L<http://www.apache.org/licenses/LICENSE-2.0>
-
-=cut
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/AutoGrad.pm b/perl-package/AI-MXNet/lib/AI/MXNet/AutoGrad.pm
deleted file mode 100644
index d6272b5a1def..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/AutoGrad.pm
+++ /dev/null
@@ -1,482 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::AutoGrad;
-use strict;
-use warnings;
-use AI::MXNet::NS 'global';
-use AI::MXNet::Base;
-use AI::MXNet::Function::Parameters;
-use Scalar::Util qw(blessed);
-use Carp qw(confess);
-
-=head1 NAME
-
-    AI::MXNet::AutoGrad - Autograd for NDArray.
-=cut
-
-=head1 DESCRIPTION
-
-    Auto gradients differentiation for dynamic graphs, primarily used with Gluon.
-
-=cut
-
-=head1 SYNOPSIS
-
-    use AI::MXNet qw(mx);
-    my $x = mx->nd->ones([1]);
-    $x->attach_grad;
-    my $z;
-    mx->autograd->record(sub {
-        $z = mx->nd->elemwise_add($x->exp, $x);
-    });
-    my $dx = mx->autograd->grad($z, $x, create_graph=>1);
-    ok(abs($dx->asscalar - 3.71828175) < 1e-7);
-    $dx->backward;
-    ok(abs($x->grad->asscalar - 2.71828175) < 1e-7);
-
-=cut
-
-=head2 set_is_training
-
-    Set status to training/not training. When training, the graph will be constructed
-    for gradient computation. Operators will also run with $is_train=1. For example,
-    Dropout will drop inputs randomly when $is_train=1, while simply passing through
-    if $is_train=0.
-
-    Parameters
-    ----------
-    $is_train: Bool
-
-    Returns
-    -------
-    previous state before this set.
-=cut
-
-
-method set_is_training(Bool $is_train)
-{
-    return scalar(check_call(AI::MXNetCAPI::AutogradSetIsTraining($is_train)));
-}
-
-=head2 set_is_recording
-
-    Set status to recording/not recording. When recording, the graph will be constructed
-    for gradient computation.
-
-    Parameters
-    ----------
-    $is_recording: Bool
-
-    Returns
-    -------
-    previous state before this set.
-=cut
-
-method set_is_recording(Bool $is_recording)
-{
-    return scalar(check_call(AI::MXNetCAPI::AutogradSetIsRecording($is_recording)));
-}
-
-=head2 is_recording
-
-    Get status on recording/not recording.
-
-    Returns
-    -------
-    Current state of recording.
-=cut
-
-method is_recording()
-{
-    return scalar(check_call(AI::MXNetCAPI::AutogradIsRecording()));
-}
-
-=head2 is_training
-
-    Get status on training/predicting.
-
-    Returns
-    -------
-    Current state of training/predicting.
-=cut
-
-method is_training()
-{
-    return scalar(check_call(AI::MXNetCAPI::AutogradIsTraining()));
-}
-
-=head2 mark_variables
-
-    Mark AI::MXNet::NDArrays as variables to compute gradient for autograd.
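-
-    For example, a minimal sketch of the low-level flow (names are
-    arbitrary; all calls are the ones defined in this module):
-
-        my $x  = mx->nd->ones([2]);
-        my $gx = $x->zeros_like;
-        mx->autograd->mark_variables([$x], [$gx]);
-        my $y;
-        mx->autograd->record(sub { $y = $x * 2 });
-        mx->autograd->backward([$y]);
-        # $gx now holds dy/dx, i.e. [2, 2]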
- - Parameters - ---------- - ArrayRef[AI::MXNet::NDArray] $variables - ArrayRef[AI::MXNet::NDArray] $gradients - GradReq|ArrayRef[GradReq] :$grad_reqs='write' -=cut - -method mark_variables( - ArrayRef[AI::MXNet::NDArray] $variables, - ArrayRef[AI::MXNet::NDArray] $gradients, - GradReq|ArrayRef[GradReq] :$grad_reqs='write' -) -{ - my @variable_handles = map { $_->handle } @{ $variables }; - my @gradient_handles = map { $_->handle } @{ $gradients }; - my @grad_reqs; - if(not ref $grad_reqs) - { - @grad_reqs = (GRAD_REQ_MAP->{ $grad_reqs }) x scalar(@variable_handles); - } - else - { - @grad_reqs = map { GRAD_REQ_MAP->{ $_ } } @{ $grad_reqs }; - } - check_call( - AI::MXNetCAPI::AutogradMarkVariables( - scalar(@variable_handles), - \@variable_handles, - \@grad_reqs, - \@gradient_handles - ) - ); -} - -=head2 backward - - Compute the gradients of heads w.r.t previously marked variables. - - Parameters - ---------- - $heads: ArrayRef[AI::MXNet::NDArray] - Output NDArray(s) - :$head_grads=: Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray|Undef]] - Gradients with respect to heads. - :$retain_graph=0: Bool, optional - Whether to retain graph. - :$train_mode=1: Bool, optional - Whether to do backward for training or predicting. -=cut -method backward( - AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray] $heads, - Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray|Undef]] :$head_grads=, - Bool :$retain_graph=0, - Bool :$train_mode=1 -) -{ - my ($head_handles, $hgrad_handles) = _parse_head($heads, $head_grads); - check_call( - AI::MXNetCAPI::AutogradBackwardEx( - scalar(@{ $head_handles }), - $head_handles, - $hgrad_handles, - 0, - [], - $retain_graph, - 0, - $train_mode - ) - ); -} - -=head2 compute_gradient - - Compute the gradients of outputs w.r.t variables. - - Parameters - ---------- - outputs: ArrayRef[AI::MXNet::NDArray] - - Returns - ------- - gradients: ArrayRef[AI::MXNet::NDArray] -=cut - - -method compute_gradient(ArrayRef[AI::MXNet::NDArray] $outputs) -{ - __PACKAGE__->backward($outputs); -} - -=head2 grad_and_loss - - Return function that computes both gradient of arguments and loss value. - - Parameters - ---------- - $func: CodeRef - The forward (loss) function. - $argnum: Maybe[Int|ArrayRef[Int]] - The index of argument to calculate gradient for. - - Returns - ------- - grad_and_loss_func: CodeRef - A function that would compute both the gradient of arguments and loss value. -=cut - -method grad_and_loss(CodeRef $func, Maybe[Int|ArrayRef[Int]] $argnum=) -{ - return sub { - my @args = @_; - my @variables = @_; - if(defined $argnum) - { - my @argnum = ref $argnum ? @$argnum : ($argnum); - @variables = map { $args[$_] } @argnum; - } - map { - assert( - (blessed($_) and $_->isa('AI::MXNet::NDArray')), - "type of autograd input should NDArray") - } @variables; - my @grads = map { $_->zeros_like } @variables; - __PACKAGE__->mark_variables(\@variables, \@grads); - my $outputs; - __PACKAGE__->record(sub { $outputs = $func->(@args) }); - __PACKAGE__->backward(ref $outputs eq 'ARRAY' ? $outputs : [$outputs]); - return (\@grads, $outputs); - }; -} - -=head2 grad - - Compute the gradients of heads w.r.t variables. Gradients will be - returned as new NDArrays instead of stored into `variable.grad`. - Supports recording gradient graph for computing higher order gradients. - - Note: Currently only a very limited set of operators support higher order - gradients. 
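-
-    As a quick illustration of grad_and_loss above (a sketch; the "loss"
-    is just $a * $a):
-
-        my $f = mx->autograd->grad_and_loss(sub { my ($a) = @_; $a * $a });
-        my ($grads, $loss) = $f->(mx->nd->array([3]));
-        # $grads->[0] holds d(a*a)/da at a=3, i.e. [6]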
- - Parameters - ---------- - $heads: AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray] - Output NDArray(s) - $variables: AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray] - Input variables to compute gradients for. - :$head_grads=: Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray|Undef]] - Gradients with respect to heads. - :$retain_graph=: Bool - Whether to keep computation graph to differentiate again, instead - of clearing history and release memory. Defaults to the same value - as create_graph. - :$create_graph=0: Bool - Whether to record gradient graph for computing of higher order gradients. - $train_mode=1: Bool, optional - Whether to do backward for training or prediction. - - Returns - ------- - AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]: - Gradients with respect to variables. - - Examples - -------- - >>> $x = mx->nd->ones([1]); - >>> $x->attach_grad(); - >>> mx->autograd->record(sub { - $z = mx->nd->elemwise_add(mx->nd->exp($x), $x); - }); - >>> $dx = mx->autograd->grad($z, [$x], create_graph=>1) - >>> $dx->backward(); - >>> print($dx->grad->aspdl) - [3.71828175] -=cut - -method grad( - AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray] $heads, - AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray] $variables, - Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray|Undef]] :$head_grads=, - Bool :$retain_graph=, - Bool :$create_graph=0, - Bool :$train_mode=1 -) -{ - my ($head_handles, $hgrad_handles) = _parse_head($heads, $head_grads); - my @var_handles; - if(blessed $variables) - { - @var_handles = ($variables->handle); - } - else - { - assert(scalar(@{ $variables }), "variables cannot be an empty array."); - @var_handles = map { $_->handle } @{ $variables }; - } - - $retain_graph //= $create_graph; - - my ($grad_vars, $grad_stypes) - = - check_call( - AI::MXNetCAPI::AutogradBackwardEx( - scalar(@{ $head_handles }), - $head_handles, - $hgrad_handles, - scalar(@var_handles), - \@var_handles, - $retain_graph, - $create_graph, - $train_mode - ) - ); - - my @ret; - for(zip($grad_vars, $grad_stypes)) { - my ($handle, $stype) = @$_; - push @ret, AI::MXNet::NDArray->_ndarray_cls($handle, 1, $stype); - } - if(blessed $variables) - { - return $ret[0]; - } - return \@ret; -} - -=head2 train_mode - - Executes $sub within an autograd training scope context. - Parameters - ---------- - $sub: CodeRef -=cut - -method train_mode(CodeRef $sub) -{ - my $prev = __PACKAGE__->set_is_training(1); - eval { $sub->(); }; - __PACKAGE__->set_is_training(0) unless $prev; - confess($@) if $@; -} - -=head2 predict_mode - - Executes $sub within an autograd predicting scope context. - Parameters - ---------- - $sub: CodeRef -=cut - -method predict_mode(CodeRef $sub) -{ - my $prev = __PACKAGE__->set_is_training(0); - eval { $sub->(); }; - __PACKAGE__->set_is_training(1) if $prev; - confess($@) if $@; -} - -=head2 record - - Executes $sub within an autograd recording scope context - and captures code that needs gradients to be calculated. 
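-
-    For example, recording in predict mode rather than the default train
-    mode (a sketch; $x as in the SYNOPSIS above):
-
-        my $z;
-        mx->autograd->record(sub { $z = $x->exp + $x }, train_mode => 0);
-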
- Parameters - ---------- - $sub: CodeRef - :$train_mode=1 : Maybe[Bool] -=cut - -method record(CodeRef $sub, Maybe[Bool] :$train_mode=1) -{ - my $prev_train; - if(defined $train_mode) - { - $prev_train = __PACKAGE__->set_is_training($train_mode); - } - my $prev_recording = __PACKAGE__->set_is_recording(1); - eval { $sub->(); }; - if(defined $train_mode) - { - $prev_train = __PACKAGE__->set_is_training($prev_train) if not $prev_train == $train_mode; - } - __PACKAGE__->set_is_recording(0) unless $prev_recording; - confess($@) if $@; -} - -=head2 pause - - Executes $sub within an autograd recording scope context - and captures code that needs gradients to be calculated. - Parameters - ---------- - $sub: CodeRef - :$train_mode=0 : Maybe[Bool] -=cut - -method pause(CodeRef $sub, Maybe[Bool] :$train_mode=0) -{ - my $prev_train; - if(defined $train_mode) - { - $prev_train = __PACKAGE__->set_is_training($train_mode); - } - my $prev_recording = __PACKAGE__->set_is_recording(0); - eval { $sub->(); }; - if(defined $train_mode) - { - $prev_train = __PACKAGE__->set_is_training($prev_train) if not $prev_train == $train_mode; - } - __PACKAGE__->set_is_recording(1) if $prev_recording; - confess($@) if $@; -} - -=head2 get_symbol - - Retrieve recorded computation history as `Symbol`. - - Parameters - ---------- - $x : AI::MXNet::NDArray - AI::MXNet::NDArray representing the head of computation graph. - Returns - ------- - AI::MXNet::Symbol - The retrieved Symbol. -=cut - -method get_symbol(AI::MXNet::NDArray $x) -{ - my $handle = scalar(check_call(AI::MXNetCAPI::AutogradGetSymbol($x->handle))); - return AI::MXNet::Symbol->new(handle => $handle); -} - -# parse head gradient for backward and grad. -func _parse_head($heads, $head_grads) -{ - if(blessed $heads) - { - $heads = [$heads]; - } - if(blessed $head_grads) - { - $head_grads = [$head_grads]; - } - my @head_handles = map { $_->handle } @{ $heads }; - my @hgrad_handles; - if(defined $head_grads) - { - assert( - (@{ $heads } == @{ $head_grads }), - "heads and head_grads must be lists of the same length" - ); - @hgrad_handles = map { defined($_) ? $_->handle : undef } @{ $head_grads }; - } - return (\@head_handles, \@hgrad_handles); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/AutoLoad.pm b/perl-package/AI-MXNet/lib/AI/MXNet/AutoLoad.pm deleted file mode 100644 index 927fd53b13a1..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/AutoLoad.pm +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -package AI::MXNet::AutoLoad; -use strict; -use warnings; - -sub AUTOLOAD -{ - my ($class) = @_; - my ($prefix, $real_class) = $class->config; - my ($name) = our $AUTOLOAD =~ /::(\w+)$/; - my $sub = "_${prefix}_$name"; - { - no strict 'refs'; - *{"${class}::$name"} = sub { shift; $real_class->$sub(@_); }; - } - goto $class->can($name); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Base.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Base.pm deleted file mode 100644 index 9ac917b57b41..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Base.pm +++ /dev/null @@ -1,482 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Base; -use strict; -use warnings; -use PDL; -use PDL::Types (); -use PDL::CCS::Nd; -use AI::MXNetCAPI 1.5; -use AI::NNVMCAPI 1.4; -use AI::MXNet::Types; -use Time::HiRes; -use Scalar::Util qw(blessed); -use Carp; -use Exporter; -use base qw(Exporter); -use List::Util qw(shuffle); -use Data::Dumper; - -our @EXPORT = qw(product enumerate assert zip check_call build_param_doc - pdl cat dog svd bisect_left pdl_shuffle as_array ascsr rand_sparse - DTYPE_STR_TO_MX DTYPE_MX_TO_STR DTYPE_MX_TO_PDL - DTYPE_PDL_TO_MX DTYPE_MX_TO_PERL GRAD_REQ_MAP - STORAGE_TYPE_UNDEFINED STORAGE_TYPE_DEFAULT - STORAGE_TYPE_ROW_SPARSE STORAGE_TYPE_CSR - STORAGE_TYPE_STR_TO_ID STORAGE_TYPE_ID_TO_STR STORAGE_AUX_TYPES); -our @EXPORT_OK = qw(pzeros pceil pones digitize hash array_index range); - -use constant DTYPE_STR_TO_MX => { - float32 => 0, - float64 => 1, - float16 => 2, - uint8 => 3, - int32 => 4, - int8 => 5, - int64 => 6 -}; -use constant DTYPE_MX_TO_STR => { - 0 => 'float32', - 1 => 'float64', - 2 => 'float16', - 3 => 'uint8', - 4 => 'int32', - 5 => 'int8', - 6 => 'int64' -}; -use constant DTYPE_MX_TO_PDL => { - 0 => 6, - 1 => 7, - 2 => 6, - 3 => 0, - 4 => 3, - 5 => 0, - 6 => 5, - float32 => 6, - float64 => 7, - float16 => 6, - uint8 => 0, - int32 => 3, - int8 => 0, - int64 => 5 -}; -use constant DTYPE_PDL_TO_MX => { - 6 => 0, - 7 => 1, - 0 => 3, - 3 => 4, - 5 => 6 -}; -use constant DTYPE_MX_TO_PERL => { - 0 => 'f', - 1 => 'd', - 2 => 'S', - 3 => 'C', - 4 => 'l', - 5 => 'c', - 6 => 'q', - float32 => 'f', - float64 => 'd', - float16 => 'S', - uint8 => 'C', - int32 => 'l', - int8 => 'c', - int64 => 'q' -}; -use constant GRAD_REQ_MAP => { - null => 0, - write => 1, - add => 3 -}; -use constant { - STORAGE_TYPE_UNDEFINED => -1, - STORAGE_TYPE_DEFAULT => 0, - STORAGE_TYPE_ROW_SPARSE => 1, - STORAGE_TYPE_CSR => 2 -}; -use constant STORAGE_TYPE_STR_TO_ID => { - undefined => STORAGE_TYPE_UNDEFINED, - default => STORAGE_TYPE_DEFAULT, - row_sparse => STORAGE_TYPE_ROW_SPARSE, - csr => STORAGE_TYPE_CSR -}; -use constant STORAGE_TYPE_ID_TO_STR => { - STORAGE_TYPE_UNDEFINED() => 'undefined', - STORAGE_TYPE_DEFAULT() => 'default', - 
STORAGE_TYPE_ROW_SPARSE() => 'row_sparse', - STORAGE_TYPE_CSR() => 'csr' -}; -use constant STORAGE_AUX_TYPES => { - row_sparse => ['int64'], - csr => ['int64', 'int64'] -}; - - -=head1 NAME - - AI::MXNet::Base - Helper functions - -=head1 DEFINITION - - Helper functions - -=head2 zip - - Perl version of for x,y,z in zip (arr_x, arr_y, arr_z) - - Parameters - ---------- - $sub_ref, called with @_ filled with $arr_x->[$i], $arr_y->[$i], $arr_z->[$i] - for each loop iteration. - - @array_refs -=cut - -sub zip -{ - if('CODE' eq ref $_[0]) - { - # continue supporting the callback style - my $code = shift; - $code->(@$_) for AI::MXNetCAPI::py_zip(map { \@$_ } @_); - return; - } - # the map() here may seem like a no-op, but triggers overloading or - # whatever else is needed to make array-ish things actually arrays - # before entering the low level list builder. - return AI::MXNetCAPI::py_zip(map { \@$_ } @_); -} - -=head2 enumerate - - Same as zip, but the argument list in the anonymous sub is prepended - by the iteration count. -=cut - -sub enumerate -{ - if('CODE' eq ref $_[0]) - { - # continue supporting the callback style - my $code = shift; - my $len = @{ $_[0] }; - $code->(@$_) for AI::MXNetCAPI::py_zip([0..$len-1], map { \@$_ } @_); - return; - } - my $len = @{ $_[0] }; - return AI::MXNetCAPI::py_zip([0..$len-1], map { \@$_ } @_); -} - -=head2 product - - Calculates the product of the input agruments. -=cut - -sub product -{ - my $p = 1; - map { $p = $p * $_ } @_; - return $p; -} - -=head2 bisect_left - - https://hg.python.org/cpython/file/2.7/Lib/bisect.py -=cut - -sub bisect_left -{ - my ($a, $x, $lo, $hi) = @_; - $lo //= 0; - $hi //= @{ $a }; - if($lo < 0) - { - Carp::confess('lo must be non-negative'); - } - while($lo < $hi) - { - my $mid = int(($lo+$hi)/2); - if($a->[$mid] < $x) - { - $lo = $mid+1; - } - else - { - $hi = $mid; - } - } - return $lo; -} - -=head2 pdl_shuffle - - Shuffle the pdl by the last dimension - - Parameters - ----------- - PDL $pdl - $preshuffle Maybe[ArrayRef[Index]], if defined the array elements are used - as shuffled last dimension's indexes -=cut - -sub pdl_shuffle -{ - my ($pdl, $preshuffle) = @_; - my @shuffle = $preshuffle ? @{ $preshuffle } : shuffle(0..$pdl->dim(-1)-1); - return $pdl->dice_axis(-1, pdl(\@shuffle)); -} - -=head2 assert - - Parameters - ----------- - Bool $input - Str $error_str - Calls Carp::confess with $error_str//"AssertionError" if the $input is false -=cut - -sub assert -{ - my ($input, $error_str) = @_; - local($Carp::CarpLevel) = 1; - Carp::confess($error_str//'AssertionError') - unless $input; -} - -=head2 check_call - - Checks the return value of C API call - - This function will raise an exception when error occurs. - Every API call is wrapped with this function. - - Returns the C API call return values stripped of first return value, - checks for return context and returns first element in - the values list when called in scalar context. -=cut - -sub check_call -{ - Carp::confess(AI::MXNetCAPI::GetLastError()) if shift; - return wantarray ? @_ : $_[0]; -} - -=head2 build_param_doc - - Builds argument docs in python style. - - arg_names : array ref of str - Argument names. - - arg_types : array ref of str - Argument type information. - - arg_descs : array ref of str - Argument description information. - - remove_dup : boolean, optional - Whether to remove duplication or not. - - Returns - ------- - docstr : str - Python docstring of parameter sections. 
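-
-    As a quick illustration of the zip/enumerate helpers documented above
-    (a sketch):
-
-        my @sums;
-        zip(sub { my ($x, $y) = @_; push @sums, $x + $y }, [1, 2], [10, 20]);  # callback style
-        my @pairs = zip([1, 2], [10, 20]);                       # list style, ([1,10], [2,20])
-        enumerate(sub { my ($i, $v) = @_; print "$i: $v\n" }, ['a', 'b']);
-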
-=cut - -sub build_param_doc -{ - my ($arg_names, $arg_types, $arg_descs, $remove_dup) = @_; - $remove_dup //= 1; - my %param_keys; - my @param_str; - for(zip($arg_names, $arg_types, $arg_descs)) { - my ($key, $type_info, $desc) = @$_; - next if exists $param_keys{$key} and $remove_dup; - $param_keys{$key} = 1; - my $ret = sprintf("%s : %s", $key, $type_info); - $ret .= "\n ".$desc if length($desc); - push @param_str, $ret; - } - return sprintf("Parameters\n----------\n%s\n", join("\n", @param_str)); -} - -=head2 _notify_shutdown - - Notify MXNet about shutdown. -=cut - -sub _notify_shutdown -{ - check_call(AI::MXNetCAPI::NotifyShutdown()); -} - -sub _indent -{ - my ($s_, $numSpaces) = @_; - my @s = split(/\n/, $s_); - if (@s == 1) - { - return $s_; - } - my $first = shift(@s); - @s = ($first, map { (' 'x$numSpaces) . $_ } @s); - return join("\n", @s); -} - -sub as_array -{ - return ref $_[0] eq 'ARRAY' ? $_[0] : [$_[0]]; -} - -my %internal_arguments = (prefix => 1, params => 1, shared => 1); -my %attributes_per_class; -sub process_arguments -{ - my $orig = shift; - my $class = shift; - if($class->can('python_constructor_arguments')) - { - if(not exists $attributes_per_class{$class}) - { - %{ $attributes_per_class{$class} } = map { $_->name => 1 } $class->meta->get_all_attributes; - } - my %kwargs; - while(@_ >= 2 and defined $_[-2] and not ref $_[-2] and (exists $attributes_per_class{$class}{ $_[-2] } or exists $internal_arguments{ $_[-2] })) - { - my $v = pop(@_); - my $k = pop(@_); - $kwargs{ $k } = $v; - } - if(@_) - { - my @named_params = @{ $class->python_constructor_arguments }; - Carp::confess("Paramers mismatch expected ".Dumper(\@named_params).", but got ".Dumper(\@_)) - if @_ > @named_params; - @kwargs{ @named_params[0..@_-1] } = @_; - } - return $class->$orig(%kwargs); - } - return $class->$orig(@_); -} - -END { - _notify_shutdown(); - Time::HiRes::sleep(0.01); -} - -*pzeros = \&zeros; -*pones = \&ones; -*pceil = \&ceil; -## making sure that we can stringify arbitrarily large piddles -$PDL::toolongtoprint = 1000_000_000; -## convenience subs - -sub ascsr -{ - my ($data, $indptr, $indices, $shape) = @_; - my @which; - my $i = 0; - my $j = 0; - while($i < $indices->nelem) - { - for($i = $indptr->at($j); $i < $indptr->at($j+1); $i++) - { - push @which, [$j, $indices->at($i)]; - } - $j++; - } - return PDL::CCS::Nd->newFromWhich( - pdl(\@which), $data, pdims => blessed $shape ? $shape : pdl($shape) - )->xchg(0, 1); -} - -package AI::MXNet::COO::Nd; -use Mouse; -has ['data', 'row', 'col'] => (is => 'rw'); -no Mouse; - -package AI::MXNet::Base; - -sub tocoo -{ - my $csr = shift; - return AI::MXNet::COO::Nd->new( - data => $csr->data, - row => $csr->_whichND->slice(0)->flat, - col => $csr->_whichND->slice(1)->flat - ); -} - -sub rand_sparse -{ - my ($num_rows, $num_cols, $density, $dtype, $format) = @_; - $dtype //= 'float32'; - $format //= 'csr'; - my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{ $dtype }); - my $dense = random($pdl_type, $num_cols, $num_rows); - my $missing = 0; - $dense->where(random($num_cols, $num_rows)<=1-$density) .= $missing; - if($format eq 'csr') - { - return $dense->tocsr; - } - return $dense; -} - -{ - no warnings 'once'; - *PDL::CCS::Nd::data = sub { shift->_nzvals }; - *PDL::CCS::Nd::indptr = sub { my $self = shift; ($self->hasptr ? 
$self->getptr : $self->ptr)[0] }; - *PDL::CCS::Nd::indices = sub { shift->_whichND->slice(1)->flat }; - *PDL::CCS::Nd::tocoo = sub { tocoo(shift) }; - *PDL::CCS::Nd::shape = sub { shift->pdims }; - *PDL::CCS::Nd::dtype = sub { DTYPE_MX_TO_STR->{ DTYPE_PDL_TO_MX->{ shift->type->numval } } }; - *PDL::tocsr = sub { shift->xchg(0, 1)->toccs->xchg(0, 1) }; - *PDL::rand_sparse = sub { shift; rand_sparse(@_) }; -} - -{ - my $orig_at = PDL->can('at'); - no warnings 'redefine'; - *PDL::at = sub { - my ($self, @args) = @_; - return $orig_at->($self, @args) if @args != 1; - return $orig_at->($self, @args) if $self->ndims == 1; - return $self->slice(('X')x($self->ndims-1), $args[0])->squeeze; - }; - *PDL::len = sub { shift->dim(-1) }; - *PDL::dtype = sub { DTYPE_MX_TO_STR->{ DTYPE_PDL_TO_MX->{ shift->type->numval } } }; -} - -sub digitize -{ - my ($d, $bins) = @_; - for(my $i = 0; $i < @$bins; $i++) - { - return $i if $d < $bins->[$i]; - } - return scalar(@$bins); -} - -use B; -sub hash { hex(B::hash(shift)) } -use List::Util (); -sub array_index { my ($s, $array) = @_; return List::Util::first { $array->[$_] eq $s } 0..@$array-1 } -sub range { my ($begin, $end, $step) = @_; $step //= 1; grep { not (($_-$begin) % $step) } $begin..$end-1 } - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/CachedOp.pm b/perl-package/AI-MXNet/lib/AI/MXNet/CachedOp.pm deleted file mode 100644 index 7e73ded8ad07..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/CachedOp.pm +++ /dev/null @@ -1,119 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::CachedOp; - -=head1 NAME - - AI::MXNet::CachedOp - A wrapper around CachedOpHandle -=cut - -=head1 DESCRIPTION - - Internal module, used as a part of AI::MXNet::Gluon::HybridBlock. -=cut - -use strict; -use warnings; -use AI::MXNet::Base; -use Mouse; -use overload '&{}' => sub { my $self = shift; sub { $self->call(@_) } }; - -has 'handle' => (is => 'ro', isa => 'CachedOpHandle', required => 1); -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - my ($sym, $flags) = @_; - for my $key (keys %$flags) - { - $flags->{ $key } = "(" .join(", ", map { defined($_) ? 
$_ : 'None' } @{ $flags->{ $key } }) .")" - if ref $flags->{ $key } eq 'ARRAY'; - } - my $handle = check_call( - AI::MXNetCAPI::CreateCachedOpEx( - $sym->handle, - scalar(keys %{ $flags//{} }), - $flags//{}, - ) - ); - return $class->$orig(handle => $handle); -}; - -sub DEMOLISH -{ - check_call(AI::MXNetCAPI::FreeCachedOp(shift->handle)); -} - -sub call -{ - my $self = shift; - my @args; - my %kwargs; - if(blessed $_[0] and $_[0]->isa('AI::MXNet::NDArray')) - { - while(blessed $_[0] and $_[0]->isa('AI::MXNet::NDArray')) - { - push @args, shift(@_); - } - %kwargs = @_; - } - else - { - %kwargs = @_; - } - my $out = delete $kwargs{out}; - if(%kwargs) - { - confess( - "AI::MXNet::CachedOp::call got unexpected keyword argument(s): ". - join(', ', keys %kwargs) - ); - } - my $original_output; - if(defined $out) - { - $original_output = $out; - if(blessed($out)) - { - $out = [$out]; - } - } - else - { - $out = []; - } - my ($output, $stypes) = check_call( - AI::MXNetCAPI::InvokeCachedOpEx( - $self->handle, - scalar(@args), - [map { $_->handle } @args], - [map { $_->handle } @$out] - ) - ); - return $original_output if defined $original_output; - if(@$output == 1) - { - return AI::MXNet::NDArray->_ndarray_cls($output->[0], 1, $stypes->[0]); - } - else - { - my $i = 0; - return [map { AI::MXNet::NDArray->_ndarray_cls($_, 1, $stypes->[$i++]) } @$output]; - } -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Callback.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Callback.pm deleted file mode 100644 index 72ea1bd4502c..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Callback.pm +++ /dev/null @@ -1,296 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Callback; -use strict; -use warnings; -use List::Util qw/max/; -use AI::MXNet::NS; -use AI::MXNet::Function::Parameters; -use Mouse; -use overload "&{}" => sub { my $self = shift; sub { $self->call(@_) } }; - -=head1 NAME - - AI::MXNet::Callback - A collection of predefined callback functions. -=cut - -=head1 DESCRIPTION - - A collection of predefined callback functions, mainly to be used in AI::MXNet::Module::Base::fit. -=cut - -=head1 SYNOPSIS - - my $model = mx->mod->Module( - symbol => $net, - context => $contexts - ); - $model->fit( - $data_iter, - eval_metric => mx->metric->Perplexity, - kvstore => $kv_store, - optimizer => $optimizer, - optimizer_params => { - learning_rate => $lr, - momentum => $mom, - wd => $wd, - clip_gradient => 5, - rescale_grad => 1/$batch_size, - lr_scheduler => AI::MXNet::FactorScheduler->new(step => 1000, factor => 0.99) - }, - initializer => mx->init->Xavier(factor_type => "in", magnitude => 2.34), - num_epoch => $num_epoch, - batch_end_callback => mx->callback->Speedometer($batch_size, $disp_batches), - ($chkp_epoch ? 
(epoch_end_callback => [mx->callback->module_checkpoint($model, $chkp_prefix, $chkp_epoch), \&sample]) : ()) - ); -=cut - -=head2 module_checkpoint - - Callback to save the module setup in the checkpoint files. - - Parameters - ---------- - $mod : subclass of AI::MXNet::Module::Base - The module to checkpoint. - $prefix : Str - The file prefix to checkpoint to - $period=1 : Int - How many epochs to wait before checkpointing. Default is 1. - $save_optimizer_states=0 : Bool - Whether to save optimizer states for later training. - - Returns - ------- - $callback : sub ref - The callback function that can be passed as iter_end_callback to fit. -=cut - -method module_checkpoint( - AI::MXNet::Module::Base $mod, - Str $prefix, - Int $period=1, - Int $save_optimizer_states=0 -) -{ - $period = max(1, $period); - return sub { - my ($iter_no, $sym, $arg, $aux) = @_; - if(($iter_no + 1) % $period == 0) - { - $mod->save_checkpoint($prefix, $iter_no + 1, $save_optimizer_states); - } - } -} - -=head2 log_train_metric - - Callback to log the training evaluation result every period. - - Parameters - ---------- - $period : Int - The number of batches after which to log the training evaluation metric. - $auto_reset : Bool - Whether to reset the metric after the logging. - - Returns - ------- - $callback : sub ref - The callback function that can be passed as iter_epoch_callback to fit. -=cut - -method log_train_metric(Int $period, Int $auto_reset=0) -{ - return sub { - my ($param) = @_; - if($param->nbatch % $period == 0 and defined $param->eval_metric) - { - my $name_value = $param->eval_metric->get_name_value; - while(my ($name, $value) = each %{ $name_value }) - { - AI::MXNet::Logging->info( - "Iter[%d] Batch[%d] Train-%s=%f", - $param->epoch, $param->nbatch, $name, $value - ); - } - $param->eval_metric->reset if $auto_reset; - } - } -} - -package AI::MXNet::Speedometer; -use Mouse; -use Time::HiRes qw/time/; -extends 'AI::MXNet::Callback'; - -=head1 NAME - - AI::MXNet::Speedometer - A callback that logs training speed -=cut - -=head1 DESCRIPTION - - Calculate and log training speed periodically. - - Parameters - ---------- - batch_size: int - batch_size of data - frequent: int - How many batches between calculations. - Defaults to calculating & logging every 50 batches. - auto_reset: Bool - Reset the metric after each log, defaults to true. 
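-
-    For example, wired into a fit call as in the SYNOPSIS above (a sketch;
-    batch size and logging frequency are arbitrary):
-
-        batch_end_callback => mx->callback->Speedometer(128, 50)
-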
-=cut - -has 'batch_size' => (is => 'ro', isa => 'Int', required => 1); -has 'frequent' => (is => 'ro', isa => 'Int', default => 50); -has 'init' => (is => 'rw', isa => 'Int', default => 0); -has 'tic' => (is => 'rw', isa => 'Num', default => 0); -has 'last_count' => (is => 'rw', isa => 'Int', default => 0); -has 'auto_reset' => (is => 'ro', isa => 'Bool', default => 1); - -method call(AI::MXNet::BatchEndParam $param) -{ - my $count = $param->nbatch; - if($self->last_count > $count) - { - $self->init(0); - } - $self->last_count($count); - - if($self->init) - { - if(($count % $self->frequent) == 0) - { - my $speed = $self->frequent * $self->batch_size / (time - $self->tic); - if(defined $param->eval_metric) - { - my $name_value = $param->eval_metric->get_name_value; - $param->eval_metric->reset if $self->auto_reset; - while(my ($name, $value) = each %{ $name_value }) - { - AI::MXNet::Logging->info( - "Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f", - $param->epoch, $count, $speed, $name, $value - ); - } - } - else - { - AI::MXNet::Logging->info( - "Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec", - $param->epoch, $count, $speed - ); - } - $self->tic(time); - } - } - else - { - $self->init(1); - $self->tic(time); - } -} - -*slice = \&call; - -package AI::MXNet::ProgressBar; -use Mouse; -extends 'AI::MXNet::Callback'; - -=head1 NAME - - AI::MXNet::ProgressBar - A callback to show a progress bar. - -=head1 DESCRIPTION - - Shows a progress bar. - - Parameters - ---------- - total: Int - batch size, default is 1 - length: Int - the length of the progress bar, default is 80 chars -=cut - -has 'length' => (is => 'ro', isa => 'Int', default => 80); -has 'total' => (is => 'ro', isa => 'Int', required => 1); - -method call(AI::MXNet::BatchEndParam $param) -{ - my $count = $param->nbatch; - my $filled_len = int(0.5 + $self->length * $count / $self->total); - my $percents = int(100.0 * $count / $self->total) + 1; - my $prog_bar = ('=' x $filled_len) . ('-' x ($self->length - $filled_len)); - print "[$prog_bar] $percents%\r"; -} - -*slice = \&call; - -# Just logs the eval metrics at the end of an epoch. -package AI::MXNet::LogValidationMetricsCallback; -use Mouse; -extends 'AI::MXNet::Callback'; - -=head1 NAME - - AI::MXNet::LogValidationMetricsCallback - A callback to log the eval metrics at the end of an epoch. -=cut - -method call(AI::MXNet::BatchEndParam $param) -{ - return unless defined $param->eval_metric; - my $name_value = $param->eval_metric->get_name_value; - while(my ($name, $value) = each %{ $name_value }) - { - AI::MXNet::Logging->info( - "Epoch[%d] Validation-%s=%f", - $param->epoch, $name, $value - ); - } -} - -package AI::MXNet::Callback; - -method Speedometer(@args) -{ - AI::MXNet::Speedometer->new( - @args == 3 ? - (batch_size => $args[0], frequent => $args[1], auto_reset => $args[2]) - : @args == 2 ? - (batch_size => $args[0], frequent => $args[1]) - : (batch_size => $args[0]) - ) -} - -method ProgressBar(@args) -{ - AI::MXNet::ProgressBar->new( - @args == 2 ? 
(total => $args[0], 'length' => $args[1]) : (total => $args[0]) - ) -} - -method LogValidationMetricsCallback() -{ - AI::MXNet::LogValidationMetricsCallback->new -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm deleted file mode 100644 index 2cca47f9ab4d..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm +++ /dev/null @@ -1,240 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Context; -use strict; -use warnings; -use Mouse; -use AI::MXNet::NS; -use AI::MXNet::Base; -use AI::MXNet::Types; -use AI::MXNet::Function::Parameters; -use constant devtype2str => { 1 => 'cpu', 2 => 'gpu', 3 => 'cpu_pinned' }; -use constant devstr2type => { cpu => 1, gpu => 2, cpu_pinned => 3 }; -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - return $class->$orig(device_type => $_[0]) - if @_ == 1 and $_[0] =~ /^(?:cpu|gpu|cpu_pinned)$/; - return $class->$orig( - device_type => $_[0]->device_type, - device_id => $_[0]->device_id - ) if @_ == 1 and blessed $_[0]; - return $class->$orig(device_type => $_[0], device_id => $_[0]) - if @_ == 2 and $_[0] =~ /^(?:cpu|gpu|cpu_pinned)$/; - return $class->$orig(@_); -}; - -has 'device_type' => ( - is => 'rw', - isa => enum([qw[cpu gpu cpu_pinned]]), - default => 'cpu' -); - -has 'device_type_id' => ( - is => 'rw', - isa => enum([1, 2, 3]), - default => sub { devstr2type->{ shift->device_type } }, - lazy => 1 -); - -has 'device_id' => ( - is => 'rw', - isa => 'Int', - default => 0 -); - -use overload - '==' => sub { - my ($self, $other) = @_; - return 0 unless blessed($other) and $other->isa(__PACKAGE__); - return "$self" eq "$other"; - }, - '""' => sub { - my ($self) = @_; - return sprintf("%s(%s)", $self->device_type, $self->device_id); - }, - fallback => 1; -=head1 NAME - - AI::MXNet::Context - A device context. -=cut - -=head1 DESCRIPTION - - This class governs the device context of AI::MXNet::NDArray objects. -=cut - -=head1 SYNOPSIS - - use AI::MXNet qw(mx); - print nd->array([[1,2],[3,4]], ctx => mx->cpu)->aspdl; - my $arr_gpu = nd->random->uniform(shape => [10, 10], ctx => mx->gpu(0)); -=cut - -=head2 - - Constructing a context. - - Parameters - ---------- - device_type : {'cpu', 'gpu'} or Context. - String representing the device type - - device_id : int (default=0) - The device id of the device, needed for GPU -=cut - -=head2 cpu - - Returns a CPU context. - - Parameters - ---------- - device_id : int, optional - The device id of the device. device_id is not needed for CPU. - This is included to make interface compatible with GPU. - - Returns - ------- - context : AI::MXNet::Context - The corresponding CPU context. 
-=cut
-
-method cpu(Int $device_id=0)
-{
-    return $self->new(device_type => 'cpu', device_id => $device_id);
-}
-
-=head2 cpu_pinned
-
-    Returns a CPU pinned memory context. Copying from CPU pinned memory to GPU
-    is faster than from normal CPU memory.
-
-    Parameters
-    ----------
-    device_id : int, optional
-        The device id of the device. `device_id` is not needed for CPU.
-        This is included to make interface compatible with GPU.
-
-    Returns
-    -------
-    context : Context
-        The corresponding CPU pinned memory context.
-=cut
-
-method cpu_pinned(Int $device_id=0)
-{
-    return $self->new(device_type => 'cpu_pinned', device_id => $device_id);
-}
-
-=head2 gpu
-
-    Returns a GPU context.
-
-    Parameters
-    ----------
-    device_id : int, optional
-
-    Returns
-    -------
-    context : AI::MXNet::Context
-        The corresponding GPU context.
-=cut
-
-method gpu(Int $device_id=0)
-{
-    return $self->new(device_type => 'gpu', device_id => $device_id);
-}
-
-=head2 current_context
-
-    Returns the current context.
-
-    Returns
-    -------
-    $default_ctx : AI::MXNet::Context
-=cut
-
-
-=head2 num_gpus
-
-    Query CUDA for the number of GPUs present.
-
-    Raises
-    ------
-    Will raise an exception on any CUDA error.
-
-    Returns
-    -------
-    count : int
-        The number of GPUs.
-
-=cut
-
-method num_gpus()
-{
-    return scalar(check_call(AI::MXNetCAPI::GetGPUCount()));
-}
-
-=head2 gpu_memory_info
-
-    Query CUDA for the free and total bytes of GPU global memory.
-
-    Parameters
-    ----------
-    $device_id=0 : int, optional
-        The device id of the GPU device.
-
-    Raises
-    ------
-    Will raise an exception on any CUDA error.
-
-    Returns
-    -------
-    ($free, $total) : (int, int)
-        Free and total memory in bytes.
-=cut
-
-method gpu_memory_info($device_id=0)
-{
-    return check_call(AI::MXNetCAPI::GetGPUMemoryInformation64($device_id));
-}
-
-method current_ctx()
-{
-    return $AI::MXNet::Context;
-}
-
-method set_current(AI::MXNet::Context $current)
-{
-    $AI::MXNet::Context = $current;
-}
-
-*current_context = \&current_ctx;
-
-method deepcopy()
-{
-    return __PACKAGE__->new(
-        device_type => $self->device_type,
-        device_id => $self->device_id
-    );
-}
-
-__PACKAGE__->AI::MXNet::NS::register('AI::MXNet');
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Contrib.pm
deleted file mode 100644
index f9a99f706302..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib.pm
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::Contrib;
-use strict;
-use warnings;
-use AI::MXNet::NS;
-use AI::MXNet::Contrib::Symbol qw(sym symbol);
-use AI::MXNet::Contrib::NDArray qw(nd ndarray);
-
-=head1 NAME
-
-    AI::MXNet::Contrib - An interface to experimental operators defined in C++ space.
-=cut - -=head1 SYNOPSIS - - my $embed; - if($sparse_embedding) - { - my $embed_weight = mx->sym->Variable('embed_weight', stype=>'row_sparse'); - $embed = mx->sym->contrib->SparseEmbedding( - data=>$data, input_dim=>$num_words, - weight=>$embed_weight, output_dim=>$num_embed, - name=>'embed' - ); - } - else - { - $embed = mx->sym->Embedding( - data=>$data, input_dim=>$num_words, - output_dim=>$num_embed, name=>'embed' - ); - } -=cut - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/NDArray.pm deleted file mode 100644 index 83e7cbabce5a..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/NDArray.pm +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Contrib::NDArray; -use strict; -use warnings; -use AI::MXNet::NS; -use parent 'AI::MXNet::AutoLoad'; -sub config { ('contrib', 'AI::MXNet::NDArray') } - -=head1 NAME - - AI::MXNet::Contrib::NDArray - An interface to experimental NDArray operators defined in C++ space. -=cut - -=head1 SYNOPSIS - - mx->contrib->ndarray->fft(nd->random->normal(0, 1, [3, 4], ctx => mx->gpu)); -=cut - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/Symbol.pm deleted file mode 100644 index 03f4b90e3373..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Contrib/Symbol.pm +++ /dev/null @@ -1,51 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Contrib::Symbol; -use strict; -use warnings; -use AI::MXNet::NS; -use parent 'AI::MXNet::AutoLoad'; -sub config { ('contrib', 'AI::MXNet::Symbol') } - -=head1 NAME - - AI::MXNet::Contrib - An interface to experimental symbol operators defined in C++ space. 
-=cut
-
-=head1 SYNOPSIS
-
-    my $embed;
-    if($sparse_embedding)
-    {
-        my $embed_weight = mx->sym->Variable('embed_weight', stype=>'row_sparse');
-        $embed = mx->sym->contrib->SparseEmbedding(
-            data=>$data, input_dim=>$num_words,
-            weight=>$embed_weight, output_dim=>$num_embed,
-            name=>'embed'
-        );
-    }
-    else
-    {
-        $embed = mx->sym->Embedding(
-            data=>$data, input_dim=>$num_words,
-            output_dim=>$num_embed, name=>'embed'
-        );
-    }
-=cut
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/CudaModule.pm b/perl-package/AI-MXNet/lib/AI/MXNet/CudaModule.pm
deleted file mode 100644
index 67e6b60a0190..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/CudaModule.pm
+++ /dev/null
@@ -1,302 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::CudaModule;
-use strict;
-use warnings;
-use AI::MXNet::NS;
-use AI::MXNet::Base;
-use Mouse;
-use AI::MXNet::Function::Parameters;
-
-our %DTYPE_CPP_TO_STR = qw(
-    float    float32
-    double   float64
-    __half   float16
-    uint8_t  uint8
-    int      int32
-    int32_t  int32
-    int8_t   int8
-    char     int8
-    int64_t  int64
-);
-
-=head1 NAME
-
-    AI::MXNet::CudaModule - Interface to runtime cuda kernel compile module.
-=cut
-
-=head1 DESCRIPTION
-
-    Interface to runtime cuda kernel compile module.
-    Compile and run CUDA code from Perl.
-
-    In CUDA 7.5, you need to prepend your kernel definitions
-    with 'extern "C"' to avoid name mangling::
-
-        $source = '
-        extern "C" __global__ void axpy(const float *x, float *y, float alpha) {
-            int i = threadIdx.x + blockIdx.x * blockDim.x;
-            y[i] += alpha * x[i];
-        }
-        ';
-        $module = mx->rtc->CudaModule($source);
-        $func = $module->get_kernel("axpy", "const float *x, float *y, float alpha");
-        $x = mx->nd->ones([10], ctx=>mx->gpu(0));
-        $y = mx->nd->zeros([10], ctx=>mx->gpu(0));
-        $func->launch([$x, $y, 3.0], mx->gpu(0), [1, 1, 1], [10, 1, 1]);
-        print $y->aspdl;
-
-    Starting from CUDA 8.0, you can instead export functions by name.
-    This also allows you to use templates::
-
-        my $source = '
-        template<typename DType>
-        __global__ void axpy(const DType *x, DType *y, DType alpha) {
-            int i = threadIdx.x + blockIdx.x * blockDim.x;
-            y[i] += alpha * x[i];
-        }
-        ';
-        $module = mx->rtc->CudaModule($source, exports=>['axpy<float>', 'axpy<double>']);
-        $func32 = $module->get_kernel("axpy<float>", "const float *x, float *y, float alpha");
-        $x = mx->nd->ones([10], dtype=>'float32', ctx=>mx->gpu(0));
-        $y = mx->nd->zeros([10], dtype=>'float32', ctx=>mx->gpu(0));
-        $func32->launch([$x, $y, 3.0], mx->gpu(0), [1, 1, 1], [10, 1, 1]);
-        print $y->aspdl;
-
-        $func64 = $module->get_kernel("axpy<double>", "const double *x, double *y, double alpha");
-        $x = mx->nd->ones([10], dtype=>'float64', ctx=>mx->gpu(0));
-        $y = mx->nd->zeros([10], dtype=>'float64', ctx=>mx->gpu(0));
-        $func64->launch([$x, $y, 3.0], mx->gpu(0), [1, 1, 1], [10, 1, 1]);
-        print $y->aspdl;
-
-
-    Parameters
-    ----------
-    source : Str
-        Complete source code.
-    options : Str|ArrayRef[Str]
-        Compiler flags. For example, use "-I/usr/local/cuda/include" to
-        add cuda headers to include path.
-    exports : Str|ArrayRef[Str]
-        Export kernel names.
-=cut
-
-has 'source' => (is => 'rw', isa => 'Str', required => 1);
-has [qw/options exports/] => (is => 'rw', isa => 'Str|ArrayRef[Str]', default => sub { [] });
-has 'handle' => (is => 'rw', isa => 'CudaModuleHandle');
-around BUILDARGS => \&AI::MXNet::Base::process_arguments;
-method python_constructor_arguments() { ['source', 'options', 'exports'] }
-
-sub BUILD
-{
-    my $self = shift;
-    $self->options([$self->options]) unless ref $self->options;
-    $self->exports([$self->exports]) unless ref $self->exports;
-    my $handle = check_call(
-        AI::MXNetCAPI::RtcCudaModuleCreate(
-            $self->source,
-            scalar(@{ $self->options }),
-            $self->options,
-            scalar(@{ $self->exports }),
-            $self->exports
-        )
-    );
-    $self->handle($handle);
-}
-
-sub DEMOLISH
-{
-    check_call(AI::MXNetCAPI::RtcCudaModuleFree(shift->handle));
-}
-
-=head2 get_kernel
-
-    Get CUDA kernel from compiled module.
-
-    Parameters
-    ----------
-    $name : Str
-        String name of the kernel.
-    $signature : Str
-        Function signature for the kernel. For example, if a kernel is
-        declared as::
-
-            extern "C" __global__ void axpy(const float *x, double *y, int alpha)
-
-        Then its signature should be::
-
-            const float *x, double *y, int alpha
-
-        or::
-
-            const float *, double *, int
-
-        Note that `*` in signature marks an argument as array and
-        `const` marks an argument as constant (input) array.
-
-    Returns
-    -------
-    AI::MXNet::CudaKernel
-        CUDA kernels that can be launched on GPUs.
-=cut
-
-method get_kernel(Str $name, Str $signature)
-{
-    my @is_ndarray;
-    my @is_const;
-    my @dtypes;
-    my $pattern = qr/^\s*(const)?\s*([\w_]+)\s*(\*)?\s*([\w_]+)?\s*$/;
-    $signature =~ s/\s+/ /g;
-    my @args = split(/,/, $signature);
-    for my $arg (@args)
-    {
-        if(not $arg =~ $pattern or $2 eq 'const')
-        {
-            confess(
-                "Invalid function prototype \"$arg\". Must be in the ".
-                'form of "(const) type (*) (name)"'
-            );
-        }
-        push @is_const, $1 ? 1 : 0;
-        my $dtype = $2;
-        push @is_ndarray, $3 ? 1 : 0;
-        if(not exists $DTYPE_CPP_TO_STR{$dtype})
-        {
-            my $types = join(',', sort keys %DTYPE_CPP_TO_STR);
-            confess("Unsupported kernel argument type $arg. 
Supported types are: $types."); - } - push @dtypes, DTYPE_STR_TO_MX->{$DTYPE_CPP_TO_STR{$dtype}}; - } - - my $handle = check_call( - AI::MXNetCAPI::RtcCudaKernelCreate( - $self->handle, - $name, - scalar(@dtypes), - \@is_ndarray, - \@is_const, - \@dtypes - ) - ); - return AI::MXNet::CudaKernel->new($handle, $name, \@is_ndarray, \@dtypes); -} - -__PACKAGE__->AI::MXNet::NS::register('AI::MXNet'); - -package AI::MXNet::CudaKernel; -use Mouse; -use AI::MXNet::Base; - -=head1 NAME - - AI::MXNet::CudaKernel - Constructs CUDA kernel. -=cut - -=head1 DESCRIPTION - - Constructs CUDA kernel. - Intended to be created by calling AI::MXNet::CudaModule->get_kernel only. -=cut - -has [qw/handle name is_ndarray dtypes/] => (is => 'rw'); -around BUILDARGS => sub { - my ($orig, $class, $handle, $name, $is_ndarray, $dtypes) = @_; - return $class->$orig(handle => $handle, name => $name, is_ndarray => $is_ndarray, dtypes => $dtypes); -}; - -sub BUILD -{ - my $self = shift; - $self->dtypes([map { DTYPE_MX_TO_STR->{$_} } @{ $self->dtypes }]); -} - -sub DEMOLISH -{ - check_call(AI::MXNetCAPI::RtcCudaKernelFree(shift->handle)); -} - -=head2 launch - - Launch cuda kernel. - - Parameters - ---------- - $args : ArrayRef[AI::MXNet::NDArray|Num] - List of arguments for kernel. NDArrays are expected for pointer - types (e.g. `float*`, `double*`) while numbers are expected for - non-pointer types (e.g. `int`, `float`). - $ctx : AI::MXNet::Context - The context to launch kernel on. Must be GPU context. - $grid_dims : array ref of 3 integers (CudaKernelShape) - Grid dimensions for CUDA kernel. - $block_dims : array ref of 3 integers (CudaKernelShape) - Block dimensions for CUDA kernel. - $shared_mem=0 : integer, optional - Size of dynamically allocated shared memory. Defaults to 0. -=cut - -method launch( - ArrayRef[AI::MXNet::NDArray|Num] $args, - AI::MXNet::Context $ctx, - CudaKernelShape $grid_dims, - CudaKernelShape $block_dims, - Int $shared_mem=0 -) -{ - assert(($ctx->device_type eq 'gpu'), "Cuda kernel can only be launched on GPU"); - confess("CudaKernel(${\ $self->name }) expects ".scalar(@{$self->dtypes}). "arguments but got ".scalar(@$args).".") - unless (@{ $args } == @{ $self->dtypes }); - my @void_args; - enumerate(sub { - my ($i, $arg, $is_nd, $dtype) = @_; - if($is_nd) - { - confess("The $i-th argument is expected to be a NDArray but got [$arg]") - unless blessed $arg; - push @void_args, $arg->handle; - } - else - { - my $perl_pack_type = DTYPE_MX_TO_PERL->{$dtype}; - my $packed_arg; - ## special handling for float16 - if($perl_pack_type eq 'S') - { - $packed_arg = pack("S", AI::MXNetCAPI::_float_to_half($arg)); - } - else - { - $packed_arg = pack($perl_pack_type, $arg); - - } - push @void_args, $packed_arg; - } - }, $args, $self->is_ndarray, $self->dtypes); - check_call( - AI::MXNetCAPI::RtcCudaKernelCall( - $self->handle, - $ctx->device_id, - \@void_args, - @{ $grid_dims }, - @{ $block_dims }, - $shared_mem - ) - ); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Engine.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Engine.pm deleted file mode 100644 index e74be0230d7b..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Engine.pm +++ /dev/null @@ -1,104 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Engine; -use strict; -use warnings; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Base; -use AI::MXNet::NS; - -=head1 NAME - - AI::MXNet::Engine - Allows management of properties of the MXNet's engine. -=cut - -=head1 SYNOPSIS - - my $x; - mx->engine->bulk(10, sub { - $x = mx->nd->ones([10]); - $x *= 2; - $x += 1; - $x->wait_to_read(); - $x += 1; - ok(($x->aspdl == 4)->all); - for my $i (1..100) - { - $x += 1; - } - }); - ok(($x->aspdl == 104)->all); -=cut - -=head2 set_bulk_size - - Set size limit on bulk execution. - - Bulk execution bundles many operators to run together. - This can improve performance when running a lot of small - operators sequentially. - - Parameters - ---------- - $size : int - Maximum number of operators that can be bundled in a bulk. - - Returns - ------- - int - Previous bulk size. -=cut - -method set_bulk_size(Int $size) -{ - return scalar(check_call(AI::MXNetCAPI::EngineSetBulkSize($size))); -} - - -=head2 bulk - - Bulk execution bundles many operators to run together. - This can improve performance when running a lot of small - operators sequentially. - - Parameters - ---------- - $size : int - Maximum number of operators that can be bundled in a bulk. - $sub: CodeRef to execute - - my $x; - mx->engine->bulk(10, sub { - $x = mx->nd->zeros([1]); - for my $i (1..100) - { - $x += 1; - } - }); -=cut - -method bulk(Int $size, CodeRef $sub) -{ - my $prev = __PACKAGE__->set_bulk_size($size); - eval { $sub->() }; - my $err = $@; - __PACKAGE__->set_bulk_size($prev) unless $prev == $size; - Carp::confess($err) if $err; -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm deleted file mode 100644 index 5844302fce16..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm +++ /dev/null @@ -1,523 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
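For reference, the bulk() helper above reduces to a save-and-restore pattern around set_bulk_size. A minimal sketch of that pattern (assuming a working AI::MXNet install; the limit of 16 is arbitrary):

    use AI::MXNet qw(mx);
    my $prev = mx->engine->set_bulk_size(16);   # returns the previous limit
    my $x = mx->nd->zeros([8]);
    $x += 1 for 1 .. 4;     # small ops that the engine can now bundle
    $x->wait_to_read;       # force execution before restoring the limit
    mx->engine->set_bulk_size($prev);

Unlike bulk(), this inline version does not restore the previous size when one of the bundled statements dies, which is exactly why the removed method wrapped the body in an eval.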
- -package AI::MXNet::Executor; -use strict; -use warnings; -use AI::MXNet::Base; -use AI::MXNet::Context; -use Mouse; -use AI::MXNet::Types; -use AI::MXNet::Function::Parameters; - -has 'handle' => (is => 'ro', isa => 'ExecutorHandle', required => 1); -has 'arg_arrays' => (is => 'rw', isa => 'Maybe[ArrayRef[AI::MXNet::NDArray]]'); -has 'grad_arrays' => (is => 'rw', isa => 'Maybe[ArrayRef[Undef|AI::MXNet::NDArray]]'); -has 'aux_arrays' => (is => 'rw', isa => 'Maybe[ArrayRef[AI::MXNet::NDArray]]'); -has '_symbol' => (is => 'rw', init_arg => 'symbol', isa => 'AI::MXNet::Symbol'); -has '_ctx' => (is => 'rw', init_arg => 'ctx', isa => 'AI::MXNet::Context' ); -has '_grad_req' => (is => 'rw', init_arg => 'grad_req', isa => 'Maybe[Str|ArrayRef[Str]|HashRef[Str]]'); -has '_group2ctx' => (is => 'rw', init_arg => 'group2ctx', isa => 'Maybe[HashRef[AI::MXNet::Context]]'); -has [qw/_arg_dict - _grad_dict - _aux_dict - _output_dict - outputs - /] => (is => 'rw', init_arg => undef); -=head1 NAME - - AI::MXNet::Executor - The actual executing object of MXNet. -=cut - -=head1 SYNOPSIS - - my $executor = $sym->bind( - ctx => mx->Context('cpu'), - args => [$lhs_arr, $rhs_arr], - args_grad => [$lhs_grad, $rhs_grad] - ); - $executor->forward(1); - print $executor->outputs->[0]->aspdl; -=cut - -=head2 new - - Constructor, used by AI::MXNet::Symbol->bind and by AI::MXNet::Symbol->simple_bind. - - Parameters - ---------- - handle: ExecutorHandle - ExecutorHandle is generated by calling bind. - - See Also - -------- - AI::MXNet::Symbol->bind : how to create the AI::MXNet::Executor. -=cut - -sub BUILD -{ - my $self = shift; - my ($symbol, $ctx, $grad_req, $group2ctx) - = - ($self->_symbol, $self->_ctx, $self->_grad_req, $self->_group2ctx); - $symbol = $symbol->deepcopy; - $ctx = $ctx->deepcopy; - if(ref $grad_req) - { - if(ref $grad_req eq 'ARRAY') - { - $grad_req = [ @{ $grad_req }]; - } - elsif(ref $grad_req eq 'HASH') - { - $grad_req = { %{ $grad_req } }; - - } - } - if(ref $group2ctx) - { - $group2ctx = { %{ $group2ctx } }; - } - $self->_symbol($symbol); - $self->_ctx($ctx); - $self->_grad_req($grad_req); - $self->_group2ctx($group2ctx); - $self->outputs($self->_get_outputs); -} - -sub DEMOLISH -{ - check_call(AI::MXNetCAPI::ExecutorFree(shift->handle)); -} - -# Get the dictionary given name and ndarray pairs. -func _get_dict( - ArrayRef[Str] $names, - ArrayRef[Maybe[AI::MXNet::NDArray]] $ndarrays -) -{ - my %nset = (); - for my $nm (@{ $names }) - { - if(exists $nset{ $nm }) - { - confess("Duplicate names detected, @$names") - } - $nset{ $nm }++; - } - my %ret; - @ret{ @{ $names } } = @{ $ndarrays }; - return \%ret; -} - -=head2 outputs - - The output ndarrays bound to this executor. - - Returns - ------- - An array ref with AI::MXNet::NDArray objects bound to the heads of the executor. -=cut - -method _get_outputs() -{ - return [ - map { - AI::MXNet::NDArray->_ndarray_cls($_) - } - @{ check_call(AI::MXNetCAPI::ExecutorOutputs($self->handle)) } - ]; -} - -=head2 forward - - Calculate the outputs specified by the bound symbol. - - Parameters - ---------- - $is_train=0: Bool, optional - whether this forward is for evaluation purpose. If True, - a backward call is expected to follow. Otherwise following - backward is invalid. - - %kwargs - Additional specification of input arguments. 
-
-    Examples
-    --------
-    >>> # doing forward by specifying data
-    >>> $texec->forward(1, data => $mydata);
-    >>> # doing forward by not specifying things, but copy to the executor before hand
-    >>> $mydata->copyto($texec->arg_dict->{'data'});
-    >>> $texec->forward(1);
-    >>> # doing forward by specifying data and get outputs
-    >>> my $outputs = $texec->forward(1, data => $mydata);
-    >>> print $outputs->[0]->aspdl;
-=cut
-
-method forward(Int $is_train=0, %kwargs)
-{
-    if(%kwargs)
-    {
-        my $arg_dict = $self->arg_dict;
-        while (my ($name, $array) = each %kwargs)
-        {
-            if(not find_type_constraint('AcceptableInput')->check($array))
-            {
-                confess('only accept keyword argument of NDArrays/PDLs/Perl Array refs');
-            }
-            if(not exists $arg_dict->{ $name })
-            {
-                confess("unknown argument $name");
-            }
-            if(not blessed($array) or not $array->isa('AI::MXNet::NDArray'))
-            {
-                $array = AI::MXNet::NDArray->array($array);
-            }
-            if(join(',', @{ $arg_dict->{$name}->shape }) ne join(',', @{ $array->shape }))
-            {
-                my $expected = $arg_dict->{$name}->shape;
-                my $got = $array->shape;
-                confess("Shape mismatch! Argument $name, need: @$expected, received: @$got");
-            }
-            $arg_dict->{ $name } .= $array;
-        }
-    }
-    check_call(AI::MXNetCAPI::ExecutorForward(
-            $self->handle,
-            $is_train
-        )
-    );
-    return $self->outputs;
-}
-
-=head2 backward
-
-    Do a backward pass to get the gradient of the arguments.
-
-    Parameters
-    ----------
-    $out_grads : NDArray or an array ref of NDArrays or hash ref of NDArrays, optional.
-        The gradient on the outputs to be propagated back.
-        This parameter is only needed when bind is called
-        on outputs that are not a loss function.
-
-    $is_train : Bool, default 1
-        Whether this backward is for training or inference. Note that in rare
-        cases you want to call backward with is_train=0 to get gradient
-        during inference.
-=cut
-
-method backward(
-    Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|HashRef[AI::MXNet::NDArray]] $out_grads=,
-    Bool $is_train=1
-)
-{
-    $out_grads //= [];
-    if(blessed $out_grads)
-    {
-        $out_grads = [$out_grads];
-    }
-    elsif(ref $out_grads eq 'HASH')
-    {
-        $out_grads = [ @{ $out_grads }{ @{ $self->_symbol->list_outputs() } } ];
-    }
-    check_call(
-        AI::MXNetCAPI::ExecutorBackwardEx(
-            $self->handle,
-            scalar(@{ $out_grads }),
-            [map { $_->handle } @{ $out_grads }],
-            $is_train
-        )
-    );
-}
-
-=head2 set_monitor_callback
-
-    Install callback.
-
-    Parameters
-    ----------
-    $callback : CodeRef
-        Takes a string and an NDArrayHandle.
-=cut
-
-method set_monitor_callback(CodeRef $callback)
-{
-    check_call(
-        AI::MXNetCAPI::ExecutorSetMonitorCallback(
-            $self->handle,
-            $callback
        )
-    );
-}
-
-=head2 arg_dict
-
-    Get a hash ref representation of the argument arrays.
-
-    Returns
-    -------
-    $arg_dict : HashRef[AI::MXNet::NDArray]
-        The map that maps a name of the arguments to the NDArrays.
-=cut
-
-method arg_dict()
-{
-    if(not defined $self->_arg_dict)
-    {
-        $self->_arg_dict(_get_dict(
-                $self->_symbol->list_arguments(),
-                $self->arg_arrays
-            )
-        );
-    }
-    return $self->_arg_dict;
-}
-
-=head2 grad_dict
-
-    Get a hash ref representation of the gradient arrays.
-
-    Returns
-    -------
-    $grad_dict : HashRef[AI::MXNet::NDArray]
-        The map that maps a name of the arguments to the gradient NDArrays.
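For context, here is how the pieces above (bind, forward, backward, and the dict accessors) fit together; a self-contained sketch with a CPU context and hypothetical 2x3 shapes:

    use AI::MXNet qw(mx);
    my $a = mx->sym->Variable('a');
    my $b = mx->sym->Variable('b');
    my $c = $a * $b;
    my $a_grad = mx->nd->zeros([2, 3]);
    my $b_grad = mx->nd->zeros([2, 3]);
    my $exec = $c->bind(
        ctx       => mx->cpu,
        args      => [mx->nd->ones([2, 3]), mx->nd->ones([2, 3]) * 2],
        args_grad => [$a_grad, $b_grad],
    );
    my $out = $exec->forward(1)->[0];        # is_train => 1
    $exec->backward(mx->nd->ones([2, 3]));   # seed the output gradient
    print $a_grad->aspdl;                    # d(a*b)/da == b, i.e. all 2s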
-=cut - -method grad_dict() -{ - if(not defined $self->_grad_dict) - { - $self->_grad_dict(_get_dict( - $self->_symbol->list_arguments(), - $self->grad_arrays - ) - ); - } - return $self->_grad_dict; -} - -=head2 aux_dict - - Get a hash ref representation of the auxiliary states arrays. - - Returns - ------- - $aux_dict : HashRef[AI::MXNet::NDArray] - The map that maps a name of the auxiliary states to the NDArrays. -=cut - -method aux_dict() -{ - if(not defined $self->_aux_dict) - { - $self->_aux_dict(_get_dict( - $self->_symbol->list_auxiliary_states(), - $self->aux_arrays() - ) - ); - } - return $self->_aux_dict; -} - -=head2 output_dict - - Get a hash ref representation of the output arrays. - - Returns - ------- - $output_dict : HashRef[AI::MXNet::NDArray] - The map that maps a name of the outputs to the NDArrays. -=cut - -method output_dict() -{ - if(not defined $self->_output_dict) - { - $self->_output_dict(_get_dict( - $self->_symbol->list_outputs(), - $self->outputs - ) - ); - } - return $self->_output_dict; -} - -=head2 copy_params_from - - Copy parameters from arg_params, aux_params into the executor's internal array. - - Parameters - ---------- - $arg_params : HashRef[AI::MXNet::NDArray] - Parameters, hash ref of name to NDArray of arguments - - $aux_params= : Maybe[HashRef[AI::MXNet::NDArray]], optional - Parameters, hash ref of name to NDArray of auxiliary states. - - $allow_extra_params= : Bool, optional - Whether to allow extra parameters that are not needed by symbol - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. -=cut - -method copy_params_from( - HashRef[AI::MXNet::NDArray] $arg_params, - Maybe[HashRef[AI::MXNet::NDArray]] $aux_params=, - Maybe[Bool] $allow_extra_params= -) -{ - my %arg_dict = %{ $self->arg_dict }; - while (my ($name, $array) = each %{ $arg_params }) - { - if(exists $arg_dict{ $name }) - { - my $dst = $arg_dict{ $name }; - $array->astype($dst->dtype)->copyto($dst); - } - elsif(not $allow_extra_params) - { - confess("Found name \"$name\" that is not in the arguments"); - } - } - if(defined $aux_params) - { - my %aux_dict = %{ $self->aux_dict }; - while (my ($name, $array) = each %{ $aux_params }) - { - if(exists $aux_dict{ $name }) - { - my $dst = $aux_dict{ $name }; - $array->astype($dst->dtype)->copyto($dst); - } - elsif(not $allow_extra_params) - { - confess("Found name \"$name\" that is not in the arguments"); - } - } - } -} - -=head2 reshape - - Returns new executor with the same symbol and shared memory, - but different input/output shapes. - For runtime reshaping, variable length sequences, etc. - The returned executor shares state with the current one, - and cannot be used in parallel with it. - - Parameters - ---------- - $kwargs : HashRef[Shape] - new shape for arguments. - :$partial_shaping : Bool - Whether to allow changing the shape of unspecified arguments. - :$allow_up_sizing : Bool - Whether to allow allocating new ndarrays that's larger than the original. - - Returns - ------- - $exec : AI::MXNet::Executor - A new executor that shares memory with self. -=cut - - -method reshape(HashRef[Shape] $kwargs, Int :$partial_shaping=0, Int :$allow_up_sizing=0) -{ - my @provided_arg_shape_data; - # argument shape index in sdata, - # e.g. 
[sdata[indptr[0]], sdata[indptr[1]]) is the shape of the first arg - my @provided_arg_shape_idx = (0); - my @provided_arg_shape_names = (); # provided argument names - while(my ($k, $v) = each %{ $kwargs }) - { - if(ref $v eq 'ARRAY') - { - push @provided_arg_shape_names, $k; - push @provided_arg_shape_data, @{ $v }; - push @provided_arg_shape_idx, scalar(@provided_arg_shape_data); - } - } - - my @ctx_map_keys; - my @ctx_map_dev_types; - my @ctx_map_dev_ids; - - if(ref $self->_group2ctx eq 'HASH') - { - while(my ($k, $v) = each %{ $self->_group2ctx }) - { - push @ctx_map_keys, $k; - push @ctx_map_dev_types, $v->device_type_id; - push @ctx_map_dev_ids, $v->device_id; - } - } - - my $shared_handle = $self->handle; - - my ($in_args_and_grad_handles, $aux_state_handles, $handle) = check_call( - AI::MXNetCAPI::ExecutorReshapeEx( - $partial_shaping, - $allow_up_sizing, - $self->_ctx->device_type_id, - $self->_ctx->device_id, - scalar(@ctx_map_keys), - \@ctx_map_keys, - \@ctx_map_dev_types, - \@ctx_map_dev_ids, - scalar(@provided_arg_shape_names), - \@provided_arg_shape_names, - \@provided_arg_shape_data, - \@provided_arg_shape_idx, - $shared_handle - ) - ); - my ($in_args_handles, $arg_grad_handles) = @{ $in_args_and_grad_handles }; - my @arg_arrays = map { AI::MXNet::NDArray->_ndarray_cls($_) } @{ $in_args_handles }; - my @grad_arrays = map { defined($_) ? AI::MXNet::NDArray->_ndarray_cls($_) : undef } @{ $arg_grad_handles }; - my @aux_arrays = map { AI::MXNet::NDArray->_ndarray_cls($_) } @{ $aux_state_handles }; - - my $executor = __PACKAGE__->new( - handle => $handle, - symbol => $self->_symbol, - ctx => $self->_ctx, - grad_req => $self->_grad_req, - group2ctx => $self->_group2ctx - ); - $executor->arg_arrays(\@arg_arrays); - $executor->grad_arrays(\@grad_arrays); - $executor->aux_arrays(\@aux_arrays); - return $executor; -} - -=head2 debug_str - - A debug string about the internal execution plan. - - Returns - ------- - $debug_str : Str - Debug string of the executor. -=cut - -method debug_str() -{ - return scalar(check_call(AI::MXNetCAPI::ExecutorPrint($self->handle))); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Executor/Group.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Executor/Group.pm deleted file mode 100644 index 79f395fb7c5a..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Executor/Group.pm +++ /dev/null @@ -1,918 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Executor::Group; -use strict; -use warnings; -use Scalar::Util qw(blessed); -use List::Util qw(sum min); -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::Executor::Group - Manager for a group of executors working in different contexts. 
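The group implementation that follows begins by splitting each mini-batch across devices in proportion to a workload list. A worked sketch of that arithmetic (batch size and workloads here are illustrative only):

    # batch_size 30 split over two devices with workloads [1, 2]:
    # the rounded shares are 10 and 20, so the computed slices are
    # [0, 10) for the first context and [10, 30) for the second
    my $slices = AI::MXNet::Executor::Group::_split_input_slice(30, [1, 2]);
    # $slices is [[0, 10], [10, 30]]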
-=cut
-
-func _split_input_slice($batch_size, $work_load_list)
-{
-    my $total_work_load = sum(@{ $work_load_list });
-    my @batch_num_list = map { # perl does not have builtin round
-        int(($_ * $batch_size / $total_work_load) + 0.5)
-    } @{ $work_load_list };
-    my $batch_num_sum = sum(@batch_num_list);
-    my @slices;
-    if($batch_num_sum < $batch_size)
-    {
-        $batch_num_list[-1] += $batch_size - $batch_num_sum;
-    }
-    my $end = 0;
-    for my $batch_num (@batch_num_list)
-    {
-        my $begin = int(min($end, $batch_size));
-        $end = int(min($begin + $batch_num, $batch_size));
-        if($begin >= $end)
-        {
-            Carp::confess('Too many slices such that some splits are empty');
-        }
-        push @slices, [$begin, $end];
-    }
-    return \@slices;
-}
-
-# Load an array ref of arrays into an array ref of arrays specified by slices
-func _load_general($data, $targets, $major_axis)
-{
-    for(zip($data, $targets, $major_axis)) {
-        my ($d_src, $d_targets, $axis) = @$_;
-        if(blessed($d_targets) and $d_targets->isa('AI::MXNet::NDArray'))
-        {
-            $d_src->copyto($d_targets);
-        }
-        elsif(ref $d_targets eq 'ARRAY' and blessed $d_targets->[0])
-        {
-            for(zip($d_src, $d_targets)) {
-                my ($src, $dst) = @$_;
-                $src->copyto($dst);
-            }
-        }
-        else
-        {
-            for my $d (@{ $d_targets })
-            {
-                my ($slice_idx, $d_dst) = @{ $d };
-                if($axis >= 0)
-                {
-                    my $shape = $d_src->shape;
-                    my $do_crop = ($slice_idx->[0] != 0 or $shape->[$axis] != $slice_idx->[1]);
-                    if($do_crop)
-                    {
-                        if($axis == 0)
-                        {
-                            $d_src->slice([$slice_idx->[0], $slice_idx->[1] - 1])->copyto($d_dst);
-                        }
-                        else
-                        {
-                            if($d_src->context == $d_dst->context)
-                            {
-                                AI::MXNet::NDArray->slice_axis(
-                                    $d_src,
-                                    {
-                                        axis  => $axis,
-                                        begin => $slice_idx->[0],
-                                        end   => $slice_idx->[1],
-                                        out   => $d_dst
-                                    }
-                                );
-                            }
-                            else
-                            {
-                                my $d_dst_copy = AI::MXNet::NDArray->slice_axis(
-                                    $d_src,
-                                    {
-                                        axis  => $axis,
-                                        begin => $slice_idx->[0],
-                                        end   => $slice_idx->[1]
-                                    }
-                                );
-                                $d_dst_copy->copyto($d_dst);
-                            }
-                        }
-                    }
-                    else
-                    {
-                        $d_src->copyto($d_dst);
-                    }
-                }
-                else
-                {
-                    $d_src->copyto($d_dst);
-                }
-            }
-        }
-    }
-}
-
-# Load data into sliced arrays
-func _load_data($batch, $targets, $major_axis)
-{
-    _load_general($batch->data, $targets, $major_axis);
-}
-
-# Load label into sliced arrays
-func _load_label($batch, $targets, $major_axis)
-{
-    _load_general($batch->label, $targets, $major_axis);
-}
-
-# Merge outputs that live on multiple contexts into one, so that they look
-# like living on one context.
-func _merge_multi_context($outputs, $major_axis)
-{
-    my @rets;
-    for(zip($outputs, $major_axis)) {
-        my ($tensors, $axis) = @$_;
-        if($axis >= 0)
-        {
-            if(@$tensors == 1)
-            {
-                push @rets, $tensors->[0];
-            }
-            else
-            {
-                my $ctx = $tensors->[0]->context;
-                push @rets, AI::MXNet::NDArray->concat((map { $_->as_in_context($ctx) } @$tensors), { dim => $axis });
-            }
-        }
-        else
-        {
-            # negative axis means there is no batch_size axis, and all the
-            # results should be the same on each device.
We simply take the - # first one, without checking they are actually the same - push @rets, $tensors->[0]; - } - } - return \@rets; -} - -## TODO -## this class is here because of https://github.com/gfx/p5-Mouse/pull/67 -## once 2.4.7 version of Mouse in Ubuntu for affected Perl version -## these accessors should be merged into main class -package AI::MXNet::DataParallelExecutorGroup::_private; -use Mouse; -has [qw/output_layouts label_layouts arg_names aux_names - batch_size slices execs data_arrays - label_arrays param_arrays grad_arrays aux_arrays - data_layouts shared_data_arrays input_grad_arrays - _default_execs state_arrays/ - ] => (is => 'rw', init_arg => undef); - -package AI::MXNet::DataParallelExecutorGroup; -use Mouse; -use AI::MXNet::Base; -use List::Util qw(sum); - -=head1 DESCRIPTION - - DataParallelExecutorGroup is a group of executors that lives on a group of devices. - This is a helper class used to implement data parallelization. Each mini-batch will - be split and run on the devices. - - Parameters for constructor - ---------- - symbol : AI::MXNet::Symbol - The common symbolic computation graph for all executors. - contexts : ArrayRef[AI::MXNet::Context] - A array ref of contexts. - workload : ArrayRef[Num] - If not undef, could be an array ref of numbers that specify the workload to be assigned - to different context. Larger number indicate heavier workload. - data_shapes : ArrayRef[NameShape|AI::MXNet::DataDesc] - Should be a array ref of [name, shape] array refs, for the shapes of data. Note the order is - important and should be the same as the order that the `DataIter` provide the data. - label_shapes : Maybe[ArrayRef[NameShape|AI::MXNet::DataDesc]] - Should be a array ref of [$name, $shape] array refs, for the shapes of label. Note the order is - important and should be the same as the order that the `DataIter` provide the label. - param_names : ArrayRef[Str] - A array ref of strings, indicating the names of parameters (e.g. weights, filters, etc.) - in the computation graph. - for_training : Bool - Indicate whether the executors should be bind for training. When not doing training, - the memory for gradients will not be allocated. - inputs_need_grad : Bool - Indicate whether the gradients for the input data should be computed. This is currently - not used. It will be useful for implementing composition of modules. - shared_group : AI::MXNet::DataParallelExecutorGroup - Default is undef. This is used in bucketing. When not undef, it should be a executor - group corresponding to a different bucket. In other words, it will correspond to a different - symbol with the same set of parameters (e.g. unrolled RNNs with different lengths). - In this case the memory regions of the parameters will be shared. - logger : Logger - Default is AI::MXNet::Logging->get_logger. - fixed_param_names: Maybe[ArrayRef[Str]] - Indicate parameters to be fixed during training. Parameters in this array ref will not allocate - space for gradient, nor do gradient calculation. - grad_req : ArrayRef[GradReq]|HashRef[GradReq]|GradReq - Requirement for gradient accumulation. Can be 'write', 'add', or 'null' - (default to 'write'). - Can be specified globally (str) or for each argument (array ref, hash ref). 
- state_names: Maybe[ArrayRef[Str]] -=cut - -has 'symbol' => (is => 'ro', isa => 'AI::MXNet::Symbol', required => 1); -has 'contexts' => (is => 'ro', isa => 'ArrayRef[AI::MXNet::Context]', required => 1); -has 'workload' => (is => 'ro', isa => 'ArrayRef[Num]', default => sub { [] }); -has 'data_shapes' => (is => 'rw', isa => 'ArrayRef[NameShape|AI::MXNet::DataDesc]', required => 1); -has 'label_shapes' => (is => 'rw', isa => 'Maybe[ArrayRef[NameShape|AI::MXNet::DataDesc]]'); -has 'param_names' => (is => 'ro', isa => 'ArrayRef[Str]', required => 1); -has 'for_training' => (is => 'ro', isa => 'Bool', required => 1); -has 'inputs_need_grad' => (is => 'ro', isa => 'Bool', default => 0); -has 'shared_group' => (is => 'ro', isa => 'Maybe[AI::MXNet::DataParallelExecutorGroup]'); -has 'logger' => (is => 'ro', default => sub { AI::MXNet::Logging->get_logger }); -has 'fixed_param_names' => (is => 'rw', isa => 'Maybe[ArrayRef[Str]]'); -has 'state_names' => (is => 'rw', isa => 'Maybe[ArrayRef[Str]]'); -has 'grad_req' => (is => 'rw', isa => 'ArrayRef[GradReq]|HashRef[GradReq]|GradReq', default=>'write'); -has '_p' => (is => 'rw', init_arg => undef); -sub BUILD -{ - my $self = shift; - my $p = AI::MXNet::DataParallelExecutorGroup::_private->new; - $p->arg_names($self->symbol->list_arguments); - $p->aux_names($self->symbol->list_auxiliary_states); - $p->execs([]); - $self->_p($p); - $self->grad_req('null') if not $self->for_training; - $self->fixed_param_names([]) unless defined $self->fixed_param_names; - $self->state_names([]) unless defined $self->state_names; - my $data_shapes = []; - for my $d (@{ $self->data_shapes }) - { - $d = AI::MXNet::DataDesc->new(name => $d->[0], shape => $d->[1]) - unless blessed $d; - push @{ $data_shapes }, $d; - } - $self->data_shapes($data_shapes); - if(defined $self->label_shapes) - { - my $label_shapes = []; - for my $l (@{ $self->label_shapes }) - { - $l = AI::MXNet::DataDesc->new(name => $l->[0], shape => $l->[1]) - unless blessed $l; - push @{ $label_shapes }, $l; - } - $self->label_shapes($label_shapes); - } - my %data_names = map { $_->name => 1 } @{ $self->data_shapes }; - my %param_names = map { $_ => 1 } @{ $self->param_names }; - my %fixed_param_names = map { $_ => 1 } @{ $self->fixed_param_names }; - my %grad_req; - if(not ref $self->grad_req) - { - for my $k (@{ $self->_p->arg_names }) - { - if(exists $param_names{ $k }) - { - $grad_req{$k} = exists $fixed_param_names{ $k } ? 'null' : $self->grad_req; - } - elsif(exists $data_names{ $k }) - { - $grad_req{$k} = $self->inputs_need_grad ? $self->grad_req : 'null'; - } - else - { - $grad_req{$k} = 'null'; - } - } - } - elsif(ref $self->grad_req eq 'ARRAY') - { - @grad_req{ @{ $self->_p->arg_names } } = @{ $self->grad_req }; - } - else - { - for my $k (@{ $self->_p->arg_names }) - { - if(exists $param_names{ $k }) - { - $grad_req{$k} = exists $fixed_param_names{ $k } ? 'null' : 'write'; - } - elsif(exists $data_names{ $k }) - { - $grad_req{$k} = $self->inputs_need_grad ? 
'write' : 'null'; - } - else - { - $grad_req{$k} = 'null'; - } - } - %grad_req = (%grad_req, %{ $self->grad_req }); - } - $self->grad_req(\%grad_req); - if(defined $self->shared_group) - { - $self->_p->shared_data_arrays($self->shared_group->_p->shared_data_arrays); - } - else - { - $self->_p->shared_data_arrays([map { +{} } 0..@{ $self->contexts }-1]); - } - $self->_p->output_layouts([ - map { - AI::MXNet::DataDesc->get_batch_axis($self->symbol->slice($_)->attr('__layout__')) - } @{ $self->symbol->list_outputs } - ]); - $self->bind_exec($self->data_shapes, $self->label_shapes, $self->shared_group); -} - -=head2 decide_slices - - Decide the slices for each context according to the workload. - - Parameters - ---------- - $data_shapes : ArrayRef[AI::MXNet::DataDesc] -=cut - -method decide_slices(ArrayRef[AI::MXNet::DataDesc] $data_shapes) -{ - confess("empty data_shapes array") unless @{ $data_shapes } > 0; - my $major_axis = [map { AI::MXNet::DataDesc->get_batch_axis($_->layout) } @{ $data_shapes }]; - for(zip($data_shapes, $major_axis)) { - my ($desc, $axis) = @$_; - next if($axis == -1); - my $batch_size = $desc->shape->[$axis]; - if(defined $self->_p->batch_size) - { - confess( - "all data must have the same batch size: " - . sprintf("batch_size = %d, but ", $self->_p->batch_size) - . sprintf("%s has shape %s", $desc->name, '('. join(',', @{ $desc->shape }) . ')') - ) unless $batch_size == $self->_p->batch_size; - } - else - { - $self->_p->batch_size($batch_size); - $self->_p->slices(AI::MXNet::Executor::Group::_split_input_slice($self->_p->batch_size, $self->workload)); - } - } - return $major_axis; -} - -# Collect internal arrays from executors. -method _collect_arrays() -{ - # convenient data structures - $self->_p->data_arrays([]); - for my $d (@{ $self->data_shapes }) - { - my $name = $d->name; - my @tmp; - for my $i (0..@{ $self->_p->execs }-1) - { - push @tmp, [ $self->_p->slices->[$i], $self->_p->execs->[$i]->arg_dict->{$name} ]; - } - push @{ $self->_p->data_arrays }, \@tmp; - } - if(defined $self->label_shapes) - { - $self->_p->label_arrays([]); - for my $l (@{ $self->label_shapes }) - { - my $name = $l->name; - my @tmp; - for my $i (0..@{ $self->_p->execs }-1) - { - push @tmp, [ $self->_p->slices->[$i], $self->_p->execs->[$i]->arg_dict->{$name} ]; - } - push @{ $self->_p->label_arrays }, \@tmp; - } - } - $self->_p->param_arrays([]); - my %param_names = map { $_ => 1 } @{ $self->param_names }; - for my $i (0..@{ $self->_p->arg_names }-1) - { - my $name = $self->_p->arg_names->[$i]; - if(exists $param_names{$name}) - { - my @tmp; - for my $exec (@{ $self->_p->execs }) - { - push @tmp, $exec->arg_arrays->[$i]; - } - push @{ $self->_p->param_arrays }, \@tmp; - } - } - $self->_p->state_arrays([]); - for my $i (0..@{ $self->state_names }-1) - { - my $name = $self->state_names->[$i]; - my @tmp; - for my $exec (@{ $self->_p->execs }) - { - push @tmp, $exec->arg_dict->{$name}; - } - push @{ $self->_p->state_arrays }, \@tmp; - } - if($self->for_training) - { - $self->_p->grad_arrays([]); - for my $i (0..@{ $self->_p->arg_names }-1) - { - my $name = $self->_p->arg_names->[$i]; - if(exists $param_names{$name}) - { - my @tmp; - for my $exec (@{ $self->_p->execs }) - { - push @tmp, $exec->grad_arrays->[$i]; - } - push @{ $self->_p->grad_arrays }, \@tmp; - } - } - } - my @data_names = map { $_->name } @{ $self->data_shapes }; - my $j = 0; my %arg_names = map { $_ => $j++ } @{ $self->_p->arg_names }; - if($self->inputs_need_grad) - { - $self->_p->input_grad_arrays([]); - for my $name 
(@data_names) - { - next unless exists $arg_names{$name}; - my @tmp; - for my $exec (@{ $self->_p->execs }) - { - push @tmp, $exec->grad_arrays->[$arg_names{$name}]; - } - push @{ $self->_p->input_grad_arrays }, \@tmp; - } - } - $self->_p->aux_arrays([]); - for my $i (0..@{ $self->_p->aux_names }-1) - { - my @tmp; - for my $exec (@{ $self->_p->execs }) - { - push @tmp, $exec->aux_arrays->[$i]; - } - push @{ $self->_p->aux_arrays }, \@tmp; - } -} - -=head2 bind_exec - - Bind executors on their respective devices. - - Parameters - ---------- - $data_shapes : ArrayRef[AI::MXNet::DataDesc] - $label_shapes : Maybe[ArrayRef[AI::MXNet::DataDesc]] - $shared_group : Maybe[AI::MXNet::DataParallelExecutorGroup] - $reshape : Bool -=cut - -method bind_exec( - ArrayRef[AI::MXNet::DataDesc] $data_shapes, - Maybe[ArrayRef[AI::MXNet::DataDesc]] $label_shapes=, - Maybe[AI::MXNet::DataParallelExecutorGroup] $shared_group=, - Bool $reshape=0 -) -{ - assert($reshape or not @{ $self->_p->execs }); - $self->_p->batch_size(undef); - - # calculate workload and bind executors - $self->_p->data_layouts($self->decide_slices($data_shapes)); - # call it to make sure labels has the same batch size as data - if(defined $label_shapes) - { - $self->_p->label_layouts($self->decide_slices($label_shapes)); - } - - for my $i (0..@{ $self->contexts }-1) - { - my $data_shapes_i = $self->_sliced_shape($data_shapes, $i, $self->_p->data_layouts); - my $label_shapes_i = []; - if(defined $label_shapes) - { - $label_shapes_i = $self->_sliced_shape($label_shapes, $i, $self->_p->label_layouts); - } - if($reshape) - { - my %combined_hash = map { $_->name => $_->shape } (@{ $data_shapes_i }, @{ $label_shapes_i }); - $self->_p->execs->[$i] = $self->_p->_default_execs->[$i]->reshape( - \%combined_hash, - allow_up_sizing => 1, - ); - } - else - { - push @{ $self->_p->execs }, $self->_bind_ith_exec($i, $data_shapes_i, $label_shapes_i, $shared_group); - } - } - $self->data_shapes($data_shapes); - $self->label_shapes($label_shapes); - $self->_collect_arrays; -} - -=head2 reshape - - Reshape executors. - - Parameters - ---------- - $data_shapes : ArrayRef[AI::MXNet::DataDesc] - $label_shapes : Maybe[ArrayRef[AI::MXNet::DataDesc]] -=cut - - -method reshape( - ArrayRef[AI::MXNet::DataDesc] $data_shapes, - Maybe[ArrayRef[AI::MXNet::DataDesc]] $label_shapes= -) -{ - return if($data_shapes eq $self->data_shapes and $label_shapes eq $self->label_shapes); - if (not defined $self->_p->_default_execs) - { - $self->_p->_default_execs([@{ $self->_p->execs }]); - } - $self->bind_exec($data_shapes, $label_shapes, undef, 1); -} - -=head2 set_params - - Assign, i.e. copy parameters to all the executors. - - Parameters - ---------- - $arg_params : HashRef[AI::MXNet::NDArray] - A dictionary of name to AI::MXNet::NDArray parameter mapping. - $aux_params : HashRef[AI::MXNet::NDArray] - A dictionary of name to AI::MXNet::NDArray auxiliary variable mapping. -=cut - -method set_params(HashRef[AI::MXNet::NDArray] $arg_params, HashRef[AI::MXNet::NDArray] $aux_params, Bool $allow_extra=0) -{ - $_->copy_params_from($arg_params, $aux_params, $allow_extra) for @{ $self->_p->execs }; -} - -=head2 get_params - - Copy data from each executor to arg_params and aux_params. - - Parameters - ---------- - $arg_params : HashRef[AI::MXNet::NDArray] - target parameter arrays - $aux_params : HashRef[AI::MXNet::NDArray] - target aux arrays - - Notes - ----- - - This function will inplace update the NDArrays in arg_params and aux_params. 
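As a usage note, set_params and get_params typically bracket a stretch of training when checkpointing; a short sketch (an existing $group, $arg_params and $aux_params are assumed, and the file name is illustrative):

    $group->set_params($arg_params, $aux_params);  # push master weights to every device
    # ... run some training iterations ...
    $group->get_params($arg_params, $aux_params);  # average per-device copies back, in place
    AI::MXNet::NDArray->save('model-0001.params', $arg_params);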
-=cut - -method get_params(HashRef[AI::MXNet::NDArray] $arg_params, HashRef[AI::MXNet::NDArray] $aux_params) -{ - my $weight = 0; - for(zip($self->param_names, $self->_p->param_arrays)) { - my ($name, $block) = @$_; - my $weight = sum(map { $_->copyto(AI::MXNet::Context->cpu) } @{ $block }) / @{ $block }; - $weight->astype($arg_params->{$name}->dtype)->copyto($arg_params->{$name}); - } - for(zip($self->_p->aux_names, $self->_p->aux_arrays)) { - my ($name, $block) = @$_; - my $weight = sum(map { $_->copyto(AI::MXNet::Context->cpu) } @{ $block }) / @{ $block }; - $weight->astype($aux_params->{$name}->dtype)->copyto($aux_params->{$name}); - } -} - - - -method get_states($merge_multi_context=1) -{ - assert((not $merge_multi_context), "merge_multi_context=True is not supported for get_states yet."); - return $self->_p->state_arrays; -} - -method set_states($states, $value) -{ - if(defined $states) - { - assert((not defined $value), "Only one of states & value can be specified."); - AI::MXNet::Executor::Group::_load_general($states, $self->_p->state_arrays, [(0)x@{ $states }]); - } - else - { - assert((defined $value), "At least one of states & value must be specified."); - assert((not defined $states), "Only one of states & value can be specified."); - for my $d_dst (@{ $self->_p->state_arrays }) - { - for my $dst (@{ $d_dst }) - { - $dst .= $value; - } - } - } -} - -=head2 forward - - Split the data_batch according to a workload and run forward on each devices. - - Parameters - ---------- - data_batch : AI::MXNet::DataBatch - Or could be any object implementing similar interface. - - is_train : bool - The hint for the backend, indicating whether we are during training phase. - Default is undef, then the value $self->for_training will be used. -=cut - - -method forward(AI::MXNet::DataBatch $data_batch, Maybe[Bool] $is_train=) -{ - AI::MXNet::Executor::Group::_load_data($data_batch, $self->_p->data_arrays, $self->_p->data_layouts); - $is_train //= $self->for_training; - if(defined $self->_p->label_arrays) - { - confess("assert not is_train or data_batch.label") - unless (not $is_train or $data_batch->label); - if($data_batch->label) - { - AI::MXNet::Executor::Group::_load_label($data_batch, $self->_p->label_arrays, $self->_p->label_layouts); - } - } - $_->forward($is_train) for @{ $self->_p->execs }; -} - -# Get the shapes of the outputs - -method get_output_shapes() -{ - my @shapes = map { $_->shape } @{ $self->execs->[0]->outputs }; - my @concat_shapes; - for(zip($self->symbol->list_outputs, \@shapes, $self->_p->output_layouts)) { - my ($key, $shape, $axis) = @$_; - my @the_shape = @{ $shape }; - if($axis >= 0) - { - $the_shape[$axis] = $self->_p->batch_size; - } - push @concat_shapes, AI::MXNet::DataDesc->new(name => $key, shape => \@the_shape); - } - return \@concat_shapes; -} - -=head2 get_outputs - - Gets outputs of the previous forward computation. - - Parameters - ---------- - merge_multi_context : bool - Default is 1. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A 1 value indicates that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - If merge_multi_context is 1, it is [$out1, $out2]. Otherwise, it - is [[$out1_dev1, $out1_dev2], [$out2_dev1, $out2_dev2]]. All the output - elements are `AI::MXNet::NDArray`. 
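To make the merge behaviour concrete, a sketch with two hypothetical devices and a single output; the manual concat mirrors what _merge_multi_context does along the batch axis:

    my $merged  = $group->get_outputs(1);  # [$out] - one NDArray per output
    my $per_dev = $group->get_outputs(0);  # [[$out_dev0, $out_dev1]]
    my $manual  = AI::MXNet::NDArray->concat(
        (map { $_->as_in_context(mx->cpu) } @{ $per_dev->[0] }),
        { dim => 0 }   # assuming axis 0 is the batch (major) axis
    );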
-=cut - -method get_outputs(Bool $merge_multi_context=1) -{ - my $outputs; - for my $i (0..@{ $self->_p->execs->[0]->outputs }-1) - { - my @tmp; - for my $exec (@{ $self->_p->execs }) - { - push @tmp, $exec->outputs->[$i]; - } - push @$outputs, \@tmp; - } - if($merge_multi_context) - { - $outputs = AI::MXNet::Executor::Group::_merge_multi_context($outputs, $self->_p->output_layouts); - } - return $outputs; -} - -=head2 get_input_grads - - Get the gradients with respect to the inputs of the module. - - Parameters - ---------- - merge_multi_context : bool - Default is 1. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A 1 value indicates that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - If merge_multi_context is 1, it is [$grad1, $grad2]. Otherwise, it - is [[$grad1_dev1, $grad1_dev2], [$grad2_dev1, $grad2_dev2]]. All the output - elements are AI::MXNet::NDArray. -=cut - -method get_input_grads(Bool $merge_multi_context=1) -{ - confess("assert \$self->inputs_need_grad") unless $self->inputs_need_grad; - if($merge_multi_context) - { - return AI::MXNet::Executor::Group::_merge_multi_context($self->_p->input_grad_arrays, $self->_p->data_layouts); - } - return $self->_p->input_grad_arrays; -} - -=head2 backward - - Run backward on all devices. A backward should be called after - a call to the forward function. Backward cannot be called unless - $self->for_training is 1. - - Parameters - ---------- - out_grads : NDArray or array ref of NDArray, optional - Gradient on the outputs to be propagated back. - This parameter is only needed when bind is called - on outputs that are not a loss function. -=cut - -method backward(Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]] $out_grads=) -{ - confess('re-bind with for_training=1 to run backward') unless $self->for_training; - $out_grads //= []; - for(zip([0..@{ $self->_p->execs }-1], $self->_p->execs, $self->_p->slices)) { - my ($i, $exec, $islice) = @$_; - my @out_grads_slice; - for(zip($out_grads, $self->_p->output_layouts)) { - my ($grad, $axis) = @$_; - if($axis >= 0) - { - my $og_my_slice = $grad->slice_axis({ - axis => $axis, - begin => $islice->[0], - end => $islice->[1] - }); - push @out_grads_slice, $og_my_slice->as_in_context($self->contexts->[$i]); - } - else - { - push @out_grads_slice, $grad->copyto($self->contexts->[$i]); - } - } - $exec->backward(\@out_grads_slice); - } -} - -=head2 update_metric - - Accumulate the performance according to eval_metric on all devices. - - Parameters - ---------- - eval_metric : AI::MXNet::EvalMetric - The metric used for evaluation. - labels : array ref of NDArray - Typically comes from label of AI::MXNet::DataBatch. 
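A sketch of update_metric inside an evaluation loop (the metric helper and the <> iteration over the data iterator are assumed to be available, as in the AI::MXNet examples):

    my $metric = mx->metric->create('acc');
    while(my $batch = <$data_iter>)
    {
        $group->forward($batch, 0);                    # is_train => 0
        $group->update_metric($metric, $batch->label);
    }
    my ($name, $value) = $metric->get;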
-=cut - -method update_metric(AI::MXNet::EvalMetric $eval_metric, ArrayRef[AI::MXNet::NDArray] $labels) -{ - for(zip($self->_p->execs, $self->_p->slices)) { - my ($texec, $islice) = @$_; - my @labels_slice; - for(zip($labels, $self->_p->label_layouts)) { - my ($label, $axis) = @$_; - if($axis == 0) - { - # slicing NDArray along axis 0 can avoid copying - push @labels_slice, $label->slice([$islice->[0], $islice->[1]-1]); - } - elsif($axis > 0) - { - my $label_my_slice = $label->slice_axis({ - axis => $axis, - begin => $islice->[0], - end => $islice->[1] - })->as_in_context($label->context); - push @labels_slice, $label_my_slice; - } - else - { - push @labels_slice, $label; - } - } - $eval_metric->update(\@labels_slice, $texec->outputs); - } -} - -method _bind_ith_exec( - Int $i, - ArrayRef[AI::MXNet::DataDesc] $data_shapes, - Maybe[ArrayRef[AI::MXNet::DataDesc]] $label_shapes, - Maybe[AI::MXNet::DataParallelExecutorGroup] $shared_group -) -{ - my $shared_exec = $shared_group ? $shared_group->_p->execs->[$i] : undef; - my $context = $self->contexts->[$i]; - my $shared_data_arrays = $self->_p->shared_data_arrays->[$i]; - my %input_shapes = map { $_->name => $_->shape } @{ $data_shapes }; - my %input_types = map { $_->name => $_->dtype } @{ $data_shapes }; - if(defined $label_shapes) - { - %input_shapes = (%input_shapes, map { $_->name => $_->shape } @{ $label_shapes }); - %input_types = (%input_types, map { $_->name => $_->dtype } @{ $label_shapes }); - } - my $executor = $self->symbol->simple_bind( - ctx => $context, - grad_req => $self->grad_req, - type_dict => \%input_types, - shared_arg_names => $self->param_names, - shared_exec => $shared_exec, - shared_buffer => $shared_data_arrays, - shapes => \%input_shapes - ); - return $executor; -} - -=head2 _sliced_shape - - Get the sliced shapes for the i-th executor. - - Parameters - ---------- - shapes : array ref of (str, array ref) - The original (name, shape) pairs. - i : int - Which executor we are dealing with. -=cut - -method _sliced_shape(ArrayRef[AI::MXNet::DataDesc] $shapes, Int $i, ArrayRef[Int] $major_axis) -{ - my @sliced_shapes; - for(zip($shapes, $major_axis)) { - my ($desc, $axis) = @$_; - my @shape = @{ $desc->shape }; - if($axis >= 0) - { - $shape[$axis] = $self->_p->slices->[$i]->[1] - $self->_p->slices->[$i]->[0]; - } - push @sliced_shapes, AI::MXNet::DataDesc->new( - name => $desc->name, - shape => \@shape, - dtype => $desc->dtype, - layout => $desc->layout - ); - } - return \@sliced_shapes; -} - -=head2 install_monitor - - Install monitor on all executors - - Parameters - ---------- - $mon : AI::MXNet::Monitor -=cut - -method install_monitor(AI::MXNet::Monitor $mon) -{ - $mon->install($_) for @{ $self->_p->execs }; -} - -method shared_data_arrays() -{ - $self->_p->shared_data_arrays; -} - -method execs() -{ - $self->_p->execs; -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Function/Parameters.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Function/Parameters.pm deleted file mode 100644 index e4bbc90ca0a3..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Function/Parameters.pm +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Function::Parameters; -use strict; -use warnings; -use Function::Parameters (); -use AI::MXNet::Types (); -sub import { - Function::Parameters->import( - { - func => { - defaults => 'function_strict', - runtime => 1, - reify_type => sub { - Mouse::Util::TypeConstraints::find_or_create_isa_type_constraint($_[0]) - } - }, - method => { - defaults => 'method_strict', - runtime => 1, - reify_type => sub { - Mouse::Util::TypeConstraints::find_or_create_isa_type_constraint($_[0]) - } - }, - } - ); -} - -{ - no warnings 'redefine'; - *Function::Parameters::_croak = sub { - local($Carp::CarpLevel) = 1; - Carp::confess ("@_"); - }; -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon.pm deleted file mode 100644 index a2dd64e926a5..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon.pm +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon; -use strict; -use warnings; -use AI::MXNet::NS 'global'; -use AI::MXNet::Gluon::Loss 'loss'; -use AI::MXNet::Gluon::Trainer; -use AI::MXNet::Gluon::Utils; -use AI::MXNet::Gluon::Data 'data'; -use AI::MXNet::Gluon::NN 'nn'; -use AI::MXNet::Gluon::RNN 'rnn'; - -sub utils { 'AI::MXNet::Gluon::Utils' } -sub model_zoo { require AI::MXNet::Gluon::ModelZoo; 'AI::MXNet::Gluon::ModelZoo' } - -=head1 NAME - - AI::MXNet::Gluon - High-level interface for MXNet. -=cut - -=head1 DESCRIPTION - - The AI::MXNet::Gluon package is a high-level interface for MXNet designed to be easy to use, - while keeping most of the flexibility of a low level API. - AI::MXNet::Gluon supports both imperative and symbolic programming, - making it easy to train complex models imperatively in Perl. - - Based on the Gluon API specification, - the Gluon API in Apache MXNet provides a clear, concise, and simple API for deep learning. - It makes it easy to prototype, build, and train deep learning models without sacrificing training speed. - - Advantages. - - Simple, Easy-to-Understand Code: Gluon offers a full set of plug-and-play neural network building blocks, - including predefined layers, optimizers, and initializers. 
-    - Flexible, Imperative Structure: Gluon does not require the neural network model to be rigidly defined,
-    but rather brings the training algorithm and model closer together to provide flexibility in the development process.
-    - Dynamic Graphs: Gluon enables developers to define neural network models that are dynamic,
-    meaning they can be built on the fly, with any structure, and using any of Perl's native control flow.
-    - High Performance: Gluon provides all of the above benefits without impacting the training speed that the underlying engine provides.
-
-
-    Simple, Easy-to-Understand Code.
-
-    Use plug-and-play neural network building blocks, including predefined layers, optimizers, and initializers:
-
-    use AI::MXNet qw(mx);
-    use AI::MXNet::Gluon qw(gluon);
-
-    my $net = gluon->nn->Sequential;
-    # When instantiated, Sequential stores a chain of neural network layers.
-    # Once presented with data, Sequential executes each layer in turn, using
-    # the output of one layer as the input for the next
-    $net->name_scope(sub {
-        $net->add(gluon->nn->Dense(256, activation=>"relu")); # 1st layer (256 nodes)
-        $net->add(gluon->nn->Dense(256, activation=>"relu")); # 2nd hidden layer
-        $net->add(gluon->nn->Dense($num_outputs));
-    });
-
-    Flexible, Imperative Structure.
-
-    Prototype, build, and train neural networks in a fully imperative manner using the AI::MXNet::MXNet package and the Gluon trainer method:
-
-    use AI::MXNet::Base; # provides helpers, such as zip, enumerate, etc.
-    use AI::MXNet::AutoGrad qw(autograd);
-    my $epochs = 10;
-
-    for(1..$epochs)
-    {
-        for(zip($train_data))
-        {
-            my ($data, $label) = @$_;
-            autograd->record(sub {
-                my $output = $net->($data); # the forward iteration
-                my $loss = gluon->loss->softmax_cross_entropy($output, $label);
-                $loss->backward;
-            });
-            $trainer->step($data->shape->[0]); ## batch size
-        }
-    }
-
-    Dynamic Graphs.
-
-    Build neural networks on the fly for use cases where neural networks must change in size and shape during model training:
-
-    use AI::MXNet::Function::Parameters;
-
-    method forward(GluonClass $F, GluonInput $inputs, GluonInput :$tree)
-    {
-        my $children_outputs = [
-            map { $self->forward($F, $inputs, $_) } @{ $tree->children }
-        ];
-        # Recursively builds the neural network based on each input sentence's
-        # syntactic structure during the model definition and training process
-        ...
-    }
-
-    High Performance.
-
-    Easily cache the neural network to achieve high performance by defining your neural network with HybridSequential
-    and calling the hybridize method:
-
-    use AI::MXNet::Gluon::NN qw(nn);
-
-    my $net = nn->HybridSequential;
-    $net->name_scope(sub {
-        $net->add(nn->Dense(256, activation=>"relu"));
-        $net->add(nn->Dense(128, activation=>"relu"));
-        $net->add(nn->Dense(2));
-    });
-
-    $net->hybridize();
-    See more at L
-=cut
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Block.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Block.pm
deleted file mode 100644
index 29cacffcdd05..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Block.pm
+++ /dev/null
@@ -1,1590 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.
You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-# Scope for collecting child 'Block's
-use strict;
-use warnings;
-use AI::MXNet::Gluon::Parameter;
-package AI::MXNet::Gluon::BlockScope;
-use AI::MXNet::Function::Parameters;
-my $_current;
-use Mouse;
-has '_block' => (is => 'ro', init_arg => 'block', weak_ref => 1);
-has [qw/_counter _old_scope
-        _name_scope/] => (is => 'rw', init_arg => undef);
-
-sub BUILD
-{
-    my $self = shift;
-    $self->_counter({});
-}
-
-# Creates prefix and params for new Block.
-method create($prefix, $params, $hint)
-{
-    my $current = $_current;
-    if(not defined $current)
-    {
-        if(not defined $prefix)
-        {
-            $prefix = AI::MXNet::Symbol::NameManager->current->get(undef, $hint) . '_';
-        }
-        if(not defined $params)
-        {
-            $params = AI::MXNet::Gluon::ParameterDict->new(prefix => $prefix);
-        }
-        else
-        {
-            $params = AI::MXNet::Gluon::ParameterDict->new(prefix => $params->prefix, shared => $params);
-        }
-        return ($prefix, $params);
-    }
-
-    if(not defined $prefix)
-    {
-        my $count = $current->_counter->{ $hint } // 0;
-        $prefix = sprintf('%s%d_', $hint, $count);
-        $current->_counter->{$hint} = $count + 1;
-    }
-    if(not defined $params)
-    {
-        my $parent = $current->_block->params;
-        $params = AI::MXNet::Gluon::ParameterDict->new(prefix => $parent->prefix.$prefix, shared => $parent->_shared);
-    }
-    else
-    {
-        $params = AI::MXNet::Gluon::ParameterDict->new(prefix => $params->prefix, shared => $params);
-    }
-    return ($current->_block->prefix.$prefix, $params);
-}
-
-method __enter__()
-{
-    return $self if $self->_block->_empty_prefix;
-    $self->_old_scope($_current);
-    $_current = $self;
-    $self->_name_scope(AI::MXNet::Symbol::NameManager->current);
-    AI::MXNet::Symbol::NameManager->set_current(AI::MXNet::Symbol::Prefix->new(prefix => $self->_block->prefix));
-    return $self;
-}
-
-method __exit__()
-{
-    return if $self->_block->_empty_prefix;
-    AI::MXNet::Symbol::NameManager->set_current($self->_name_scope);
-    $self->_name_scope(undef);
-    $_current = $self->_old_scope;
-}
-
-package AI::MXNet::Gluon::Block;
-use AI::MXNet::Gluon::Mouse;
-use Scalar::Util qw(refaddr);
-
-=head2 NAME
-
-    AI::MXNet::Gluon::Block - Base class for all neural network layers and models.
-
-=head2 DESCRIPTION
-
-    Base class for all neural network layers and models. Your models should
-    subclass this class.
-
-    AI::MXNet::Gluon::Block can be nested recursively in a tree structure. You can create and
-    assign child AI::MXNet::Gluon::Blocks as regular attributes:
-
-    use AI::MXNet::Gluon::NN qw(nn);
-    use AI::MXNet qw(mx);
-
-    package Model;
-    use AI::MXNet::Gluon::Mouse;
-    use AI::MXNet::Function::Parameters;
-    extends 'AI::MXNet::Gluon::Block';
-
-    sub BUILD
-    {
-        my $self = shift;
-        $self->name_scope(sub {
-            $self->dense0(nn->Dense(5, in_units=>5));
-            $self->dense1(nn->Dense(5, in_units=>5));
-        });
-    }
-
-    method forward($x)
-    {
-        return $self->dense1->($self->dense0->($x));
-    }
-
-    my $model = Model->new();
-    $model->initialize(ctx=>mx->cpu(0));
-    $model->(mx->nd->zeros([10, 10], ctx=>mx->cpu(0)));
-
-
-    Child AI::MXNet::Gluon::Blocks assigned this way will be registered, and ->collect_params
-    will collect their Parameters recursively.
- - Parameters - ---------- - Prefix acts like a name space. All children blocks created in parent block's - name_scope will have parent block's prefix in their name. - Please refer to - naming tutorial https://mxnet.apache.org/api/python/docs/tutorials/packages/gluon/naming.html - for more info on prefix and naming. - - params : AI::MXNet::Gluon::ParameterDict or undef - AI::MXNet::Gluon::ParameterDict for sharing weights with the new AI::MXNet::Gluon::Block. For example, - if you want `dense1` to share `dense0`'s weights, you can do - - $dense0 = nn->Dense(20); - $dense1 = nn->Dense(20, params=>dense0->collect_params()); -=cut - -method _flatten( - $args -) -{ - if(blessed $args and $args->isa('AI::MXNet::NDArray')) - { - return ([$args], 0); - } - elsif(blessed $args and $args->isa('AI::MXNet::Symbol')) - { - my $length = @{ $args->list_outputs() }; - $length = $length > 1 ? $length : 0; - return ([$args], $length) - } - my @flat; - my @fmts; - for my $i (@{ $args }) - { - my ($arg, $fmt) = __PACKAGE__->_flatten($i); - push @flat, @{ $arg }; - push @fmts, $fmt; - } - return (\@flat, \@fmts); -} - -method _regroup( - $args, $fmt -) -{ - my $in_symbol = (blessed $args and $args->isa('AI::MXNet::Symbol')); - my @ret; - if(not ref $fmt) - { - my $len = @{$args} - 1; - if($fmt == 0) - { - @ret = ([@{$args}[1..$len]]); - if($in_symbol) - { - $ret[0] = AI::MXNet::Symbol->Group($ret[0]); - } - return (@{$args}[0], $ret[0]); - } - @ret = ([@{$args}[0..$fmt-1]], [@{$args}[$fmt..$len]]); - if($in_symbol) - { - @ret = map { AI::MXNet::Symbol->Group($_) } @ret; - } - return @ret; - } - for my $i (@{ $fmt }) - { - my $res; - ($res, $args) = __PACKAGE__->_regroup($args, $i); - push @ret, $res; - } - return (\@ret, $args); -} - -has _prefix => (is => 'rw', init_arg => 'prefix', isa => 'Str'); -has _params => (is => 'rw', init_arg => 'params', isa => 'Maybe[AI::MXNet::Gluon::ParameterDict]'); -has [qw/_name _scope _empty_prefix/] => (is => 'rw', init_arg => undef); -has [qw/_children _forward_hooks _forward_pre_hooks/] => (is => 'rw', init_arg => undef, default => sub { Hash::Ordered->new }); -has '_reg_params' => (is => 'rw', init_arg => undef, default => sub { +{} }); -around BUILDARGS => \&AI::MXNet::Base::process_arguments; - -sub AUTOLOAD { - my $name = $AI::MXNet::Gluon::Block::AUTOLOAD; - $name =~ s/.*:://; - my $self = shift; - AI::MXNet::Gluon::Mouse::has($name => (is => 'rw', 'init_arg' => undef, 'caller' => ref $self)); - $self->$name(@_); -} - -sub BUILD -{ - my $self = shift; - $self->_empty_prefix(defined $self->_prefix and $self->_prefix eq ''); - my ($prefix, $params) = AI::MXNet::Gluon::BlockScope->create($self->_prefix, $self->_params, $self->_alias); - $self->_prefix($prefix); - $self->_params($params); - my $name = $prefix; - $name =~ s/_$//; - $self->_name($name); - $self->_scope(AI::MXNet::Gluon::BlockScope->new(block => $self)); -} - -method _class_name() -{ - my $class = ref $self || $self; - $class =~ s/^.+:://; - $class; -} - -method __setattr__($name, $current, $prev=) -{ - if(defined $prev) - { - if( - ( - blessed $prev - and - ($prev->isa('AI::MXNet::Gluon::Parameter') or $prev->isa('AI::MXNet::Gluon::Block')) - ) - and not (blessed $current and (ref($prev) eq ref($current))) - ) - { - confess( - sprintf( - "Changing attribute type for %s from %s to %s is not allowed.", - $self->name, - ref($prev), - ref($current)||'no ref' - ) - ); - } - } - if(blessed $current and $current->isa('AI::MXNet::Gluon::Block')) - { - $self->register_child($current, $name); - } - elsif(blessed 
$current and $current->isa('AI::MXNet::Gluon::Parameter')) - { - if(exists $self->_reg_params->{ $name }) - { - confess("Overriding Parameter attribute $name is not allowed. ". - "If you want to share parameters between blocks, please set ". - "'params' at Block construction instead." - ); - } - $self->_reg_params->{ $name } = $current; - } -} - -method _check_container_with_block() -{ - my $_find_unregistered_block_in_container; - my %children = map { refaddr($_) => 1 } $self->_children->values; - $_find_unregistered_block_in_container = sub { my ($data) = @_; - # Find whether a nested container structure contains Blocks - if(ref $data eq 'ARRAY') - { - for my $ele (@{ $data }) - { - if($_find_unregistered_block_in_container->($ele)) - { - return 1 - } - } - return 0; - } - elsif(ref $data eq 'HASH') - { - for my $v (values %$data) - { - if($_find_unregistered_block_in_container->($v)) - { - return 1; - } - } - return 0; - } - elsif(blessed $data and $data->isa('AI::MXNet::Gluon::Block')) - { - return not exists $children{ refaddr($data) }; - } - else - { - return 0; - } - }; - my $attributes_hash = $self->attributes_hash(); - while(my ($k, $v) = each %{ $attributes_hash }) - { - if((ref $v eq 'HASH' or ref $v eq 'ARRAY') and not $k =~ /^__/) - { - if($_find_unregistered_block_in_container->($v)) - { - AI::MXNet::Logging->warning( - '"%s" is an unregistered container with Blocks. '. - 'Note that Blocks inside an array ref or hash ref will not be '. - 'registered automatically. Make sure to register them using '. - 'register_child() or by switching to '. - 'nn->Sequential/nn->HybridSequential instead. ', - $self->_class_name.'.'.$k - ); - } - } - } -} - -method _alias() -{ - lc $self->_class_name; -} - -method attributes_hash() -{ - +{ map { $_ => $self->$_ } $self->meta->get_attribute_list }; -} - -use overload - '""' => sub - { - my $self = shift; - my @blocks; - my %attributes_hash = %{ $self->attributes_hash }; - while(my ($k, $v) = each %attributes_hash) - { - if(blessed $v and $v->isa(__PACKAGE__)) - { - push @blocks, " ($k): ".AI::MXNet::Base::_indent("$v", 2); - } - } - sprintf("%s(\n%s\n)", $self->_class_name, join("\n", @blocks)); - }, - '&{}' => sub { my $self = shift; sub { $self->call(@_) } }; - -method prefix() -{ - $self->_prefix; -} - -method name() -{ - $self->_name; -} - -method class() -{ - __PACKAGE__; -} - -method name_scope(CodeRef $sub) -{ - $self->_scope->__enter__; - eval { $sub->(); }; - my $err = $@; - $self->_scope->__exit__; - confess($err) if $err; -} - -=head2 params - - Returns this `Block`'s parameter dictionary (does not include its - children's parameters). -=cut - -method params() -{ - return $self->_params; -} - -=head2 collect_params - - Returns an AI::MXNet::Gluon::ParameterDict containing this AI::MXNet::Gluon::Block's and all of its - children's Parameters (the default); it can also return a ParameterDict - containing only the parameters that match a regular expression. - - For example, collect the parameters specified in ['conv1_weight', 'conv1_bias', 'fc_weight', - 'fc_bias'] - - $model->collect_params('conv1_weight|conv1_bias|fc_weight|fc_bias') - - or collect all parameters whose names end with 'weight' or 'bias', which can be done - with a regular expression.
- - $model->collect_params('.*weight|.*bias') - -=cut - -method collect_params(Maybe[Str] $select=) -{ - $self->_check_container_with_block(); - my $ret = AI::MXNet::Gluon::ParameterDict->new(prefix => $self->_params->prefix); - $ret->update($self->params, $select); - for my $cld ($self->_children->values) - { - $ret->update($cld->collect_params($select)); - } - return $ret; -} - - -method _collect_params_with_prefix(Str $prefix='') -{ - if($prefix) - { - $prefix .= '.'; - } - my %ret = map { $prefix.$_ => $self->_reg_params->{ $_ } } keys %{ $self->_reg_params }; - my $iter = $self->_children->iterator; - while(my ($name, $child) = $iter->()) - { - %ret = (%ret, %{ $child->_collect_params_with_prefix("$prefix$name") }); - } - return \%ret; -} - -=head2 save_parameters - - Save parameters to file. - - filename : str - Path to file. -=cut - -method save_parameters(Str $filename) -{ - my $params = $self->_collect_params_with_prefix(); - my %arg_dict = map { $_ => $params->{$_}->_reduce } keys %{ $params }; - AI::MXNet::NDArray->save($filename, \%arg_dict); -} - -=head2 load_parameters - - Load parameters from file. - - $filename : str - Path to parameter file. - :$ctx= : Context or array ref of Context - Context(s) to initialize loaded parameters on. - :$allow_missing : bool, default False - Whether to silently skip loading parameters not present in the file. - :$ignore_extra : bool, default False - Whether to silently ignore parameters from the file that are not - present in this Block. -=cut - -method load_parameters( - Str $filename, - AI::MXNet::Context|ArrayRef[AI::MXNet::Context] :$ctx=AI::MXNet::Context->current_ctx, - Bool :$allow_missing=0, - Bool :$ignore_extra=0 -) -{ - my $loaded = AI::MXNet::NDArray->load($filename); - my $params = $self->_collect_params_with_prefix; - return if not keys %$loaded and not keys %$params; - - if(not grep { /\./ } keys %$loaded) - { - # legacy loading - %$loaded = (); - $self->collect_params->load( - $filename, - ($ctx ? (ctx => $ctx) : ()), - allow_missing => $allow_missing, - ignore_extra => $ignore_extra, - restore_prefix => $self->prefix - ); - return; - } - - if(not $allow_missing) - { - for my $name (keys %$params) - { - if(not exists $loaded->{$name}) - { - confess( - "Parameter $name is missing in file $filename, which contains parameters:". - join(',', keys %$loaded)."\n". - "Set allow_missing=>1 to ignore missing parameters." - ); - } - } - } - for my $name (keys %$loaded) - { - if(not $ignore_extra and not exists $params->{ $name }) - { - confess( - "Parameter $name loaded from file $filename is not present in ParameterDict, ". - "which contains parameters ". - join(',', keys %$params)."\n". - "Set ignore_extra=>1 to ignore." - ); - } - $params->{$name}->_load_init($loaded->{$name}, $ctx) if exists $params->{$name}; - } -} - -=head2 register_child - - Registers block as a child of self. `Block`s assigned to self as - attributes will be registered automatically. -=cut - -method register_child(AI::MXNet::Gluon::Block $block, Maybe[Str] $name=) -{ - $name //= $self->_children->keys; - $self->_children->set($name, $block); -} - -=head2 register_forward_pre_hook - - Registers a forward pre-hook on the block. - - The hook function is called immediately before 'forward'. - It should not modify the input or output. - - Parameters - ---------- - $hook : CodeRef or callable object - The forward pre-hook function of form $hook->($block, $input).
- - Returns - ------- - AI::MXNet::Gluon::Utils::HookHandle -=cut - -method register_forward_pre_hook($hook) -{ - my $handle = AI::MXNet::Gluon::Utils::HookHandle->new; - $handle->attach($self->_forward_pre_hooks, $hook); - return $handle; -} - -=head2 register_forward_hook - - Registers a forward hook on the block. - - The hook function is called immediately after 'forward'. - It should not modify the input or output. - - Parameters - ---------- - $hook : CodeRef or callable object - The forward hook function of form $hook->($block, $input, $output). - - Returns - ------- - AI::MXNet::Gluon::Utils::HookHandle -=cut - -method register_forward_hook($hook) -{ - my $handle = AI::MXNet::Gluon::Utils::HookHandle->new; - $handle->attach($self->_forward_hooks, $hook); - return $handle; -} - -=head2 apply - - Applies $fn recursively to every child block as well as self. - - Parameters - ---------- - $fn : callable - Function to be applied to each submodule, of form `$fn->($block)`. - - Returns - ------- - this block -=cut - -method apply($fn) -{ - for my $cld ($self->_children->values) - { - $cld->apply($fn); - } - $fn->($self); - return $self; -} - -=head2 initialize - - - Initializes AI::MXNet::Gluon::Parameters of this AI::MXNet::Gluon::Block and its children. - Equivalent to $block->collect_params()->initialize(...) - - Parameters - ---------- - $init : Initializer - Global default Initializer to be used when Parameter->init is undefined. - Otherwise, Parameter->init takes precedence. - ctx : Context or array ref of Context - Keeps a copy of Parameters on one or many context(s). - verbose : bool, default False - Whether to verbosely print out details on initialization. - force_reinit : bool, default False - Whether to force re-initialization if parameter is already initialized. -=cut - -method initialize( - Initializer $init=AI::MXNet::Initializer->Uniform(), - AI::MXNet::Context|ArrayRef[AI::MXNet::Context] :$ctx=AI::MXNet::Context->current_ctx, - Bool :$verbose=0, - Bool :$force_reinit=0 -) -{ - $self->collect_params->initialize(init => $init, ctx => $ctx, verbose => $verbose, force_reinit => $force_reinit); -} - - -=head2 hybridize - - Activates or deactivates `HybridBlock`s recursively. Has no effect on - non-hybrid children. - - Parameters - ---------- - $active : bool, default True - Whether to turn hybrid on or off. - :$static_alloc : bool, default False - Statically allocate memory to improve speed. Memory usage may increase. - :$static_shape : bool, default False - Optimize for invariant input shapes between iterations. Must also - set static_alloc to True. Change of input shapes is still allowed - but slower. -=cut - -method hybridize( - Bool $active=1, - %args -) -{ - $_->hybridize( - $active, - %args - ) for $self->_children->values; -} - -=head2 cast - - Cast this Block to use another data type. - - Parameters - ---------- - dtype : Dtype - The new data type. -=cut - -method cast(Dtype $dtype) -{ - for my $child ($self->_children->values) - { - $child->cast($dtype); - } - $_->cast($dtype) for $self->params->values; -} - -method call(@args) -{ - for my $hook ($self->_forward_pre_hooks->values) - { - $hook->($self, \@args); - } - my @out = $self->forward(@args); - for my $hook ($self->_forward_hooks->values) - { - $hook->($self, \@args, \@out); - } - return wantarray ? @out : $out[0]; -} - -=head2 forward - - Overrides to implement forward computation using `NDArray`. Only - accepts positional arguments. - - Parameters - ---------- - @args : array of NDArray - Input tensors.
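- - A minimal override sketch (a hypothetical DoubleBlock, shown only to illustrate the calling convention; it assumes the same modules used elsewhere in this file): - - package DoubleBlock; - use AI::MXNet::Gluon::Mouse; - use AI::MXNet::Function::Parameters; - extends 'AI::MXNet::Gluon::Block'; - # doubles whatever NDArray the block is called with - method forward($x) { return $x * 2; } -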
-=cut - -method forward(@args) -{ - confess("Not Implemented"); -} - -method register(Str $container) -{ - my $sub_name = $self->_class_name; - my $dest = $self->can('new'); - my $func = sub { - splice @_, 0, 1, $self; - goto $dest; - }; - no strict 'refs'; - *{"$container\::$sub_name"} = $func; -} - -=head2 summary - - Prints a summary of the model's outputs and parameters. - - The network must have been initialized, and must not have been hybridized. - - Parameters - ---------- - @inputs : objects - Any inputs that the model supports. For any tensor in the input, only - AI::MXNet::NDArray is supported. -=cut - -method summary(@inputs) -{ - my $summary = Hash::Ordered->new; - my %seen; - my @hooks; - my $stringify; - $stringify = sub { - my $in = shift; - if(ref($in) eq 'ARRAY') - { - return '('.join(', ', map { $stringify->($_) } @$in).')'; - } - else - { - return "$in"; - } - }; - my $_get_shape_str = sub { my ($args) = @_; - $args = $args->[0] if(ref $args eq 'ARRAY' and @$args == 1); - my ($flat_args, $fmts) = __PACKAGE__->_flatten($args); - my $flat_arg_shapes = [map { (blessed($_) and $_->isa('AI::MXNet::NDArray')) ? $_->shape : $_ } @$flat_args]; - my $shapes = (__PACKAGE__->_regroup($flat_arg_shapes, $fmts))[0]; - my $shape_str = $stringify->($shapes); - $shape_str =~ s/L//g; - return $shape_str; - }; - - my $_register_summary_hook = sub { my ($block) = @_; - unless(not $block->isa('AI::MXNet::Gluon::HybridBlock') or not $block->_active) - { - confess("\"${\ $block->name }\" must not be hybridized to print summary."); - } - my $_summary_hook = sub { my ($block, undef, $outputs) = @_; - my $class_name = $block->_class_name; - my $block_idx = $summary->keys - 1; - - my $m_key = sprintf('%s-%i', $class_name, $block_idx+1); - $summary->set($m_key, Hash::Ordered->new); - $summary->get($m_key)->set('output_shape', $_get_shape_str->($outputs)); - - my $params = 0; - $summary->get($m_key)->set('trainable', 0); - $summary->get($m_key)->set('shared', 0); - for my $p (values %{ $block->_reg_params }) - { - $params += $p->data->size; - $summary->get($m_key)->set('trainable', $summary->get($m_key)->get('trainable') + ($p->grad_req eq 'null' ?
0 : $p->data->size)); - if(exists $seen{$p}) - { - $summary->get($m_key)->set('shared', $summary->get($m_key)->get('shared') + $p->data->size); - } - else - { - $seen{$p} = 1; - } - } - $summary->get($m_key)->set('n_params', $params); - }; - - if(not $block->isa('AI::MXNet::Gluon::NN::Sequential') and not $block->isa('AI::MXNet::Gluon::NN::HybridSequential')) - { - push @hooks, $block->register_forward_hook($_summary_hook); - } - }; - - my $input = Hash::Ordered->new; - $summary->set('Input', $input); - $input->set('output_shape', $_get_shape_str->(\@inputs)); - $input->set('n_params', 0); - $input->set('trainable', 0); - $input->set('shared', 0); - - eval { - $self->apply($_register_summary_hook); - $self->(@inputs); - - my $line_format = "%20s %42s %15s\n"; - print (('-')x80, "\n"); - printf($line_format, 'Layer (type)', 'Output Shape', 'Param #'); - print (('=')x80, "\n"); - my $total_params = 0; - my $trainable_params = 0; - my $shared_params = 0; - for my $layer ($summary->keys) - { - printf($line_format, $layer, $summary->get($layer)->get('output_shape'), $summary->get($layer)->get('n_params')); - $total_params += $summary->get($layer)->get('n_params'); - $trainable_params += $summary->get($layer)->get('trainable'); - $shared_params += $summary->get($layer)->get('shared'); - } - print (('=')x80, "\n"); - print "Parameters in forward computation graph, duplicates included\n"; - print " Total params: $total_params\n"; - print " Non-trainable params: ", $total_params - $trainable_params, "\n"; - print "Shared params in forward computation graph: $shared_params\n"; - print "Unique parameters in model: ", $total_params - $shared_params, "\n"; - print (('-')x80, "\n"); - }; - $_->detach for @hooks; -} - -__PACKAGE__->register('AI::MXNet::Gluon'); - -package AI::MXNet::Gluon::HybridBlock; -=head2 NAME - - AI::MXNet::Gluon::HybridBlock - -=head2 DESCRIPTION - - HybridBlock supports forwarding with both Symbol and NDArray. - - Forward computation in HybridBlock must be static to work with Symbols, - i.e., you cannot call aspdl, shape, dtype, etc. on tensors. - Also, you cannot use branching or loop logic that depends on non-constant - expressions like random numbers or intermediate results, since they would change - the graph structure for each iteration. - - Before activating with hybridize(), HybridBlock works just like a normal - Block. After activation, HybridBlock will create a symbolic graph - representing the forward computation and cache it. On subsequent forwards, - the cached graph will be used instead of hybrid_forward. - - Refer to the Gluon hybridization tutorial in the MXNet documentation to see - the end-to-end usage.
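- - A short usage sketch (assuming use AI::MXNet qw(mx); and use AI::MXNet::Gluon::NN qw(nn); as in the Block example above; the layer sizes are arbitrary): - - my $net = nn->HybridSequential; - $net->add(nn->Dense(10)); - $net->initialize; - $net->hybridize; # the first forward call traces and caches the symbolic graph - my $out = $net->(mx->nd->ones([3, 4])); -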
-=cut - -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::Block'; -has [qw/ - _cached_graph - _cached_op - _out_format _in_format - _active _flags _cached_op_args -/] => (is => 'rw', init_arg => undef); - -sub BUILD -{ - my $self = shift; - $self->_active(0); - $self->_flags([]); - $self->_cached_graph([]); - $self->_cached_op_args([]); -} - -method __setattr__($name, $current, $prev=) -{ - $self->SUPER::__setattr__($name, $current, $prev); - if(blessed $current and $current->isa('AI::MXNet::Gluon::HybridBlock')) - { - $self->_clear_cached_op(); - } -} - -method register_child(AI::MXNet::Gluon::HybridBlock $block, Maybe[Str] $name=) -{ - $self->SUPER::register_child($block, $name); - $self->_clear_cached_op(); -} - -method hybridize(@args) -{ - my $active; - if(@args%2) - { - $active = shift(@args); - } - else - { - $active = 1; - } - $self->_active($active); - @{ $self->_flags } = @args; - $self->_clear_cached_op(); - if($self->_active and ($self->_forward_hooks or $self->_forward_pre_hooks)) - { - AI::MXNet::Logging->warning( - "$self is being hybridized while still having forward hook/pre-hook. ". - "If $self is a child of HybridBlock, the hooks will not take effect." - ); - } - $self->SUPER::hybridize($self->_active, @args); -} - -method cast(Dtype $dtype) -{ - $self->_clear_cached_op; - $self->SUPER::cast($dtype); -} - -method _infer_attrs($infer_fn, $attr, @args) -{ - my ($inputs, $out) = $self->_get_graph(@args); - my ($args) = __PACKAGE__->_flatten([@args]); - my %in; - zip(sub { - my ($i, $j) = @_; - $in{ $i->name } = $j->$attr; - }, $inputs, $args); - my ($arg_attrs, $aux_attrs); - ($arg_attrs, undef, $aux_attrs) = $out->$infer_fn(%in); - if(not defined $arg_attrs) - { - confess($@); - } - my %sdict; - zip(sub { - my ($i, $j) = @_; - $sdict{ $i } = $j; - }, $out->list_arguments, $arg_attrs); - zip(sub { - my ($i, $j) = @_; - $sdict{ $i } = $j; - }, $out->list_auxiliary_states, $aux_attrs); - - for my $i ($self->collect_params->values) - { - $i->$attr($sdict{ $i->name }); - } -} - -method infer_shape(@args) -{ - $self->_infer_attrs('infer_shape', 'shape', @args); -} - -method infer_type(@args) -{ - $self->_infer_attrs('infer_type', 'dtype', @args); -} - -method _get_graph(@args) -{ - if(not @{ $self->_cached_graph }) - { - my $args = [@args]; - my ($in_format, $out_format); - ($args, $in_format) = __PACKAGE__->_flatten($args); - $self->_in_format($in_format); - my @inputs; - if(@args > 1) - { - @inputs = map { AI::MXNet::Symbol->var("data_$_") } 0 .. @$args-1; - } - else - { - @inputs = (AI::MXNet::Symbol->var("data")) - } - my ($grouped_inputs) = __PACKAGE__->_regroup(\@inputs, $self->_in_format); - my %params = map { $_ => $self->_reg_params->{$_}->var } keys %{ $self->_reg_params }; - my @out; - $self->name_scope(sub { - @out = $self->hybrid_forward('AI::MXNet::Symbol', @{ $grouped_inputs }, %params); - }); - my $out = @out > 1 ? [@out] : $out[0]; - ($out, $out_format) = __PACKAGE__->_flatten($out); - $self->_out_format($out_format); - @{ $self->_cached_graph } = (\@inputs, AI::MXNet::Symbol->Group($out)); - } - return @{ $self->_cached_graph }; -} - -=head2 infer_shape - - Infers shape of Parameters from inputs. 
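- - A hedged sketch (assuming a hypothetical $net built from HybridBlocks whose Parameters were created with deferred, i.e. as yet unknown, shapes): - - my $x = mx->nd->zeros([1, 10]); - # walks the symbolic graph and fills in every Parameter's shape - $net->infer_shape($x); -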
-=cut - -method _build_cache(@args) -{ - my ($data, $out) = $self->_get_graph(@args); - my $i = 0; - my %data_names = map { $_->name => $i++ } @{ $data }; - my $params = $self->collect_params; - my $input_names = $out->list_inputs; - my %param_names = map { $_ => 1 } $params->keys; - my %expected_names = map { $_ => 1 } @{ $input_names }; - for my $name (keys %expected_names) - { - assert( - (exists $param_names{ $name } or exists $data_names{ $name }), - "Unknown input to HybridBlock: $name" - ); - } - my $unused = join(', ', map { "$data_names{$_}-th" } grep { !exists $expected_names{ $_ } } keys %data_names); - AI::MXNet::Logging->warn( - "The $unused input to HybridBlock is not used by any ". - "computation. Is this intended?" - ) if $unused; - $unused = join(', ', grep { !exists $expected_names{ $_ } } keys %param_names); - AI::MXNet::Logging->warn( - "Parameter(s) $unused not used by any computation. ". - "Is this intended?" - ) if $unused; - - my @data_indices; - my @param_indices; - $self->_cached_op_args([]); - enumerate(sub { - my ($i, $name) = @_; - if(exists $data_names{ $name }) - { - push @data_indices, $i; - push @{ $self->_cached_op_args }, [1, $data_names{$name}]; - } - else - { - push @param_indices, $i; - push @{ $self->_cached_op_args }, [0, $params->params->get($name)]; - } - }, $input_names); - my %flags = ( - data_indices => \@data_indices, - param_indices => \@param_indices, - @{ $self->_flags } - ); - $self->_cached_op(AI::MXNet::CachedOp->new($out, \%flags)); -} - -method _deferred_infer_shape(@args) -{ - eval { - $self->infer_shape(@args) - }; - if($@) - { - confess( - "Deferred initialization failed because shape". - " cannot be inferred. $@" - ); - } -} - -method _clear_cached_op() -{ - $self->_cached_graph([]); - $self->_cached_op(undef); -} - -use Data::Dumper; -method _call_cached_op(@args) -{ - if(not defined $self->_cached_op) - { - $self->_build_cache(@args); - } - my $args = [@args]; - my $fmt; - ($args, $fmt) = __PACKAGE__->_flatten($args); - assert((Dumper($fmt) eq Dumper($self->_in_format)), "Invalid input format"); - my @cargs; - eval { - @cargs = map { (not $_->[0]) ? $_->[1]->data() : $args->[$_->[1]] } @{ $self->_cached_op_args }; - }; - if($@) - { - if($@ =~ /DeferredInitializationError/) - { - $self->_deferred_infer_shape(@$args); - @cargs = (); - map { - if($_->[0]) - { - push @cargs, $args->[$_->[1]]; - } - else - { - $_->[1]->_finish_deferred_init(); - push @cargs, $_->[1]->data; - } - } @{ $self->_cached_op_args }; - } - else - { - confess($@); - } - } - my $out = $self->_cached_op->(@cargs); - if(blessed $out and $out->isa('AI::MXNet::NDArray')) - { - $out = [$out]; - } - my $ret = (__PACKAGE__->_regroup($out, $self->_out_format))[0]; - if(ref($ret) eq 'ARRAY' and wantarray) - { - return @$ret; - } - else - { - return $ret; - } -} - -=head2 forward - - Defines the forward computation.
Arguments can be either - NDArray or Symbol. -=cut - -method forward($x, @args) -{ - if(blessed $x and $x->isa('AI::MXNet::NDArray')) - { - my @out; - my $ctx = $x->context; - my $current_ctx = AI::MXNet::Context->current_ctx; - AI::MXNet::Context->set_current($ctx); - if($self->_active) - { - if(wantarray) - { - my @out = $self->_call_cached_op($x, @args); - AI::MXNet::Context->set_current($current_ctx); - return @out; - } - else - { - my $out = $self->_call_cached_op($x, @args); - AI::MXNet::Context->set_current($current_ctx); - return $out; - } - } - my %params; - eval { - %params = map { $_ => $self->_reg_params->{ $_ }->data($ctx) } keys %{ $self->_reg_params }; - }; - if($@) - { - if($@ =~ /DeferredInitializationError/) - { - $self->_deferred_infer_shape($x, @args); - $_->_finish_deferred_init for $self->params->values; - %params = map { $_ => $self->_reg_params->{ $_ }->data($ctx) } keys %{ $self->_reg_params }; - } - else - { - confess($@); - } - } - @out = $self->hybrid_forward('AI::MXNet::NDArray', $x, @args, %params); - AI::MXNet::Context->set_current($current_ctx); - return wantarray ? @out : $out[0]; - } - assert( - (blessed $x and $x->isa('AI::MXNet::Symbol')), - "HybridBlock requires the first argument to forward be either ". - "Symbol or NDArray, but got [".ref($x)."]" - ); - my %params = map { $_ => $self->_reg_params->{ $_ }->var } keys %{ $self->_reg_params }; - my @ret; - $self->name_scope(sub { - @ret = $self->hybrid_forward('AI::MXNet::Symbol', $x, @args, %params); - }); - return wantarray ? @ret : $ret[0]; -} - -=head2 hybrid_forward - - Overrides to construct symbolic graph for this `Block`. - - Parameters - ---------- - x : Symbol or NDArray - The first input tensor. - @args : array of Symbol or array of NDArray - Additional input tensors. -=cut - -method hybrid_forward($F, $x, @args) -{ - confess("NotImplementedError"); -} - -=head2 export - - Export HybridBlock to a JSON format that can be loaded by AI::MXNet::Module - or the C++ interface. - - When there is only one input, it will be named 'data'. When there - is more than one input, they will be named 'data0', 'data1', etc. - - Parameters - ---------- - $path : str - Path to save model. Two files 'path-symbol.json' and 'path-xxxx.params' - will be created, where xxxx is the 4-digit epoch number. - :$epoch=0 : Int - Epoch number of saved model. -=cut - -method export(Str $path, :$epoch=0) -{ - if(not @{ $self->_cached_graph }) - { - confess( - "Please first call \$block->hybridize() and then run forward with ". - "this block at least once before calling export."
- ); - } - my $sym = $self->_cached_graph->[1]; - my $sym_filename = "$path-symbol.json"; - $sym->save($sym_filename); - - my %arg_names = map { $_ => 1 } @{ $sym->list_arguments }; - my %aux_names = map { $_ => 1 } @{ $sym->list_auxiliary_states }; - my %arg_dict; - my $params = $self->collect_params; - for my $name ($params->keys) - { - my $param = $params->get($name); - if(exists $arg_names{ $name }) - { - $arg_dict{ "arg:$name" } = $param->_reduce; - } - else - { - assert(exists $aux_names{ $name }); - $arg_dict{ "aux:$name" } = $param->_reduce; - } - } - my $params_filename = sprintf('%s-%04d.params', $path, $epoch); - AI::MXNet::NDArray->save($params_filename, \%arg_dict); - return ($sym_filename, $params_filename); -} - -__PACKAGE__->register('AI::MXNet::Gluon'); - -package AI::MXNet::Gluon::SymbolBlock; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::SymbolBlock - Construct block from symbol. -=cut - -=head1 DESCRIPTION - - Construct block from symbol. This is useful for using pre-trained models - as feature extractors. For example, you may want to extract the output - of the fc2 layer in AlexNet. - - Parameters - ---------- - outputs : Symbol or list of Symbol - The desired output for SymbolBlock. - inputs : Symbol or list of Symbol - The Variables in output's argument that should be used as inputs. - params : ParameterDict - Parameter dictionary for arguments and auxiliary states of outputs - that are not inputs. - - Examples - -------- - >>> # To extract the feature from fc1 and fc2 layers of AlexNet - >>> $alexnet = gluon->model_zoo->vision->alexnet(pretrained=>1, ctx=>mx->cpu(), - prefix=>'model_'); - >>> $inputs = mx->sym->var('data'); - >>> $out = $alexnet->($inputs); - >>> $internals = $out->get_internals(); - >>> print($internals->list_outputs()); - ['data', ..., 'model_dense0_relu_fwd_output', ..., 'model_dense1_relu_fwd_output', ...]
- >>> $outputs = [$internals->slice('model_dense0_relu_fwd_output'), - $internals->slice('model_dense1_relu_fwd_output')]; - >>> # Create SymbolBlock that shares parameters with alexnet - >>> $feat_model = gluon->SymbolBlock($outputs, $inputs, params=>$alexnet->collect_params()); - >>> $x = mx->nd->random_normal(shape=>[16, 3, 224, 224]); - >>> print($feat_model->($x)); -=cut - -has [qw/outputs inputs/] => (is => 'rw', isa => 'AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]'); -method python_constructor_arguments() { [qw/outputs inputs/] } - -sub BUILD -{ - my ($self, $orig_params) = @_; - return unless defined $self->outputs and defined $self->inputs; - $self->_prefix(''); - $self->_params(AI::MXNet::Gluon::ParameterDict->new(prefix => '', shared => $orig_params->{params})); - if(blessed $self->inputs and @{ $self->inputs->list_outputs } == 1) - { - $self->inputs([$self->inputs]); - } - if(not blessed $self->outputs and @{ $self->outputs } == 1) - { - $self->outputs($self->outputs->[0]); - } - my ($syms, $in_format) = __PACKAGE__->_flatten($self->inputs); - my ($out, $out_format) = __PACKAGE__->_flatten($self->outputs); - $self->_in_format($in_format); - $self->_out_format($out_format); - $out = AI::MXNet::Symbol->Group($out); - - my %input_names; - for my $i (@{ $syms }) - { - assert( - (@{ $i->get_internals->list_outputs() } == 1), - "Input symbols must be variable, but $i is an output of operators" - ); - $input_names{ $i->name } = 1; - } - - # check if any symbol is row_sparse - my $row_sparse_storage = STORAGE_TYPE_STR_TO_ID->{row_sparse}; - for my $i (@{ $out }) - { - for my $j (@{ $i->get_internals }) - { - assert( - (not defined $j->attr("__storage_type__") or $j->attr("__storage_type__") ne $row_sparse_storage), - "SymbolBlock doesn't support Parameter ${\ $j->name } because its storage ". - "type is 'row_sparse'." - ); - } - } - - my $arg_params = $out->list_arguments; - my $aux_params = $out->list_auxiliary_states; - my ($arg_types, $aux_types) = _infer_param_types($syms, $out, $arg_params, $aux_params); - - for(enumerate($arg_params)) - { - my ($i, $arg) = @$_; - if(not exists $input_names{ $arg }) - { - $self->params->get($arg, allow_deferred_init => 1, dtype => $arg_types->[$i]); - } - } - - for(enumerate($aux_params)) - { - my ($i, $arg) = @$_; - if(not exists $input_names{ $arg }) - { - $self->params->get($arg, grad_req => 'null', allow_deferred_init => 1, dtype => $aux_types->[$i]); - } - } - - $self->_cached_graph([$syms, $out]); - my $prefix = _common_prefix($self->_params->keys); - my %params = $self->_params->items; - while(my ($key, $val) = each %params) - { - $key =~ s/^$prefix//; - $self->_reg_params->{ $key } = $val; - } - $self->_prefix($prefix); -} - - -func _infer_param_types($in_params, $out_params, $arg_params, $aux_params, $default_dtype='float32') -{ - # Utility function that helps in inferring DType of args and auxs params - # from given input param. - # Parameters - # ---------- - # in_params: array ref of AI::MXNet::Symbol objects - # List of input symbol variables. - # out_params: AI::MXNet::Symbol - # Output symbol variable. - # arg_params: array ref of Str - # List of names of argument parametrs. - # aux_params: array ref of Str - # List of names of auxiliary parameters. - # default_dtype: Dtype, default 'float32' - # Default data type for arg_params and aux_params, if unable to infer the type. - # Returns - # ------- - # arg_types: Array ref of Dtype - # List of arg_params type. Order is same as arg_params. 
- # Defaults to 'float32', if unable to infer type. - # aux_types: Array ref of Dtype - # List of aux_params type. Order is same as aux_params. - # Defaults to 'float32', if unable to infer type. - - my $arg_types; - my $aux_types; - # Get Input symbol details. This will be used to infer types of - # other parameters. - my @input_sym_names = map { $_->name } @{ $in_params }; - # Try to infer input types. If not successful, we will set default dtype. - # If successful, we will try to infer other params in the graph. - my @input_sym_arg_types; - my $can_infer_input_type = 1; - for my $in_param(@{ $in_params }) - { - my $input_sym_arg_type = ($in_param->infer_type)[0]; - if(not $input_sym_arg_type or @$input_sym_arg_type < 1) - { - $can_infer_input_type = 0; - last; - } - else - { - push @input_sym_arg_types, $input_sym_arg_type->[0]; - } - } - # Try to infer types of other parameters. - if($can_infer_input_type) - { - my %params = map { $_->[0] => $_->[1] } zip(\@input_sym_names, \@input_sym_arg_types); - ($arg_types, undef, $aux_types) = $out_params->infer_type(%params); - if(not defined $arg_types or @$arg_types != @$arg_params) - { - $arg_types = [($default_dtype)x@$arg_params]; - } - if(not defined $aux_types or @$aux_types != @$aux_params) - { - $aux_types = [($default_dtype)x@$aux_params]; - } - } - return ($arg_types, $aux_types); -} - -func _common_prefix(@names) -{ - if(not @names) - { - return '' - } - my $prefix = $names[0]; - for my $name (@names) - { - my $i = 0; - while($i < length($prefix) and $i < length($name) and substr($prefix, $i, 1) eq substr($name, $i, 1)) - { - $i++; - } - $prefix = substr($prefix, 0, $i); - } - return $prefix; -} - -method forward($x, @args) -{ - if(blessed $x and $x->isa('AI::MXNet::NDArray')) - { - my @out; - my $out; - my $ctx = $x->context; - my $current_ctx = AI::MXNet::Context->current_ctx; - AI::MXNet::Context->set_current($ctx); - if(wantarray) - { - my @out = $self->_call_cached_op($x, @args); - AI::MXNet::Context->set_current($current_ctx); - return @out; - } - else - { - my $out = $self->_call_cached_op($x, @args); - AI::MXNet::Context->set_current($current_ctx); - return $out; - } - } - assert( - (blessed $x and $x->isa('AI::MXNet::Symbol')), - "HybridBlock requires the first argument to forward be either ". - "Symbol or NDArray, but got [".ref($x)."]" - ); - my $args = \@args; - my $in_fmt; - ($args, $in_fmt) = __PACKAGE__->_flatten([$x, @$args]); - assert((Data::Dumper::Dumper($in_fmt) eq Data::Dumper::Dumper($self->_in_format)), "Invalid input format"); - my $ret = $self->_cached_graph->[1]->deepcopy; - my %in; - for(zip($self->_cached_graph->[0], $args)) { - my ($k, $v) = @$_; - $in{$k->name} = $v; - } - $ret->_compose(%in); - $ret = (__PACKAGE__->_regroup($ret, $self->_out_format))[0]; - if(ref($ret) eq 'ARRAY' and wantarray) - { - return @$ret; - } - else - { - return $ret; - } -} - -method _clear_cached_op() -{ - my $tmp = $self->_cached_graph; - $self->SUPER::_clear_cached_op; - $self->_cached_graph($tmp); -} - -method hybrid_forward(@args) -{ - confess('NotImplementedError'); -} - -=head2 imports - - Import model previously saved by HybridBlock->export or - Module->save_checkpoint as a SymbolBlock for use in Gluon. - - Parameters - ---------- - $symbol_file : Str - Path to symbol file. - $input_names : Str|ArrayRef[Str] - List of input variable names - :$param_file : Str, optional - Path to parameter file. - $ctx : Context, default undef - The context to initialize SymbolBlock on. 
- - Returns - ------- - SymbolBlock - SymbolBlock loaded from symbol and parameter files. -=cut - -method imports(Str $symbol_file, Str|ArrayRef[Str] $input_names, Maybe[Str] $param_file=, Maybe[AI::MXNet::Context] $ctx=) -{ - my $sym = AI::MXNet::Symbol->load($symbol_file); - $input_names = [$input_names] unless ref $input_names; - my @inputs = map { AI::MXNet::Symbol->var($_) } @{ $input_names }; - my $ret = __PACKAGE__->new($sym, \@inputs); - if(defined $param_file) - { - $ret->load_parameters($param_file, (defined $ctx ? (ctx=>$ctx) : ())); - } - return $ret; -} - -__PACKAGE__->register('AI::MXNet::Gluon'); - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data.pm deleted file mode 100644 index 95612d9d8cf3..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data.pm +++ /dev/null @@ -1,29 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::Data; -use strict; -use warnings; -use AI::MXNet::NS; -use AI::MXNet::Gluon::Data::Set; -use AI::MXNet::Gluon::Data::Sampler; -use AI::MXNet::Gluon::Data::Loader; -use AI::MXNet::Gluon::Data::Vision 'vision'; - -1; - - diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Loader.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Loader.pm deleted file mode 100644 index e6a0e7461a80..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Loader.pm +++ /dev/null @@ -1,186 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -=head1 NAME - - AI::MXNet::Gluon::Data::Loader::DataLoader - Dataset generator. -=cut - -=head1 DESCRIPTION - - Loads data from a dataset and returns mini-batches of data. - - Parameters - ---------- - dataset : Dataset - Source dataset. Note that PDL and MXNet NDArray objects can be used - directly as a Dataset. - batch_size : int - Size of mini-batch. - shuffle : bool - Whether to shuffle the samples. - sampler : Sampler - The sampler to use. Either specify sampler or shuffle, not both.
- last_batch : {'keep', 'discard', 'rollover'} - How to handle the last batch if batch_size does not evenly divide - the dataset's length. - - keep - A batch with fewer samples than previous batches is returned. - discard - The last batch is discarded if it is incomplete. - rollover - The remaining samples are rolled over to the next epoch. - batch_sampler : Sampler - A sampler that returns mini-batches. Do not specify batch_size, - shuffle, sampler, or last_batch if batch_sampler is specified. -=cut - -use strict; -use warnings; -package AI::MXNet::Gluon::Data::Loader::DataLoader; -use AI::MXNet::Function::Parameters; -use Mouse; - -method _class_name() -{ - my $class = ref $self || $self; - $class =~ s/^.+:://; - $class; -} - -method register(Str $container) -{ - my $sub_name = $self->_class_name; - no strict 'refs'; - *{$container.'::'.$sub_name} = sub { shift; $self->new(@_) }; -} - -# Collate data into a batch. -func _batchify($data, $dtype) -{ - if(blessed $data->[0] and $data->[0]->isa('AI::MXNet::NDArray')) - { - return AI::MXNet::NDArray->stack(@{ $data }); - } - elsif(ref $data->[0] eq 'ARRAY') - { - my (@data, @label); - for my $i (@$data) - { - my ($d, $l) = @$i; - push @data, $d; - push @label, $l; - } - return [_batchify(\@data, $dtype), _batchify(\@label, $dtype)]; - } - else - { - return AI::MXNet::NDArray->array($data, dtype => $dtype); - } -} - -has 'dataset' => (is => 'rw', isa => 'AI::MXNet::Gluon::Data::Set|AI::MXNet::NDArray|PDL', required => 1); -has 'batch_size' => (is => 'ro', isa => 'Int'); -has 'shuffle' => (is => 'ro', isa => 'Bool', default => 0); -has 'sampler' => (is => 'rw', isa => 'AI::MXNet::Gluon::Data::Sampler'); -has 'batch_sampler' => (is => 'rw', isa => 'AI::MXNet::Gluon::Data::Sampler'); -has 'last_batch' => (is => 'rw', isa => 'Str', default => 'keep'); - -around BUILDARGS => \&AI::MXNet::Base::process_arguments; -method python_constructor_arguments() { ['dataset', 'batch_size'] } - -sub BUILD -{ - my $self = shift; - if($self->dataset->isa('PDL')) - { - $self->dataset(AI::MXNet::NDArray->array($self->dataset)); - } - if(not defined $self->batch_sampler) - { - if(not defined $self->batch_size) - { - confess("batch_size must be specified unless batch_sampler is specified"); - } - if(not defined $self->sampler) - { - if($self->shuffle) - { - $self->sampler( - AI::MXNet::Gluon::Data::Sampler::RandomSampler->new( - length => $self->dataset->len - ) - ); - } - else - { - $self->sampler( - AI::MXNet::Gluon::Data::Sampler::SequentialSampler->new( - length => $self->dataset->len, - ) - ); - } - } - elsif($self->shuffle) - { - confess("shuffle must not be specified if sampler is specified"); - } - $self->batch_sampler( - AI::MXNet::Gluon::Data::Sampler::BatchSampler->new( - sampler => $self->sampler, - batch_size => $self->batch_size, - last_batch => $self->last_batch - ) - ); - } - elsif(defined $self->batch_size or $self->shuffle or defined $self->sampler or defined $self->last_batch) - { - confess("batch_size, shuffle, sampler and last_batch must ".
- "not be specified if batch_sampler is specified."); - } -} - -use overload - '<>' => sub { - my $self = shift; - my $sampler = $self->batch_sampler; - my $batch = <$sampler>; - if(not defined $batch) - { - return undef; - }; - return _batchify([map { $self->dataset->at($_) } @{ $batch }], eval { $self->dataset->label->dtype }//'int32'); - }; - -method len() -{ - $self->batch_sampler->len; -} - -use overload '@{}' => sub { shift->list }; - -method list() -{ - my @ret; - while(defined(my $data = <$self>)) - { - push @ret, $data; - } - return \@ret; -} - -__PACKAGE__->register('AI::MXNet::Gluon::Data'); - -1; \ No newline at end of file diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Sampler.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Sampler.pm deleted file mode 100644 index e19f9d8c5f32..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Sampler.pm +++ /dev/null @@ -1,285 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; - -package AI::MXNet::Gluon::Data::Sampler; -use AI::MXNet::Function::Parameters; -use Mouse; -around BUILDARGS => \&AI::MXNet::Base::process_arguments; - -method _class_name() -{ - my $class = ref $self || $self; - $class =~ s/^.+:://; - $class; -} - -method register(Str $container) -{ - my $sub_name = $self->_class_name; - no strict 'refs'; - *{$container.'::'.$sub_name} = sub { shift; $self->new(@_) }; -} - -=head1 NAME - - AI::MXNet::Gluon::Data::Sampler -=cut - -=head1 DESCRIPTION - - Base class for samplers. - - All samplers should subclass AI::MXNet::Gluon::Data::Sampler - and define method 'len' and 'next' - methods. -=cut - -use overload '<>' => sub { shift->next }, - '@{}' => sub { shift->list }; - -method list() -{ - my @ret; - while(defined(my $data = <$self>)) - { - push @ret, $data; - } - return \@ret; -} - -method len() { confess('Not Implemented') } -method next() { confess('Not Implemented') } - -package AI::MXNet::Gluon::Data::Sampler::SequentialSampler; -use Mouse; -extends 'AI::MXNet::Gluon::Data::Sampler'; - -=head1 NAME - - AI::MXNet::Gluon::Data::Sampler::SequentialSampler -=cut - -=head1 DESCRIPTION - - Samples elements from [0, length) sequentially. - - Parameters - ---------- - length : int - Length of the sequence. 
-=cut -has 'length' => (is => 'ro', isa => 'Int', required => 1); -has '_current' => (is => 'rw', init_arg => undef, default => 0); -method python_constructor_arguments() { ['length'] } - -method next() -{ - my $current = $self->_current; - if($self->_current == $self->length) - { - $self->reset; - return undef; - } - else - { - $self->_current($self->_current + 1); - return $current; - } -}; - -method reset() { $self->_current(0) } -method len() { $self->length } - -__PACKAGE__->register('AI::MXNet::Gluon::Data'); - -package AI::MXNet::Gluon::Data::Sampler::RandomSampler; -use Mouse; -use List::Util qw(shuffle); -extends 'AI::MXNet::Gluon::Data::Sampler'; - -=head1 NAME - - AI::MXNet::Gluon::Data::Sampler::RandomSampler -=cut - -=head1 DESCRIPTION - - Samples elements from [0, length) randomly without replacement. - - Parameters - ---------- - length : int - Length of the sequence. -=cut -has 'length' => (is => 'ro', isa => 'Int', required => 1); -has '_current' => (is => 'rw', init_arg => undef, default => 0); -has '_indices' => (is => 'rw', init_arg => undef); -method python_constructor_arguments() { ['length'] } - -sub BUILD -{ - my $self = shift; - $self->_indices([shuffle(0..$self->length-1)]); -} - -method next() -{ - my $current = $self->_current; - if($self->_current == $self->length) - { - $self->reset; - return undef; - } - else - { - $self->_current($self->_current + 1); - return $self->_indices->[$current]; - } -}; - -method reset() { @{ $self->_indices } = shuffle(@{ $self->_indices }); $self->_current(0) } -method len() { $self->length } - -__PACKAGE__->register('AI::MXNet::Gluon::Data'); - -package AI::MXNet::Gluon::Data::Sampler::BatchSampler; -use Mouse; -use List::Util qw(shuffle); -extends 'AI::MXNet::Gluon::Data::Sampler'; - -=head1 NAME - - AI::MXNet::Gluon::Data::Sampler::BatchSampler -=cut - -=head1 DESCRIPTION - - Wraps over another AI::MXNet::Gluon::Data::Sampler and return mini-batches of samples. - - Parameters - ---------- - sampler : AI::MXNet::Gluon::Data::Sampler - The source Sampler. - batch_size : int - Size of mini-batch. - last_batch : {'keep', 'discard', 'rollover'} - Specifies how the last batch is handled if batch_size does not evenly - divide sequence length. - - If 'keep', the last batch will be returned directly, but will contain - less element than `batch_size` requires. - - If 'discard', the last batch will be discarded. - - If 'rollover', the remaining elements will be rolled over to the next - iteration. 
- - Examples - -------- - >>> $sampler = gluon->data->SequentialSampler(10) - >>> $batch_sampler = gluon->data->BatchSampler($sampler, batch_size => 3, last_batch => 'keep'); - >>> @{ $batch_sampler } - [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] -=cut -has 'batch_size' => (is => 'ro', isa => 'Int', required => 1); -has 'sampler' => (is => 'ro', isa => 'AI::MXNet::Gluon::Data::Sampler', required => 1); -has 'last_batch' => (is => 'ro', isa => 'Str', default => 'keep'); -has '_prev' => (is => 'rw', init_arg => undef); -has '_kept' => (is => 'rw', init_arg => undef); -method python_constructor_arguments() { ['sampler', 'batch_size', 'last_batch'] } - -sub BUILD -{ - my $self = shift; - $self->_prev([]); -} - -method next() -{ - if($self->_kept) - { - $self->_kept(0); - return undef; - } - $self->_kept(0); - my $batch = $self->_prev; - $self->_prev([]); - my $sampler = $self->sampler; - while(defined(my $i = <$sampler>)) - { - push @{ $batch }, $i; - if(@{ $batch } == $self->batch_size) - { - return $batch; - } - } - if(@{ $batch }) - { - if($self->last_batch eq 'keep') - { - $self->_kept(1); - return $batch; - } - elsif($self->last_batch eq 'discard') - { - return undef; - } - elsif($self->last_batch eq 'rollover') - { - $self->_prev($batch); - return undef; - } - else - { - confess( - "last_batch must be one of 'keep', 'discard', or 'rollover', ". - "but got ${\ $self->last_batch }" - ); - } - } - return undef; -} - -method len() -{ - if($self->last_batch eq 'keep') - { - return int(($self->sampler->len + $self->batch_size - 1) / $self->batch_size); - } - elsif($self->last_batch eq 'discard') - { - return int($self->sampler->len/$self->batch_size); - } - elsif($self->last_batch eq 'rollover') - { - return int((@{ $self->_prev } + $self->sampler->len) / $self->batch_size); - } - else - { - confess( - "last_batch must be one of 'keep', 'discard', or 'rollover', ". - "but got ${\ $self->last_batch }" - ); - } -} - -__PACKAGE__->register('AI::MXNet::Gluon::Data'); - -1; \ No newline at end of file diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Set.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Set.pm deleted file mode 100644 index 753659055e34..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Set.pm +++ /dev/null @@ -1,155 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -use strict; -use warnings; -package AI::MXNet::Gluon::Data::Set; -use AI::MXNet::Function::Parameters; -use Mouse; -around BUILDARGS => \&AI::MXNet::Base::process_arguments; -method _class_name() -{ - my $class = ref $self || $self; - $class =~ s/^.+:://; - $class; -} - -method register(Str $container) -{ - my $sub_name = $self->_class_name; - no strict 'refs'; - *{$container.'::'.$sub_name} = sub { shift; $self->new(@_) }; -} - -=head1 NAME - - AI::MXNet::Gluon::Data::Set -=cut - -=head1 DESCRIPTION - - Abstract dataset class. All datasets should have this interface. - - Subclasses need to override method at($i), which returns the i-th - element, and method len(), which returns the total number of elements. - - AI::MXNet::NDArray can be directly used as a dataset. -=cut - -method at(Index $idx) { confess("Not Implemented") } - -method len() { confess("Not Implemented") } - -package AI::MXNet::Gluon::Data::ArrayDataset; -use AI::MXNet::Base; -use Mouse; -extends 'AI::MXNet::Gluon::Data::Set'; - -=head1 NAME - - AI::MXNet::Gluon::Data::ArrayDataset -=cut - -=head1 DESCRIPTION - - A dataset with a data array and a label array. - - The i-th sample is `(data[i], label[i])`. - - Parameters - ---------- - data : AI::MXNet::NDArray or PDL - The data array. - label : AI::MXNet::NDArray or PDL - The label array. -=cut -has [qw/data label/] => (is => 'rw', isa => 'PDL|AI::MXNet::NDArray', required => 1); -method python_constructor_arguments() { ['data', 'label'] } - -sub BUILD -{ - my $self = shift; - assert(($self->data->len == $self->label->len), "data and label lengths must be the same"); - if($self->label->isa('AI::MXNet::NDArray') and @{$self->label->shape} == 1) - { - $self->label($self->label->aspdl); - } - if($self->data->isa('PDL')) - { - $self->data(AI::MXNet::NDArray->array($self->data)); - } -} - -method at(Index $idx) -{ - return [ - $self->data->at($idx), - $self->label->at($idx) - ]; -} - -method len() -{ - return $self->data->len -} - -__PACKAGE__->register('AI::MXNet::Gluon::Data'); - -package AI::MXNet::Gluon::Data::RecordFileSet; -use AI::MXNet::Base; -use Mouse; -extends 'AI::MXNet::Gluon::Data::Set'; - -=head1 NAME - - AI::MXNet::Gluon::Data::RecordFileSet -=cut - -=head1 DESCRIPTION - - A dataset wrapping over a RecordIO (.rec) file. - - Each sample is a string representing the raw content of a record. - - Parameters - ---------- - filename : str - Path to rec file. -=cut -has 'filename' => (is => 'ro', isa =>'Str', required => 1); -has '_record' => (is => 'rw', init_arg => undef); -method python_constructor_arguments() { ['filename'] } - -sub BUILD -{ - my $self = shift; - my $idx_file = $self->filename; - $idx_file =~ s/\.[^.]+$/.idx/; - $self->_record( - AI::MXNet::IndexedRecordIO->new( - idx_path => $idx_file, uri => $self->filename, flag => 'r' - ) - ); -} - -method at(Index $idx) { return $self->_record->read_idx($idx); } - -method len() { return scalar(@{ $self->_record->keys }) } - -__PACKAGE__->register('AI::MXNet::Gluon::Data'); - -1; \ No newline at end of file diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Vision.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Vision.pm deleted file mode 100644 index abc1c51bfbbe..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Data/Vision.pm +++ /dev/null @@ -1,436 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership.
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::Data::Vision; -use AI::MXNet::NS; - -package AI::MXNet::Gluon::Data::Vision::DownloadedDataSet; -use strict; -use warnings; -use File::Path qw(make_path); -use Archive::Tar; -use IO::Zlib; -use IO::File; -use Mouse; -use AI::MXNet::Function::Parameters; -has 'root' => (is => 'ro', isa => 'Str', required => 1); -has 'train' => (is => 'ro', isa => 'Bool', required => 1); -has 'transform' => (is => 'ro', isa => 'Maybe[CodeRef]'); -has [qw(data label)] => (is => 'rw', init_arg => undef); -extends 'AI::MXNet::Gluon::Data::Set'; -method python_constructor_arguments() { ['root', 'train', 'transform'] } - -sub BUILD -{ - my $self = shift; - my $root = $self->root; - $root =~ s/~/$ENV{HOME}/; - if(not -d $root) - { - make_path($root); - } - $self->_get_data; -} - -method at(Index $idx) -{ - if(defined $self->transform) - { - return [$self->transform->($self->data->at($idx), $self->label->at($idx))]; - } - return [$self->data->at($idx), $self->label->at($idx)]; -} - -method len() { $self->label->len } -method _get_data() { confess("Not Implemented") } - -package AI::MXNet::Gluon::Data::Vision::DownloadedDataSet::MNIST; -use Mouse; -use AI::MXNet::Gluon::Utils qw(download); -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::Data::Vision::DownloadedDataSet'; - -=head1 NAME - - AI::MXNet::Gluon::Data::Vision::DownloadedDataSet::MNIST -=cut - -=head1 DESCRIPTION - - MNIST handwritten digits dataset from `http://yann.lecun.com/exdb/mnist`_. - - Each sample is an image (in 3D NDArray) with shape (28, 28, 1). - - Parameters - ---------- - root : str - Path to temp folder for storing data. - Defaults to ~/.mxnet/datasets/mnist - train : bool - Whether to load the training or testing set. - Defaults to True - transform : function - A user defined callback that transforms each instance. 
For example: - - transform => sub { my ($data, $label) = @_; return ($data->astype('float32')/255, $label) } -=cut - -has [qw/_base_url _train_data _train_label _test_data _test_label/] => (is => 'rw'); -has '+root' => (default => '~/.mxnet/datasets/mnist'); -has '+train' => (default => 1); -has '_base_url' => (is => 'ro', default => 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/'); -has '_train_data' => (is => 'ro', default => sub { ['train-images-idx3-ubyte.gz', - '6c95f4b05d2bf285e1bfb0e7960c31bd3b3f8a7d'] }); -has '_train_label' => (is => 'ro', default => sub { ['train-labels-idx1-ubyte.gz', - '2a80914081dc54586dbdf242f9805a6b8d2a15fc'] }); -has '_test_data' => (is => 'ro', default => sub { ['t10k-images-idx3-ubyte.gz', - 'c3a25af1f52dad7f726cce8cacb138654b760d48'] }); -has '_test_label' => (is => 'ro', default => sub { ['t10k-labels-idx1-ubyte.gz', - '763e7fa3757d93b0cdec073cef058b2004252c17'] }); - -method _get_data() -{ - my ($data, $label); - if($self->train) - { - ($data, $label) = ($self->_train_data, $self->_train_label); - } - else - { - ($data, $label) = ($self->_test_data, $self->_test_label); - } - my $data_file = download($self->_base_url . $data->[0], path => $self->root, - sha1_hash => $data->[1]); - my $label_file = download($self->_base_url . $label->[0], path => $self->root, - sha1_hash => $label->[1]); - my $fh = new IO::Zlib; - my ($l, $d); - if ($fh->open($label_file, "rb")) - { - $fh->read($l, 100_000_000); - $l = substr($l, 8); - my $p = PDL->new_from_specification(PDL::Type->new(0), length($l)); - ${$p->get_dataref} = $l; - $p->upd_data; - $l = $p; - $fh->close; - $l = AI::MXNet::NDArray->array($l, dtype => 'int32')->aspdl; - } - if ($fh->open($data_file, "rb")) - { - $fh->read($d, 100_000_000); - $d = substr($d, 16); - my $p = PDL->new_from_specification(PDL::Type->new(0), length($d)); - ${$p->get_dataref} = $d; - $p->upd_data; - $d = $p; - $fh->close; - $d->reshape(1, 28, 28, $l->dim(-1)); - } - $self->data(AI::MXNet::NDArray->array($d, dtype => 'uint8')); - $self->label($l); -} - -__PACKAGE__->register('AI::MXNet::Gluon::Data::Vision'); - -package AI::MXNet::Gluon::Data::Vision::DownloadedDataSet::FashionMNIST; -use Mouse; - -=head1 NAME - - AI::MXNet::Gluon::Data::Vision::DownloadedDataSet::FashionMNIST -=cut - -=head1 DESCRIPTION - - A dataset of Zalando's article images consisting of fashion products, - a drop-in replacement of the original MNIST dataset from - `https://github.com/zalandoresearch/fashion-mnist`_. - - Each sample is an image (in 3D NDArray) with shape (28, 28, 1). - - Parameters - ---------- - root : str - Path to temp folder for storing data. - Defaults to ~/.mxnet/datasets/mnist - train : bool - Whether to load the training or testing set. - Defaults to True - transform : function - A user defined callback that transforms each instance. 
For example: - - transform => sub { my ($data, $label) = @_; return ($data->astype('float32')/255, $label) } -=cut - -extends 'AI::MXNet::Gluon::Data::Vision::DownloadedDataSet::MNIST'; -has '+root' => (default => '~/.mxnet/datasets/fashion-mnist'); -has '+_base_url' => (default => 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/'); -has '+_train_data' => (default => sub { ['train-images-idx3-ubyte.gz', - '0cf37b0d40ed5169c6b3aba31069a9770ac9043d'] }); -has '+_train_label' => (default => sub { ['train-labels-idx1-ubyte.gz', - '236021d52f1e40852b06a4c3008d8de8aef1e40b'] }); -has '+_test_data' => (default => sub { ['t10k-images-idx3-ubyte.gz', - '626ed6a7c06dd17c0eec72fa3be1740f146a2863'] }); -has '+_test_label' => (default => sub { ['t10k-labels-idx1-ubyte.gz', - '17f9ab60e7257a1620f4ad76bbbaf857c3920701'] }); - -__PACKAGE__->register('AI::MXNet::Gluon::Data::Vision'); - -package AI::MXNet::Gluon::Data::Vision::DownloadedDataSet::CIFAR10; -use Mouse; -use AI::MXNet::Gluon::Utils qw(download); -use AI::MXNet::Base; -use Cwd; -extends 'AI::MXNet::Gluon::Data::Vision::DownloadedDataSet'; - -=head1 NAME - - AI::MXNet::Gluon::Data::Vision::DownloadedDataSet::CIFAR10 -=cut - -=head1 DESCRIPTION - - CIFAR10 image classification dataset from `https://www.cs.toronto.edu/~kriz/cifar.html`_. - - Each sample is an image (in 3D NDArray) with shape (32, 32, 3). - - Parameters - ---------- - root : str - Path to temp folder for storing data. - train : bool - Whether to load the training or testing set. - transform : function - A user defined callback that transforms each instance. For example: - - transform => sub { my ($data, $label) = @_; return ($data->astype('float32')/255, $label) } -=cut -has '+root' => (default => '~/.mxnet/datasets/cifar10'); -has '+train' => (default => 1); -has '_file_hashes' => (is => 'ro', default => sub { +{ - qw/data_batch_1.bin aadd24acce27caa71bf4b10992e9e7b2d74c2540 - data_batch_2.bin c0ba65cce70568cd57b4e03e9ac8d2a5367c1795 - data_batch_3.bin 1dd00a74ab1d17a6e7d73e185b69dbf31242f295 - data_batch_4.bin aab85764eb3584312d3c7f65fd2fd016e36a258e - data_batch_5.bin 26e2849e66a845b7f1e4614ae70f4889ae604628 - test_batch.bin 67eb016db431130d61cd03c7ad570b013799c88c/ - } }); - -method _read_batch(Str $filename) -{ - my $data = join('', IO::File->new($filename)->getlines); - $data = PDL->new_from_specification(PDL::Type->new(0), length($data))->reshape(3073, length($data)/3073); - $data = AI::MXNet::NDArray->array($data, dtype => 'uint8'); - return ( - $data->slice('X', [1, -1])->sever->reshape([-1, 3, 32, 32])->transpose([0, 2, 3, 1]), - $data->slice('X', 0)->astype('int32') - ); -} - -method _get_data() -{ - my @file_paths = map { [$_, join('/', $self->root, 'cifar-10-batches-bin/', $_)] } keys %{ $self->_file_hashes }; - if(grep { not -f $_->[1] or not check_sha1($_->[1], $self->_file_hashes->{ $_->[0] }) } @file_paths) - { - my $filename = download( - 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/cifar10/cifar-10-binary.tar.gz', - path => $self->root, - sha1_hash => 'fab780a1e191a7eda0f345501ccd62d20f7ed891' - ); - my $tar = Archive::Tar->new($filename); - my $cwd = cwd(); - chdir($self->root); - $tar->extract; - chdir($cwd); - } - my ($data, $label); - if($self->train) - { - my (@data, @label); - for my $i (1..5) - { - my $filename = join('/', $self->root, "data_batch_$i.bin"); - my ($data, $label) = $self->_read_batch($filename); - push @data, $data; - push @label, $label; - } - $data =
AI::MXNet::NDArray->concatenate(\@data); - $label = AI::MXNet::NDArray->concatenate(\@label); - } - else - { - my $filename = join('/', $self->root, "test_batch.bin"); - ($data, $label) = $self->_read_batch($filename); - } - $self->data(\@{$data}); - $self->label($label->aspdl); -} - -__PACKAGE__->register('AI::MXNet::Gluon::Data::Vision'); - -package AI::MXNet::Gluon::Data::Vision::RecordFileSet::ImageRecordDataset; -use Mouse; -extends 'AI::MXNet::Gluon::Data::RecordFileSet'; -=head1 NAME - - AI::MXNet::Gluon::Data::Vision::RecordFileSet::ImageRecordDataset -=cut - -=head1 DESCRIPTION - - A dataset wrapping over a RecordIO file containing images. - - Each sample is an image and its corresponding label. - - Parameters - ---------- - filename : str - Path to rec file. - flag : {0, 1}, default 1 - If 0, always convert images to greyscale. - - If 1, always convert images to colored (RGB). - transform : function - A user defined callback that transforms each instance. For example: -=cut -has 'flag' => (is => 'rw', isa => 'Bool', default => 1); -has 'transform' => (is => 'rw', isa => 'Maybe[CodeRef]'); - -method at(Int $idx) -{ - my $record = $self->SUPER::at($idx); - my ($header, $img) = AI::MXNet::RecordIO->unpack($record); - if(defined $self->transform) - { - my $data = [AI::MXNet::Image->imdecode($img)]; - return [$self->transform->( - AI::MXNet::Image->imdecode($img, flag => $self->flag), $header->label - )]; - } - return [AI::MXNet::Image->imdecode($img, flag => $self->flag), $header->label]; -} - -__PACKAGE__->register('AI::MXNet::Gluon::Data::Vision'); - -package AI::MXNet::Gluon::Data::Vision::Set::ImageFolderDataset; -use Mouse; -extends 'AI::MXNet::Gluon::Data::Set'; -=head1 NAME - - AI::MXNet::Gluon::Data::Vision::ImageFolderDataset -=cut - -=head1 DESCRIPTION - - A dataset for loading image files stored in a folder structure like:: - - root/car/0001.jpg - root/car/xxxa.jpg - root/car/yyyb.jpg - root/bus/123.jpg - root/bus/023.jpg - root/bus/wwww.jpg - - Parameters - ---------- - root : str - Path to root directory. - flag : {0, 1}, default 1 - If 0, always convert loaded images to greyscale (1 channel). - If 1, always convert loaded images to colored (3 channels). - transform : callable - A function that takes data and label and transforms them:: - - transform = lambda data, label: (data.astype(np.float32)/255, label) - - Attributes - ---------- - synsets : list - List of class names. `synsets[i]` is the name for the integer label `i` - items : list of tuples - List of all images in (filename, label) pairs. 
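-
-    A minimal usage sketch (the directory path and its contents below are
-    hypothetical, not taken from the original docs)::
-
-        my $dataset = AI::MXNet::Gluon::Data::Vision::Set::ImageFolderDataset->new(
-            root => './images/train',   # one sub-folder per class, as in the layout above
-            transform => sub { my ($data, $label) = @_; return ($data->astype('float32')/255, $label) }
-        );
-        printf("%d classes, %d images\n", scalar(@{ $dataset->synsets }), $dataset->len);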
-=cut -has 'root' => (is => 'rw', isa => 'Str'); -has 'flag' => (is => 'rw', isa => 'Bool', default => 1); -has 'transform' => (is => 'rw', isa => 'Maybe[CodeRef]'); -has [qw/exts - synsets - items/] => (is => 'rw', init_arg => undef); -method python_constructor_arguments() { ['root', 'flag', 'transform'] } - -sub BUILD -{ - my $self = shift; - my $root = $self->root; - $root =~ s/~/$ENV{HOME}/; - $self->root($root); - $self->exts({'.jpg', 1, '.jpeg', 1, '.png', 1}); - $self->list_images($self->root); -} - -method list_images(Str $root) -{ - $self->synsets([]); - $self->items([]); - - for my $path (sort(glob("$root/*"))) - { - my $folder = $path; - $folder =~ s,^.+/,,; - if(not -d $path) - { - AI::MXNet::Logging->warning("Ignoring %s, which is not a directory.", $folder); - next; - } - my $label = @{ $self->synsets }; - push @{ $self->synsets }, $folder; - for my $filename (sort(glob("$path/*"))) - { - my ($ext) = $filename =~ /(\.[^\.]+)$/; - if(not $ext or not exists $self->exts->{lc $ext}) - { - AI::MXNet::Logging->warning( - 'Ignoring %s of type %s. Only support .jpg, .jpeg, .png', - $filename, $ext//'undef' - ); - next; - } - push @{ $self->items }, [$filename, AI::MXNet::NDArray->array([$label], dtype => 'int32')->aspdl]; - } - } -} - -method at(Int $idx) -{ - my $img = AI::MXNet::Image->imread($self->items->[$idx][0], flag => $self->flag); - my $label = $self->items->[$idx][1]; - if(defined $self->transform) - { - return [$self->transform->($img, $label)]; - } - return [$img, $label]; -} - -method len() -{ - return scalar(@{ $self->items }); -} - -__PACKAGE__->register('AI::MXNet::Gluon::Data::Vision'); - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm deleted file mode 100644 index a5938595df6f..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm +++ /dev/null @@ -1,999 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -package AI::MXNet::Gluon::Loss; -use AI::MXNet::NS; -use AI::MXNet::Gluon::Block; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::Gluon::Loss - Base class for loss. -=cut - -=head2 DESCRIPTION - - Base class for loss. - - Parameters - ---------- - weight : float or None - Global scalar weight for loss. - batch_axis : int, default 0 - The axis that represents mini-batch. -=cut - -=head2 _apply_weighting - - Apply weighting to loss. - - Parameters - ---------- - loss : Symbol - The loss to be weighted. - weight : float or None - Global scalar weight for loss. - sample_weight : Symbol or None - Per sample weighting. Must be broadcastable to - the same shape as loss. 
For example, if loss has
-        shape (64, 10) and you want to weight each sample
-        in the batch separately, `sample_weight` should have
-        shape (64, 1).
-
-    Returns
-    -------
-    loss : Symbol
-        Weighted loss
-=cut
-
-
-method _apply_weighting(Str $F, GluonInput $loss, Maybe[Num] $weight=, Maybe[GluonInput] $sample_weight=)
-{
-    if(defined $sample_weight)
-    {
-        $loss = $F->broadcast_mul($loss, $sample_weight);
-    }
-    if(defined $weight)
-    {
-        $loss = $loss * $weight;
-    }
-    return $loss;
-}
-
-# Reshapes x to the same shape as y
-method _reshape_like(GluonClass $F, GluonInput $x, GluonInput $y)
-{
-    if($F eq 'AI::MXNet::NDArray')
-    {
-        return $x->reshape($y->shape);
-    }
-    else
-    {
-        return $F->reshape_like($x, $y);
-    }
-}
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-has 'weight' => (is => 'rw', isa => 'Num');
-has 'batch_axis' => (is => 'rw', isa => 'Int', default => 0);
-
-use overload '""' => sub {
-        my $self = shift;
-        sprintf(
-            "%s(batch_axis=%s, w=%s)",
-            $self->_class_name,
-            $self->batch_axis,
-            $self->weight
-        );
-    };
-
-method hybrid_forward($F, $x, @args)
-{
-    confess('NotImplementedError');
-}
-
-package AI::MXNet::Gluon::L2Loss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-
-=head1 NAME
-
-    AI::MXNet::Gluon::L2Loss
-=cut
-
-=head1 DESCRIPTION
-
-    Calculates the mean squared error between output and label:
-
-    .. math::
-        L = \\frac{1}{2}\\sum_i \\vert {output}_i - {label}_i \\vert^2.
-
-    Output and label can have arbitrary shape as long as they have the same
-    number of elements.
-
-    Parameters
-    ----------
-    weight : float or None
-        Global scalar weight for loss.
-    sample_weight : Symbol or None
-        Per sample weighting. Must be broadcastable to
-        the same shape as loss. For example, if loss has
-        shape (64, 10) and you want to weight each sample
-        in the batch, `sample_weight` should have shape (64, 1).
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
-=cut
-has '+weight' => (default => 1);
-has '+batch_axis' => (default => 0);
-
-method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=)
-{
-
-    $label = __PACKAGE__->_reshape_like($F, $label, $pred);
-    my $loss = $F->square($pred - $label);
-    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight/2, $sample_weight);
-    return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-package AI::MXNet::Gluon::L1Loss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-has '+weight' => (default => 1);
-has '+batch_axis' => (default => 0);
-
-=head1 NAME
-
-    AI::MXNet::Gluon::L1Loss
-=cut
-
-=head1 DESCRIPTION
-
-    Calculates the mean absolute error between output and label:
-
-    .. math::
-        L = \\sum_i \\vert {output}_i - {label}_i \\vert.
-
-    Output and label must have the same shape.
-
-    Parameters
-    ----------
-    weight : float or None
-        Global scalar weight for loss.
-    sample_weight : Symbol or None
-        Per sample weighting. Must be broadcastable to
-        the same shape as loss. For example, if loss has
-        shape (64, 10) and you want to weight each sample
-        in the batch, `sample_weight` should have shape (64, 1).
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
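-
-    A minimal usage sketch (the input values are made up)::
-
-        my $loss_fn = AI::MXNet::Gluon::L1Loss->new;
-        my $pred  = mx->nd->array([[1, 2], [3, 4]]);
-        my $label = mx->nd->array([[1, 1], [4, 2]]);
-        my $loss  = $loss_fn->($pred, $label);   # mean |pred - label| per sample: [0.5, 1.5]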
-=cut - -method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=) -{ - $label = __PACKAGE__->_reshape_like($F, $label, $pred); - my $loss = $F->abs($pred - $label); - $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight); - return $F->mean($loss, axis => $self->batch_axis, exclude => 1); -} - -__PACKAGE__->register('AI::MXNet::Gluon::Loss'); - -package AI::MXNet::Gluon::SigmoidBinaryCrossEntropyLoss; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::Loss'; -has 'from_sigmoid' => (is => 'ro', isa => 'Bool', default => 0); -has '+batch_axis' => (default => 0); - -=head1 NAME - - AI::MXNet::Gluon::SigmoidBinaryCrossEntropyLoss -=cut - -=head1 DESCRIPTION - - The cross-entropy loss for binary classification. (alias: SigmoidBCELoss) - - BCE loss is useful when training logistic regression. - - .. math:: - loss(o, t) = - 1/n \sum_i (t[i] * log(o[i]) + (1 - t[i]) * log(1 - o[i])) - - - Parameters - ---------- - from_sigmoid : bool, default is `False` - Whether the input is from the output of sigmoid. Set this to false will make - the loss calculate sigmoid and then BCE, which is more numerically stable through - log-sum-exp trick. - weight : float or None - Global scalar weight for loss. - sample_weight : Symbol or None - Per sample weighting. Must be broadcastable to - the same shape as loss. For example, if loss has - shape (64, 10) and you want to weight each sample - in the batch, `sample_weight` should have shape (64, 1). - batch_axis : int, default 0 - The axis that represents mini-batch. -=cut - -method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=) -{ - $label = __PACKAGE__->_reshape_like($F, $label, $pred); - my $loss; - if(not $self->from_sigmoid) - { - $loss = $F->relu($pred) - $pred * $label + $F->Activation(-$F->abs($pred), act_type=>'softrelu'); - } - else - { - $loss = -($F->log($pred+1e-12)*$label + $F->log(1-$pred+1e-12)*(1-$label)); - } - $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight); - return $F->mean($loss, axis => $self->batch_axis, exclude => 1); -} - -__PACKAGE__->register('AI::MXNet::Gluon::Loss'); - -package AI::MXNet::Gluon::SigmoidBCELoss; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::SigmoidBinaryCrossEntropyLoss'; - -__PACKAGE__->register('AI::MXNet::Gluon::Loss'); - -package AI::MXNet::Gluon::SoftmaxCrossEntropyLoss; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::Loss'; - - -=head1 NAME - - AI::MXNet::Gluon::SoftmaxCrossEntropyLoss -=cut - -=head1 DESCRIPTION - - Computes the softmax cross entropy loss. (alias: SoftmaxCELoss) - - If `sparse_label` is `True`, label should contain integer category indicators: - - .. math:: - p = {softmax}({output}) - - L = -\\sum_i {log}(p_{i,{label}_i}) - - Label's shape should be output's shape without the `axis` dimension. i.e. for - `output.shape` = (1,2,3,4) and axis = 2, `label.shape` should be (1,2,4). - - If `sparse_label` is `False`, label should contain probability distribution - with the same shape as output: - - .. math:: - p = {softmax}({output}) - - L = -\\sum_i \\sum_j {label}_j {log}(p_{ij}) - - Parameters - ---------- - axis : int, default -1 - The axis to sum over when computing softmax and entropy. - sparse_label : bool, default True - Whether label is an integer array instead of probability distribution. 
-    from_logits : bool, default False
-        Whether input is a log probability (usually from log_softmax) instead
-        of unnormalized numbers.
-    weight : float or None
-        Global scalar weight for loss.
-    sample_weight : Symbol or None
-        Per sample weighting. Must be broadcastable to
-        the same shape as loss. For example, if loss has
-        shape (64, 10) and you want to weight each sample
-        in the batch, `sample_weight` should have shape (64, 1).
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
-=cut
-
-has 'axis' => (is => 'ro', isa => 'Int', default => -1);
-has '+batch_axis' => (default => 0);
-has 'sparse_label' => (is => 'ro', isa => 'Bool', default => 1);
-has 'from_logits' => (is => 'ro', isa => 'Bool', default => 0);
-
-method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=)
-{
-    if(not $self->from_logits)
-    {
-        $pred = $F->log_softmax($pred, axis => $self->axis);
-    }
-    my $loss;
-    if($self->sparse_label)
-    {
-        $loss = -$F->pick($pred, $label, axis=>$self->axis, keepdims => 1);
-    }
-    else
-    {
-        $label = __PACKAGE__->_reshape_like($F, $label, $pred);
-        $loss = -$F->sum($pred*$label, axis => $self->axis, keepdims => 1);
-    }
-    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
-    return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-package AI::MXNet::Gluon::SoftmaxCELoss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::SoftmaxCrossEntropyLoss';
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-
-package AI::MXNet::Gluon::KLDivLoss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-has '+batch_axis' => (default => 0);
-has 'axis' => (is => 'ro', isa => 'Int', default => -1);
-has 'from_logits' => (is => 'ro', isa => 'Bool', default => 1);
-
-=head1 NAME
-
-    AI::MXNet::Gluon::KLDivLoss
-=cut
-
-=head1 DESCRIPTION
-
-    The Kullback-Leibler divergence loss.
-
-    KL divergence is a useful distance measure for continuous distributions
-    and is often useful when performing direct regression over the space of
-    (discretely sampled) continuous output distributions.
-
-    .. _Kullback-Leibler divergence:
-        https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
-    .. math::
-        L = 1/n \\sum_i (label_i * (log(label_i) - output_i))
-
-    Label's shape should be the same as output's.
-
-    Parameters
-    ----------
-    from_logits : bool, default is `True`
-        Whether the input is log probability (usually from log_softmax) instead
-        of unnormalized numbers.
-    weight : float or None
-        Global scalar weight for loss.
-    axis : int, default -1
-        The dimension along which to compute softmax. Only used when `from_logits`
-        is False.
-    sample_weight : Symbol or None
-        Per sample weighting. Must be broadcastable to
-        the same shape as loss. For example, if loss has
-        shape (64, 10) and you want to weight each sample
-        in the batch, `sample_weight` should have shape (64, 1).
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
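-
-    A minimal usage sketch (the distributions are made up; with the default
-    `from_logits => 1` the prediction must already be log-probabilities)::
-
-        my $loss_fn = AI::MXNet::Gluon::KLDivLoss->new;
-        my $pred  = mx->nd->array([[0.7, 0.2, 0.1]])->log;   # log-probabilities
-        my $label = mx->nd->array([[0.6, 0.3, 0.1]]);
-        my $loss  = $loss_fn->($pred, $label);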
-=cut - -method hybrid_forward(GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=) -{ - if(not $self->from_logits) - { - $pred = $F->log_softmax($pred, axis => $self->axis); - } - my $loss = $label * ($F->log($label+1e-12) - $pred); - $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight); - return $F->mean($loss, axis => $self->batch_axis, exclude => 1); -} - -__PACKAGE__->register('AI::MXNet::Gluon::Loss'); - -package AI::MXNet::Gluon::CTCLoss; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::Loss'; -has 'layout' => (is => 'rw', isa => 'Str', default => 'NTC'); -has 'label_layout' => (is => 'rw', isa => 'Str', default => 'NT'); - -=head1 NAME - - AI::MXNet::Gluon::CTCLoss -=cut - -=head1 DESCRIPTION - - Connectionist Temporal Classification Loss. - - See `"Connectionist Temporal Classification: Labelling Unsegmented - Sequence Data with Recurrent Neural Networks" - `_ paper for more information. - - Parameters - ---------- - layout : str, default 'NTC' - Layout of the output sequence activation vector. - label_layout : str, default 'NT' - Layout of the labels. - weight : float or None - Global scalar weight for loss. - sample_weight : Symbol or None - Per sample weighting. Must be broadcastable to - the same shape as loss. For example, if loss has - shape (64, 10) and you want to weight each sample - in the batch, `sample_weight` should have shape (64, 1). - This should be used as the fifth argument when calling this loss. - - Input shapes: - `data` is an activation tensor (i.e. before softmax). - Its shape depends on `layout`. For `layout='TNC'`, this - input has shape `(sequence_length, batch_size, alphabet_size)` - Note that the last dimension with index `alphabet_size-1` is reserved for special - blank character. - - `label` is the label index matrix with zero-indexed labels. - Its shape depends on `label_layout`. For `label_layout='TN'`, this - input has shape `(label_sequence_length, batch_size)`. Padding mask of value ``-1`` - is available for dealing with unaligned label lengths. - When `label_lengths` is specified, label lengths are directly used and padding mask - is not allowed in the label. - When `label_lengths` is not specified, the first occurrence of ``-1`` - in each sample marks the end of the label sequence of that sample. - - For example, suppose the vocabulary is `[a, b, c]`, and in one batch we have three - sequences 'ba', 'cbb', and 'abac'. We can index the labels as `{'a': 0, 'b': 1, 'c': 2}`. - The alphabet size should be 4, and we reserve the channel index 3 for blank label - in data tensor. The padding mask value for extra length is -1, so the resulting `label` - tensor should be padded to be:: - - [[1, 0, -1, -1], [2, 1, 1, -1], [0, 1, 0, 2]] - - `data_lengths` is optional and defaults to None. - When specified, it represents the actual lengths of data. - The shape should be (batch_size,). - If None, the data lengths are treated as being equal to the max sequence length. - This should be used as the third argument when calling this loss. - - `label_lengths` is optional and defaults to None. - When specified, it represents the actual lengths of labels. - The shape should be (batch_size,). - If None, the label lengths are derived from the first occurrence of - the value specified by `padding_mask`. - This should be used as the fourth argument when calling this loss. - - Output shape: - The CTC loss output has the shape (batch_size,). 
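-
-    A minimal usage sketch mirroring the 'ba'/'cbb'/'abac' example above
-    (the activations are dummy ones, so the loss value is meaningless)::
-
-        my $loss_fn = AI::MXNet::Gluon::CTCLoss->new;   # layout 'NTC', label_layout 'NT'
-        my $data  = mx->nd->ones([3, 10, 4]);           # batch=3, T=10, alphabet(3)+blank
-        my $label = mx->nd->array([[1, 0, -1, -1], [2, 1, 1, -1], [0, 1, 0, 2]]);
-        my $loss  = $loss_fn->($data, $label);          # shape (3,)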
-=cut
-use AI::MXNet::Base;
-
-sub BUILD
-{
-    my $self = shift;
-    assert(
-        (grep { $_ eq $self->layout } ('NTC', 'TNC')),
-        "Only 'NTC' and 'TNC' layouts for output are supported. Got: ${\ $self->layout }"
-    );
-    assert(
-        (grep { $_ eq $self->label_layout } ('NT', 'TN')),
-        "Only 'NT' and 'TN' layouts for label are supported. Got: ${\ $self->label_layout }"
-    );
-    $self->batch_axis(index($self->label_layout, 'N'));
-}
-
-method hybrid_forward(
-    GluonClass $F, GluonInput $data, GluonInput $label,
-    Maybe[GluonInput] $data_lengths=, Maybe[GluonInput] $label_lengths=, Maybe[GluonInput] $sample_weight=
-)
-{
-    if($self->layout eq 'NTC')
-    {
-        $data = $F->swapaxes($data, dim1 => 0, dim2 => 1);
-    }
-    if($self->batch_axis == 1)
-    {
-        $label = $F->swapaxes($label, dim1 => 0, dim2 => 1);
-    }
-    my $loss = $F->contrib->CTCLoss(
-        $data, $label,
-        (defined $data_lengths ? $data_lengths : ()),
-        (defined $label_lengths ? $label_lengths : ()),
-        use_data_lengths => defined $data_lengths ? 1 : 0,
-        use_label_lengths => defined $label_lengths ? 1 : 0,
-        blank_label=>'last'
-    );
-    return $self->_apply_weighting($F, $loss, $self->weight, $sample_weight);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-package AI::MXNet::Gluon::HuberLoss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-has 'rho' => (is => 'rw', isa => 'Num', default => 1);
-
-=head1 NAME
-
-    AI::MXNet::Gluon::HuberLoss
-=cut
-
-=head1 DESCRIPTION
-
-    Calculates smoothed L1 loss that is equal to L1 loss if absolute error
-    exceeds rho but is equal to L2 loss otherwise. Also called SmoothedL1 loss.
-
-    .. math::
-        L = \sum_i \begin{cases} \frac{1}{2 {rho}} ({pred}_i - {label}_i)^2 &
-                           \text{ if } |{pred}_i - {label}_i| < {rho} \\
-                           |{pred}_i - {label}_i| - \frac{{rho}}{2} &
-                           \text{ otherwise }
-            \end{cases}
-
-    `pred` and `label` can have arbitrary shape as long as they have the same
-    number of elements.
-
-    Parameters
-    ----------
-    rho : float, default 1
-        Threshold for trimmed mean estimator.
-    weight : float or None
-        Global scalar weight for loss.
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
-
-
-    Inputs:
-        - **pred**: prediction tensor with arbitrary shape
-        - **label**: target tensor with the same size as pred.
-        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
-          to the same shape as pred. For example, if pred has shape [64, 10]
-          and you want to weigh each sample in the batch separately,
-          sample_weight should have shape [64, 1].
-
-    Outputs:
-        - **loss**: loss tensor with shape [batch_size]. Dimensions other than
-          batch_axis are averaged out.
-=cut
-
-method hybrid_forward(
-    GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=
-)
-{
-    $label = __PACKAGE__->_reshape_like($F, $label, $pred);
-    my $loss = $F->abs($pred - $label);
-    $loss = $F->where(
-        $loss > $self->rho, $loss - 0.5 * $self->rho,
-        (0.5/$self->rho) * $F->square($loss)
-    );
-    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
-    return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-package AI::MXNet::Gluon::HingeLoss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-has 'margin' => (is => 'rw', isa => 'Num', default => 1);
-
-=head1 NAME
-
-    AI::MXNet::Gluon::HingeLoss
-=cut
-
-=head1 DESCRIPTION
-
-    Calculates the hinge loss function often used in SVMs:
-
-    .. math::
-        L = \sum_i max(0, {margin} - {pred}_i \cdot {label}_i)
-
-    where `pred` is the classifier prediction and `label` is the target tensor
-    containing values -1 or 1. `pred` and `label` must have the same number of
-    elements.
-
-    Parameters
-    ----------
-    margin : float
-        The margin in hinge loss. Defaults to 1.0
-    weight : float or None
-        Global scalar weight for loss.
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
-
-
-    Inputs:
-        - **pred**: prediction tensor with arbitrary shape.
-        - **label**: truth tensor with values -1 or 1. Must have the same size
-          as pred.
-        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
-          to the same shape as pred. For example, if pred has shape (64, 10)
-          and you want to weigh each sample in the batch separately,
-          sample_weight should have shape (64, 1).
-
-    Outputs:
-        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
-          batch_axis are averaged out.
-=cut
-
-method hybrid_forward(
-    GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=
-)
-{
-    $label = __PACKAGE__->_reshape_like($F, $label, $pred);
-    my $loss = $F->relu($self->margin - $pred * $label);
-    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
-    return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-package AI::MXNet::Gluon::SquaredHingeLoss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-has 'margin' => (is => 'rw', isa => 'Num', default => 1);
-
-=head1 NAME
-
-    AI::MXNet::Gluon::SquaredHingeLoss
-=cut
-
-=head1 DESCRIPTION
-
-    Calculates the soft-margin loss function used in SVMs:
-
-    .. math::
-        L = \sum_i max(0, {margin} - {pred}_i \cdot {label}_i)^2
-
-    where `pred` is the classifier prediction and `label` is the target tensor
-    containing values -1 or 1. `pred` and `label` can have arbitrary shape as
-    long as they have the same number of elements.
-
-    Parameters
-    ----------
-    margin : float
-        The margin in hinge loss. Defaults to 1.0
-    weight : float or None
-        Global scalar weight for loss.
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
-
-
-    Inputs:
-        - **pred**: prediction tensor with arbitrary shape
-        - **label**: truth tensor with values -1 or 1. Must have the same size
-          as pred.
-        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
-          to the same shape as pred. For example, if pred has shape (64, 10)
-          and you want to weigh each sample in the batch separately,
-          sample_weight should have shape (64, 1).
-
-    Outputs:
-        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
-          batch_axis are averaged out.
-=cut
-
-method hybrid_forward(
-    GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=
-)
-{
-    $label = __PACKAGE__->_reshape_like($F, $label, $pred);
-    my $loss = $F->square($F->relu($self->margin - $pred * $label));
-    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
-    return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-package AI::MXNet::Gluon::LogisticLoss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-has 'label_format' => (is => 'rw', isa => 'Str', default => 'signed');
-
-=head1 NAME
-
-    AI::MXNet::Gluon::LogisticLoss
-=cut
-
-=head1 DESCRIPTION
-
-    Calculates the logistic loss (for binary losses only):
-
-    .. math::
-        L = \sum_i \log(1 + \exp(- {pred}_i \cdot {label}_i))
-
-    where `pred` is the classifier prediction and `label` is the target tensor
-    containing values -1 or 1 (0 or 1 if `label_format` is binary).
-    `pred` and `label` can have arbitrary shape as long as they have the same number of elements.
-
-    Parameters
-    ----------
-    weight : float or None
-        Global scalar weight for loss.
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
-    label_format : str, default 'signed'
-        Can be either 'signed' or 'binary'. If the label_format is 'signed', all label values should
-        be either -1 or 1. If the label_format is 'binary', all label values should be either
-        0 or 1.
-
-    Inputs:
-        - **pred**: prediction tensor with arbitrary shape.
-        - **label**: truth tensor with values -1/1 (label_format is 'signed')
-          or 0/1 (label_format is 'binary'). Must have the same size as pred.
-        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
-          to the same shape as pred. For example, if pred has shape (64, 10)
-          and you want to weigh each sample in the batch separately,
-          sample_weight should have shape (64, 1).
-
-    Outputs:
-        - **loss**: loss tensor with shape (batch_size,). Dimensions other than
-          batch_axis are averaged out.
-=cut
-
-sub BUILD
-{
-    my $self = shift;
-    if(not ($self->label_format eq 'signed' or $self->label_format eq 'binary'))
-    {
-        confess(sprintf("label_format can only be signed or binary, received %s", $self->label_format));
-    }
-}
-
-method hybrid_forward(
-    GluonClass $F, GluonInput $pred, GluonInput $label, Maybe[GluonInput] $sample_weight=
-)
-{
-    $label = __PACKAGE__->_reshape_like($F, $label, $pred);
-    if($self->label_format eq 'signed')
-    {
-        $label = ($label + 1) / 2; # Transform label to be either 0 or 1
-    }
-    # Use a stable formula in computation
-    my $loss = $F->relu($pred) - $pred * $label + $F->Activation(-$F->abs($pred), act_type=>'softrelu');
-    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
-    return $F->mean($loss, axis => $self->batch_axis, exclude => 1);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-package AI::MXNet::Gluon::TripletLoss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-has 'margin' => (is => 'rw', isa => 'Num', default => 1);
-
-=head1 NAME
-
-    AI::MXNet::Gluon::TripletLoss
-=cut
-
-=head1 DESCRIPTION
-
-    Calculates triplet loss given three input tensors and a positive margin.
-    Triplet loss measures the relative similarity between prediction, a positive
-    example and a negative example:
-
-    .. math::
-        L = \sum_i \max(\Vert {pred}_i - {pos_i} \Vert_2^2 -
-                        \Vert {pred}_i - {neg_i} \Vert_2^2 + {margin}, 0)
-
-    `pred`, `positive` and `negative` can have arbitrary shape as long as they
-    have the same number of elements.
-
-    Parameters
-    ----------
-    margin : float
-        Margin of separation between correct and incorrect pair.
-    weight : float or None
-        Global scalar weight for loss.
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
-
-
-    Inputs:
-        - **pred**: prediction tensor with arbitrary shape
-        - **positive**: positive example tensor with arbitrary shape. Must have
-          the same size as pred.
-        - **negative**: negative example tensor with arbitrary shape. Must have
-          the same size as pred.
-
-    Outputs:
-        - **loss**: loss tensor with shape (batch_size,).
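-
-    A minimal usage sketch (the embeddings are made up)::
-
-        my $loss_fn = AI::MXNet::Gluon::TripletLoss->new(margin => 1);
-        my $anchor   = mx->nd->array([[0.3, 0.2]]);
-        my $positive = mx->nd->array([[0.3, 0.1]]);
-        my $negative = mx->nd->array([[1.0, 0.8]]);
-        my $loss     = $loss_fn->($anchor, $positive, $negative);   # shape (1,)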
-=cut
-
-method hybrid_forward(
-    GluonClass $F, GluonInput $pred, GluonInput $positive, GluonInput $negative, Maybe[GluonInput] $sample_weight=
-)
-{
-    $positive = __PACKAGE__->_reshape_like($F, $positive, $pred);
-    $negative = __PACKAGE__->_reshape_like($F, $negative, $pred);
-    my $loss = $F->sum($F->square($pred-$positive) - $F->square($pred-$negative),
-        axis=>$self->batch_axis, exclude=>1);
-    $loss = $F->relu($loss + $self->margin);
-    return __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-package AI::MXNet::Gluon::PoissonNLLLoss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-has 'from_logits' => (is => 'ro', isa => 'Bool', default => 1);
-has 'compute_full' => (is => 'ro', isa => 'Bool', default => 0);
-
-=head1 NAME
-
-    AI::MXNet::Gluon::PoissonNLLLoss
-=cut
-
-=head1 DESCRIPTION
-
-    For a target (Random Variable) in a Poisson distribution, the function calculates the Negative
-    Log likelihood loss.
-    PoissonNLLLoss measures the loss accrued from a Poisson regression prediction made by the model.
-
-    .. math::
-        L = \text{pred} - \text{target} * \log(\text{pred}) + \log(\text{target!})
-
-    `pred`, `target` can have arbitrary shape as long as they have the same number of elements.
-
-    Parameters
-    ----------
-    from_logits : boolean, default True
-        indicating whether log(predicted) value has already been computed. If True, the loss is computed as
-        :math:`\exp(\text{pred}) - \text{target} * \text{pred}`, and if False, then loss is computed as
-        :math:`\text{pred} - \text{target} * \log(\text{pred}+\text{epsilon})`. The default value is True.
-    weight : float or None
-        Global scalar weight for loss.
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
-    compute_full: boolean, default False
-        Indicates whether to add an approximation (Stirling factor) for the Factorial term in the formula for the loss.
-        The Stirling factor is:
-        :math:`\text{target} * \log(\text{target}) - \text{target} + 0.5 * \log(2 * \pi * \text{target})`
-    epsilon: float, default 1e-08
-        This is to avoid calculating log(0) which is not defined.
-
-
-    Inputs:
-        - **pred**: Predicted value
-        - **target**: Random variable(count or number) which belongs to a Poisson distribution.
-        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
-          to the same shape as pred. For example, if pred has shape (64, 10)
-          and you want to weigh each sample in the batch separately,
-          sample_weight should have shape (64, 1).
-
-    Outputs:
-        - **loss**: Average loss (shape=(1,1)) of the loss tensor with shape (batch_size,).
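-
-    A minimal usage sketch (the rates and counts are made up)::
-
-        my $loss_fn = AI::MXNet::Gluon::PoissonNLLLoss->new(from_logits => 0);
-        my $pred   = mx->nd->array([[1.5], [0.5]]);   # predicted rates
-        my $target = mx->nd->array([[2], [1]]);       # observed counts
-        my $loss   = $loss_fn->($pred, $target);      # scalar mean loss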
-=cut
-
-method hybrid_forward(
-    GluonClass $F, GluonInput $pred, GluonInput $target,
-    Maybe[GluonInput] $sample_weight=, Maybe[Num] $epsilon=1e-08
-)
-{
-    $target = __PACKAGE__->_reshape_like($F, $target, $pred);
-    my $loss;
-    if($self->from_logits)
-    {
-        $loss = $F->exp($pred) - $target * $pred;
-    }
-    else
-    {
-        $loss = $pred - $target * $F->log($pred + $epsilon);
-        if($self->compute_full)
-        {
-            my $stirling_factor = $target * $F->log($target) - $target + 0.5 * $F->log(2 * $target * 3.1415926);
-            $stirling_factor *= ($target > 1);
-            $loss += $stirling_factor;
-        }
-    }
-    # weighting applies to both branches
-    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
-    return $F->mean($loss);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::Loss');
-
-package AI::MXNet::Gluon::CosineEmbeddingLoss;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Loss';
-has 'margin' => (is => 'rw', isa => 'Num', default => 0);
-
-=head1 NAME
-
-    AI::MXNet::Gluon::CosineEmbeddingLoss
-=cut
-
-=head1 DESCRIPTION
-
-    For a target label 1 or -1, vectors input1 and input2, the function computes the cosine distance
-    between the vectors. This can be interpreted as how similar/dissimilar two input vectors are.
-
-    .. math::
-
-        L = \sum_i \begin{cases} 1 - {cos\_sim({input1}_i, {input2}_i)} & \text{ if } {label}_i = 1\\
-                         {cos\_sim({input1}_i, {input2}_i)} & \text{ if } {label}_i = -1 \end{cases}\\
-        cos\_sim(input1, input2) = \frac{{input1}_i.{input2}_i}{||{input1}_i||.||{input2}_i||}
-
-    `input1`, `input2` can have arbitrary shape as long as they have the same number of elements.
-
-    Parameters
-    ----------
-    weight : float or None
-        Global scalar weight for loss.
-    batch_axis : int, default 0
-        The axis that represents mini-batch.
-    margin : float
-        Margin of separation between correct and incorrect pair.
-
-
-    Inputs:
-        - **input1**: a tensor with arbitrary shape
-        - **input2**: another tensor with same shape as input1 to which input1 is
-          compared for similarity and loss calculation
-        - **label**: A 1-D tensor indicating for each pair input1 and input2, target label is 1 or -1
-        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
-          to the same shape as input1. For example, if input1 has shape (64, 10)
-          and you want to weigh each sample in the batch separately,
-          sample_weight should have shape (64, 1).
-
-    Outputs:
-        - **loss**: The loss tensor with shape (batch_size,).
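-
-    A minimal usage sketch (the vectors are made up)::
-
-        my $loss_fn = AI::MXNet::Gluon::CosineEmbeddingLoss->new;
-        my $input1 = mx->nd->array([[1, 0], [0, 1]]);
-        my $input2 = mx->nd->array([[1, 0], [1, 0]]);
-        my $label  = mx->nd->array([1, -1]);   # 1: should be similar, -1: dissimilar
-        my $loss   = $loss_fn->($input1, $input2, $label);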
-=cut - -method hybrid_forward( - GluonClass $F, GluonInput $input1, GluonInput $input2, GluonInput $label, Maybe[GluonInput] $sample_weight= -) -{ - $input1 = __PACKAGE__->_reshape_like($F, $input1, $input2); - $label = $label->reshape([-1, 1]); - my $cos_sim = $self->_cosine_similarity($F, $input1, $input2); - my $y_1 = $label == 1; - my $y_minus_1 = $label == -1; - my $cos_sim_a = (1 - $cos_sim) * $y_1; - - my $z_array; - if($F eq 'AI::MXNet::NDArray') - { - $z_array = $F->array([0]); - } - else - { - $z_array = $F->zeros([1, 1]); - } - my $cos_sim_b = $F->broadcast_maximum($z_array, $y_minus_1 * ($cos_sim - $self->margin), { axis=>1 }); - my $loss = $cos_sim_a + $cos_sim_b; - $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight); - return $loss; -} - -method _cosine_similarity($F, $x, $y, $axis=-1) -{ - my $x_norm = $F->norm($x, axis=>$axis)->reshape([-1, 1]); - my $y_norm = $F->norm($y, axis=>$axis)->reshape([-1, 1]); - my $x_dot_y = $F->sum($x*$y, axis=>$axis)->reshape([-1, 1]); - my $eps_arr; - if($F eq 'AI::MXNet::NDArray') - { - $eps_arr = $F->array([1e-12]); - } - else - { - $eps_arr = $F->full([1, 1], 1e-12); - } - return ($x_dot_y / $F->broadcast_maximum($x_norm * $y_norm, $eps_arr)); -} - -__PACKAGE__->register('AI::MXNet::Gluon::Loss'); - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Mouse.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Mouse.pm deleted file mode 100644 index 2d1e9cf6f09a..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Mouse.pm +++ /dev/null @@ -1,63 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::Mouse; -use strict; -use warnings; -use Mouse; -use Mouse::Exporter; -no Mouse; - -Mouse::Exporter->setup_import_methods( - as_is => [ - 'has', - \&Mouse::extends, - \&Mouse::with, - \&Mouse::before, - \&Mouse::after, - \&Mouse::around, - \&Mouse::override, - \&Mouse::super, - \&Mouse::augment, - \&Mouse::inner, - \&Scalar::Util::blessed, - \&Carp::confess - ] -); - -sub init_meta { return Mouse::init_meta(@_) } -sub has -{ - my $name = shift; - my %args = @_; - my $caller = delete $args{caller} // caller; - my $meta = $caller->meta; - - $meta->throw_error(q{Usage: has 'name' => ( key => value, ... )}) - if @_ % 2; # odd number of arguments - - for my $n (ref($name) ? 
@{$name} : $name){ - $meta->add_attribute( - $n, - trigger => sub { my $self = shift; $self->__setattr__($n, @_); }, - %args - ); - } - return; -} - -1; \ No newline at end of file diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN.pm deleted file mode 100644 index c8b4c7f4ef60..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN.pm +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::NN; -use strict; -use warnings; -use AI::MXNet::NS 'global'; -use AI::MXNet::Gluon::Block; -use AI::MXNet::Gluon::NN::Activation; -use AI::MXNet::Gluon::NN::BasicLayers; -use AI::MXNet::Gluon::NN::ConvLayers; - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN/Activation.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN/Activation.pm deleted file mode 100644 index 63fd80d705b8..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN/Activation.pm +++ /dev/null @@ -1,244 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::NN::Activation; -use strict; -use warnings; -use AI::MXNet::Function::Parameters; - -=head1 - - AI::MXNet::Gluon::NN::Activation -=cut - -=head1 DESCRIPTION - - Applies an activation function to input. - - Parameters - ---------- - activation : str - Name of activation function to use. - See mxnet.ndarray.Activation for available choices. - - Input shape: - Arbitrary. - - Output shape: - Same shape as input. 
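-
-    A minimal usage sketch (nn->Activation follows the same factory
-    convention as the other examples in this file)::
-
-        my $layer = nn->Activation('relu');
-        my $out = $layer->(mx->nd->array([[-1, 2]]));   # [[0, 2]]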
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-has 'activation' => (is => 'ro', isa => 'Str', required => 1);
-
-method python_constructor_arguments()
-{
-    ['activation'];
-}
-
-method _alias()
-{
-    return $self->activation;
-}
-
-method hybrid_forward(GluonClass $F, GluonInput $x)
-{
-    return $F->Activation($x, act_type => $self->activation, name=>'fwd');
-}
-
-use overload '""' => sub { my $self = shift; "${\ $self->_class_name }(${\ $self->activation })"; };
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::LeakyReLU;
-=head1
-
-    AI::MXNet::Gluon::NN::LeakyReLU - Leaky version of a Rectified Linear Unit.
-=cut
-
-=head1 DESCRIPTION
-
-    Leaky version of a Rectified Linear Unit.
-
-    It allows a small gradient when the unit is not active.
-
-    Parameters
-    ----------
-    alpha : float
-        slope coefficient for the negative half axis. Must be >= 0.
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-has 'alpha' => (is => 'ro', isa => 'Num', required => 1);
-
-method python_constructor_arguments()
-{
-    ['alpha'];
-}
-
-sub BUILD
-{
-    confess('Slope coefficient for LeakyReLU must be no less than 0')
-        unless shift->alpha >= 0;
-}
-
-method hybrid_forward(GluonClass $F, GluonInput $x)
-{
-    return $F->LeakyReLU($x, act_type => 'leaky', slope => $self->alpha, name=>'fwd');
-}
-
-use overload '""' => sub { my $self = shift; "${\ $self->_class_name }(${\ $self->alpha })"; };
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::PReLU;
-=head1
-
-    AI::MXNet::Gluon::NN::PReLU - Parametric leaky version of a Rectified Linear Unit.
-=cut
-
-=head1 DESCRIPTION
-
-    Parametric leaky version of a Rectified Linear Unit.
-    https://arxiv.org/abs/1502.01852
-
-    It learns a gradient when the unit is not active.
-
-    Parameters
-    ----------
-    alpha_initializer : Initializer
-        Initializer for the `alpha` weight.
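-
-    A minimal usage sketch (with the default Constant(0.25) initializer)::
-
-        my $layer = nn->PReLU();
-        $layer->initialize(ctx => mx->cpu(0));
-        my $out = $layer->(mx->nd->array([[-1, 2]]));   # [[-0.25, 2]]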
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-has 'alpha_initializer' => (is => 'ro', isa => 'Initializer', default => sub { AI::MXNet::Constant->new(0.25) });
-
-method python_constructor_arguments()
-{
-    ['alpha_initializer'];
-}
-
-sub BUILD
-{
-    my $self = shift;
-    $self->name_scope(sub {
-        $self->alpha($self->params->get('alpha', shape=>[1], init=>$self->alpha_initializer));
-    });
-}
-
-method hybrid_forward(GluonClass $F, GluonInput $x, GluonInput :$alpha)
-{
-    return $F->LeakyReLU($x, gamma => $alpha, act_type => 'prelu', name=>'fwd');
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::ELU;
-=head1
-
-    AI::MXNet::Gluon::NN::ELU - Exponential Linear Unit (ELU)
-=cut
-
-=head1 DESCRIPTION
-
-    Exponential Linear Unit (ELU)
-    "Fast and Accurate Deep Network Learning by Exponential Linear Units", Clevert et al, 2016
-    https://arxiv.org/abs/1511.07289
-    Published as a conference paper at ICLR 2016
-
-    Parameters
-    ----------
-    alpha : float
-        The alpha parameter as described by Clevert et al, 2016
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-has 'alpha' => (is => 'ro', isa => 'Num', default => 1);
-
-method python_constructor_arguments()
-{
-    ['alpha'];
-}
-
-method hybrid_forward(GluonClass $F, GluonInput $x)
-{
-    return $F->where($x > 0, $x, $self->alpha * ($F->exp($x) - 1));
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::SELU;
-=head1
-
-    AI::MXNet::Gluon::NN::SELU - Scaled Exponential Linear Unit (SELU)
-=cut
-
-=head1 DESCRIPTION
-
-    Scaled Exponential Linear Unit (SELU)
-    "Self-Normalizing Neural Networks", Klambauer et al, 2017
-    https://arxiv.org/abs/1706.02515
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-
-method hybrid_forward(GluonClass $F, GluonInput $x)
-{
-    $F->LeakyReLU($x, act_type=>'selu', name=>'fwd');
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::Swish;
-=head1
-
-    AI::MXNet::Gluon::NN::Swish - Swish Activation function
-=cut
-
-=head1 DESCRIPTION
-
-    Swish Activation function
-    https://arxiv.org/pdf/1710.05941.pdf
-
-    Parameters
-    ----------
-    beta : float
-        swish(x) = x * sigmoid(beta*x)
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-has 'beta' => (is => 'ro', isa => 'Num', default => 1);
-
-method python_constructor_arguments()
-{
-    ['beta'];
-}
-
-method hybrid_forward(GluonClass $F, GluonInput $x)
-{
-    return $x * $F->sigmoid($self->beta * $x, name=>'fwd');
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN/BasicLayers.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN/BasicLayers.pm
deleted file mode 100644
index 6c554bfd0626..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN/BasicLayers.pm
+++ /dev/null
@@ -1,923 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-package AI::MXNet::Gluon::NN::Sequential;
-use AI::MXNet::Function::Parameters;
-
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::Sequential
-=cut
-
-=head2 DESCRIPTION
-
-    Stacks `Block`s sequentially.
-
-    Example::
-
-        my $net = nn->Sequential();
-        # use net's name_scope to give child Blocks appropriate names.
-        $net->name_scope(sub {
-            $net->add(nn->Dense(10, activation=>'relu'));
-            $net->add(nn->Dense(20));
-        });
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::Block';
-
-=head2
-
-    Adds block on top of the stack.
-=cut
-
-method add(AI::MXNet::Gluon::Block @block)
-{
-    $self->register_child($_) for @block;
-}
-
-
-method forward($x)
-{
-    for my $block ($self->_children->values)
-    {
-        $x = $block->($x);
-    }
-    return $x;
-}
-
-use overload
-    '""' => sub
-    {
-        my $self = shift;
-        my $s = "%s(\n%s\n)";
-        my @blocks;
-        my $k = 0;
-        for my $v ($self->_children->values)
-        {
-            push @blocks, "  ($k): ".AI::MXNet::Base::_indent("$v", 2);
-            $k++;
-        }
-        sprintf("%s(\n%s\n)", $self->_class_name, join("\n", @blocks));
-    },
-    '@{}' => sub { [shift->_children->values] };
-
-method slice(Slice $slice)
-{
-    my $new = __PACKAGE__->new;
-    $new->add(@{ $self }[ @$slice ]);
-    return $new;
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::HybridSequential;
-
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::HybridSequential
-=cut
-
-=head2 DESCRIPTION
-
-    Stacks `HybridBlock`s sequentially.
-
-    Example::
-
-        my $net = nn->HybridSequential();
-        # use net's name_scope to give child Blocks appropriate names.
-        $net->name_scope(sub {
-            $net->add(nn->Dense(10, activation=>'relu'));
-            $net->add(nn->Dense(20));
-        });
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-
-=head2
-
-    Adds block on top of the stack.
-=cut
-
-method add(AI::MXNet::Gluon::HybridBlock @block)
-{
-    $self->register_child($_) for @block;
-}
-
-
-method hybrid_forward($F, $x)
-{
-    for my $block ($self->_children->values)
-    {
-        $x = $block->($x);
-    }
-    return $x;
-}
-
-use overload
-    '""' => sub
-    {
-        my $self = shift;
-        my $s = "%s(\n%s\n)";
-        my @blocks;
-        my $k = 0;
-        for my $v ($self->_children->values)
-        {
-            push @blocks, "  ($k): ".AI::MXNet::Base::_indent("$v", 2);
-            $k++;
-        }
-        sprintf("%s(\n%s\n)", $self->_class_name, join("\n", @blocks));
-    },
-    '@{}' => sub { [shift->_children->values] };
-
-method slice(Slice $slice)
-{
-    my $new = __PACKAGE__->new;
-    $new->add(@{ $self }[ @$slice ]);
-    return $new;
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::Dense;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-
-method python_constructor_arguments()
-{
-    ['units'];
-}
-
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::Dense
-
-=head1 DESCRIPTION
-
-    Just your regular densely-connected NN layer.
-
-    `Dense` implements the operation:
-    `output = activation(dot(input, weight) + bias)`
-    where `activation` is the element-wise activation function
-    passed as the `activation` argument, `weight` is a weights matrix
-    created by the layer, and `bias` is a bias vector created by the layer
-    (only applicable if `use_bias` is `True`).
-
-    Note: the input must be a tensor with rank 2. Use `flatten` to convert it
-    to rank 2 manually if necessary.
-
-    Parameters
-    ----------
-    units : int
-        Dimensionality of the output space.
-    activation : str
-        Activation function to use. See help on `Activation` layer.
-        If you don't specify anything, no activation is applied
-        (ie. "linear" activation: `a(x) = x`).
-    use_bias : bool
-        Whether the layer uses a bias vector.
-    flatten : bool, default true
-        Whether the input tensor should be flattened.
-        If true, all but the first axis of input data are collapsed together.
-        If false, all but the last axis of input data are kept the same, and the transformation
-        applies on the last axis.
-    weight_initializer : str or `Initializer`
-        Initializer for the `kernel` weights matrix.
-    bias_initializer: str or `Initializer`
-        Initializer for the bias vector.
-    in_units : int, optional
-        Size of the input data. If not specified, initialization will be
-        deferred to the first time `forward` is called and `in_units`
-        will be inferred from the shape of input data.
-    prefix : str or None
-        See document of `Block`.
-    params : ParameterDict or None
-        See document of `Block`.
-
-    If flatten is set to be True, then the shapes are:
-    Input shape:
-        An N-D input with shape
-        `(batch_size, x1, x2, ..., xn) with x1 * x2 * ... * xn equal to in_units`.
-
-    Output shape:
-        The output would have shape `(batch_size, units)`.
-
-    If ``flatten`` is set to be false, then the shapes are:
-    Input shape:
-        An N-D input with shape
-        `(x1, x2, ..., xn, in_units)`.
-
-    Output shape:
-        The output would have shape `(x1, x2, ..., xn, units)`.
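-
-    A minimal usage sketch (shapes only; the weights are randomly initialized)::
-
-        my $layer = nn->Dense(2, in_units => 3, activation => 'relu');
-        $layer->initialize(ctx => mx->cpu(0));
-        my $out = $layer->(mx->nd->ones([4, 3]));   # output shape (4, 2)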
-=cut - -has 'units' => (is => 'rw', isa => 'Int', required => 1); -has 'activation' => (is => 'rw', isa => 'Str'); -has 'use_bias' => (is => 'rw', isa => 'Bool', default => 1); -has 'flatten' => (is => 'rw', isa => 'Bool', default => 1); -has 'weight_initializer' => (is => 'rw', isa => 'Initializer'); -has 'bias_initializer' => (is => 'rw', isa => 'Initializer', default => 'zeros'); -has 'in_units' => (is => 'rw', isa => 'Int', default => 0); -has [qw/weight bias act/] => (is => 'rw', init_arg => undef); - -sub BUILD -{ - my $self = shift; - $self->name_scope(sub { - $self->weight( - $self->params->get( - 'weight', shape => [$self->units, $self->in_units], - init => $self->weight_initializer, - allow_deferred_init => 1 - ) - ); - if($self->use_bias) - { - $self->bias( - $self->params->get( - 'bias', shape => [$self->units], - init => $self->bias_initializer, - allow_deferred_init => 1 - ) - ); - } - if(defined $self->activation) - { - $self->act( - AI::MXNet::Gluon::NN->Activation( - activation => $self->activation, - prefix => $self->activation.'_' - ) - ); - } - }); -} - -method hybrid_forward(GluonClass $F, GluonInput $x, GluonInput :$weight, Maybe[GluonInput] :$bias=) -{ - my $act; - if(not defined $bias) - { - $act = $F->FullyConnected($x, $weight, no_bias => 1, num_hidden => $self->units, name => 'fwd'); - } - else - { - $act = $F->FullyConnected($x, $weight, $bias, num_hidden => $self->units, flatten => $self->flatten, name => 'fwd') - } - if(defined $self->act) - { - $act = $self->act->($act); - } - return $act; -} - -use overload '""' => sub { - my $self = shift; - "${\ $self->_class_name }(${\ $self->units } -> ${\ $self->in_units }," - ." @{[ $self->act ? $self->act : 'linear' ]})" -}; - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::Dropout; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Dropout -=cut - -=head1 DESCRIPTION - - Applies Dropout to the input. - - Dropout consists in randomly setting a fraction `rate` of input units - to 0 at each update during training time, which helps prevent overfitting. - - Parameters - ---------- - rate : float - Fraction of the input units to drop. Must be a number between 0 and 1. - - - Input shape: - Arbitrary. - - Output shape: - Same shape as input. - - References - ---------- - `Dropout: A Simple Way to Prevent Neural Networks from Overfitting - `_ -=cut -has 'rate' => (is => 'ro', isa => 'Dropout', required => 1); -method python_constructor_arguments() { ['rate'] } - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - return $F->Dropout($x, p => $self->rate, name => 'fwd'); -} - -use overload '""' => sub { my $self = shift; "${\ $self->_class_name }(p = ${\ $self->rate })"; }; - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::BatchNorm; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::NN::BatchNorm -=cut - -=head1 DESCRIPTION - - Batch normalization layer (Ioffe and Szegedy, 2014). - Normalizes the input at each batch, i.e. applies a transformation - that maintains the mean activation close to 0 and the activation - standard deviation close to 1. - - Parameters - ---------- - axis : int, default 1 - The axis that should be normalized. This is typically the channels - (C) axis. For instance, after a `Conv2D` layer with `layout='NCHW'`, - set `axis=1` in `BatchNorm`. If `layout='NHWC'`, then set `axis=3`. 
- momentum: float, default 0.9 - Momentum for the moving average. - epsilon: float, default 1e-5 - Small float added to variance to avoid dividing by zero. - center: bool, default True - If True, add offset of `beta` to normalized tensor. - If False, `beta` is ignored. - scale: bool, default True - If True, multiply by `gamma`. If False, `gamma` is not used. - When the next layer is linear (also e.g. `nn.relu`), - this can be disabled since the scaling - will be done by the next layer. - beta_initializer: str or `Initializer`, default 'zeros' - Initializer for the beta weight. - gamma_initializer: str or `Initializer`, default 'ones' - Initializer for the gamma weight. - moving_mean_initializer: str or `Initializer`, default 'zeros' - Initializer for the moving mean. - moving_variance_initializer: str or `Initializer`, default 'ones' - Initializer for the moving variance. - in_channels : int, default 0 - Number of channels (feature maps) in input data. If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - - - Input shape: - Arbitrary. - - Output shape: - Same shape as input. -=cut - -has 'axis' => (is => 'ro', isa => 'DimSize', default => 1); -has 'momentum' => (is => 'ro', isa => 'Num', default => 0.9); -has 'epsilon' => (is => 'ro', isa => 'Num', default => 1e-5); -has 'center' => (is => 'ro', isa => 'Bool', default => 1); -has 'scale' => (is => 'ro', isa => 'Bool', default => 1); -has 'beta_initializer' => (is => 'ro', isa => 'Initializer', default => 'zeros'); -has [qw/gamma_initializer - running_mean_initializer - running_variance_initializer - /] => (is => 'ro', isa => 'Initializer', default => 'ones'); -has 'in_channels' => (is => 'ro', isa => 'DimSize', default => 0); -has [qw/_kwargs - gamma - beta - running_mean - running_var/] => (is => 'rw', init_arg => undef); - -sub BUILD -{ - my $self = shift; - $self->_kwargs({ - axis => $self->axis, - eps => $self->epsilon, - momentum => $self->momentum, - fix_gamma => $self->scale ? 0 : 1 - }); - - $self->gamma( - $self->params->get( - 'gamma', grad_req => $self->scale ? 'write' : 'null', - shape => [$self->in_channels], init => $self->gamma_initializer, - allow_deferred_init => 1, differentiable => $self->scale - ) - ); - $self->beta( - $self->params->get( - 'beta', grad_req => $self->center ? 'write' : 'null', - shape => [$self->in_channels], init => $self->beta_initializer, - allow_deferred_init => 1, differentiable => $self->center - ) - ); - $self->running_mean( - $self->params->get( - 'running_mean', grad_req => 'null', - shape => [$self->in_channels], init => $self->running_mean_initializer, - allow_deferred_init => 1, differentiable => 0 - ) - ); - $self->running_var( - $self->params->get( - 'running_var', grad_req => $self->center ? 'write' : 'null', - shape => [$self->in_channels], init => $self->running_variance_initializer, - allow_deferred_init => 1, differentiable => 0 - ) - ); -} - -method hybrid_forward( - GluonClass $F, GluonInput $x, - GluonInput :$gamma, GluonInput :$beta, - GluonInput :$running_mean, GluonInput :$running_var -) -{ - return $F->BatchNorm( - $x, $gamma, $beta, $running_mean, $running_var, - name =>'fwd', %{ $self->_kwargs } - ); -} - -use overload '""' => sub { - my $self = shift; - my $f = "%s(%s".($self->in_channels ? 
", in_channels=".$self->in_channels : '').')'; - my $content = join(", ", map { join('=', $_, $self->_kwargs->{$_}) } keys %{ $self->_kwargs }); - return sprintf($f, $self->_class_name, $content); -}; - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::Embedding; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Embedding -=cut - -=head1 DESCRIPTION - - Turns non-negative integers (indexes/tokens) into dense vectors - of fixed size. eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]] - - - Parameters - ---------- - input_dim : int - Size of the vocabulary, i.e. maximum integer index + 1. - output_dim : int - Dimension of the dense embedding. - dtype : str or np.dtype, default 'float32' - Data type of output embeddings. - weight_initializer : Initializer - Initializer for the `embeddings` matrix. - sparse_grad: bool - If True, gradient w.r.t. weight will be a 'row_sparse' NDArray. -=cut - -has [qw/input_dim - output_dim/] => (is => 'ro', isa => 'DimSize', required => 1); -has 'dtype' => (is => 'ro', isa => 'Dtype', default => 'float32'); -has 'weight_initalizer' => (is => 'ro', isa => 'Maybe[Initializer]'); -has 'sparse_grad' => (is => 'ro', isa => 'Bool', default => 0); -has [qw/_kwargs weight/] => (is => 'rw', init_arg => undef); -method python_constructor_arguments() -{ - ['input_dim', 'output_dim']; -} - -sub BUILD -{ - my $self = shift; - $self->_kwargs({ - input_dim => $self->input_dim, - output_dim => $self->output_dim, - dtype => $self->dtype, - sparse_grad => $self->sparse_grad - }); - $self->weight( - $self->params->get( - 'weight', - shape => [$self->input_dim, $self->output_dim], - init => $self->weight_initializer, - allow_deferred_init => 1, - dtype => $self->dtype, - grad_stype => ($self->sparse_grad ? 'row_sparse' : 'default') - ) - ); -} - -method hybrid_forward(GluonClass $F, GluonInput $x, GluonInput :$weight) -{ - return $F->Embedding($x, $weight, name => 'fwd', %{ $self->_kwargs }); -} - -use overload '""' => sub { - my $self = shift; - "${\ $self->_class_name }(${\ $self->input_dim } -> ${\ $self->output_dim }, ${\ $self->dtype })"; -}; - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::Flatten; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Flatten -=cut - -=head1 DESCRIPTION - - Flattens the input to two dimensional. - - Input shape: - Arbitrary shape `(N, a, b, c, ...)` - - Output shape: - 2D tensor with shape: `(N, a*b*c...)` -=cut - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - return $x->reshape([0, -1]); -} - -use overload '""' => sub { shift->_class_name }; - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::InstanceNorm; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::NN::InstanceNorm - Applies instance normalization to the n-dimensional input array. -=cut - -=head1 DESCRIPTION - - Applies instance normalization to the n-dimensional input array. - This operator takes an n-dimensional input array where (n>2) and normalizes - the input using the following formula: - - Parameters - ---------- - axis : int, default 1 - The axis that will be excluded in the normalization process. This is typically the channels - (C) axis. For instance, after a `Conv2D` layer with `layout='NCHW'`, - set `axis=1` in `InstanceNorm`. If `layout='NHWC'`, then set `axis=3`. 
Data will be - normalized along axes excluding the first axis and the axis given. - epsilon: float, default 1e-5 - Small float added to variance to avoid dividing by zero. - center: bool, default True - If True, add offset of `beta` to normalized tensor. - If False, `beta` is ignored. - scale: bool, default True - If True, multiply by `gamma`. If False, `gamma` is not used. - When the next layer is linear (also e.g. `nn.relu`), - this can be disabled since the scaling - will be done by the next layer. - beta_initializer: str or `Initializer`, default 'zeros' - Initializer for the beta weight. - gamma_initializer: str or `Initializer`, default 'ones' - Initializer for the gamma weight. - in_channels : int, default 0 - Number of channels (feature maps) in input data. If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - - References - ---------- - Instance Normalization: The Missing Ingredient for Fast Stylization - - - Examples - -------- - >>> # Input of shape (2,1,2) - >>> $x = mx->nd->array([[[ 1.1, 2.2]], - ... [[ 3.3, 4.4]]]); - >>> $layer = nn->InstanceNorm() - >>> $layer->initialize(ctx=>mx->cpu(0)) - >>> $layer->($x) - [[[-0.99998355 0.99998331]] - [[-0.99998319 0.99998361]]] - -=cut - -has 'axis' => (is => 'ro', isa => 'Int', default => 1); -has 'epsilon' => (is => 'ro', isa => 'Num', default => 1e-5); -has 'center' => (is => 'ro', isa => 'Bool', default => 1); -has 'scale' => (is => 'ro', isa => 'Bool', default => 0); -has 'beta_initializer' => (is => 'rw', isa => 'Initializer', default => 'zeros'); -has 'gamma_initializer' => (is => 'rw', isa => 'Initializer', default => 'ones'); -has 'in_channels' => (is => 'rw', isa => 'Int', default => 0); -has [qw/_kwargs - gamma beta/] => (is => 'rw', init_arg => undef); -method python_constructor_arguments() -{ - [qw/axis epsilon center scale beta_initializer gamma_initializer in_channels/]; -} - - -sub BUILD -{ - my $self = shift; - $self->_kwargs(Hash::Ordered->new(eps => $self->epsilon, axis => $self->axis, center => $self->center, scale => $self->scale)); - $self->gamma( - $self->params->get( - 'gamma', grad_req => $self->scale ? 'write' :'null', - shape => [$self->in_channels], init => $self->gamma_initializer, - allow_deferred_init => 1 - ) - ); - $self->beta( - $self->params->get( - 'beta', grad_req => $self->scale ? 'write' :'null', - shape => [$self->in_channels], init => $self->beta_initializer, - allow_deferred_init => 1 - ) - ); -} - -method hybrid_forward(GluonClass $F, GluonInput $x, GluonInput :$gamma, GluonInput :$beta) -{ - if($self->axis == 1) - { - return $F->InstanceNorm( - $x, $gamma, $beta, - name=>'fwd', eps=>$self->epsilon - ); - } - $x = $x->swapaxes(1, $self->axis); - return $F->InstanceNorm( - $x, $gamma, $beta, name=>'fwd', - eps => $self->epsilon - )->swapaxes(1, $self->axis); -} - -use overload '""' => sub { - my $self = shift; - my $in_channels = ", in_channels=${\ $self->in_channels }"; - my $content = join(', ', map { join('=', $_, $self->_kwargs->get($_)) } $self->_kwargs->keys); - return "${\ $self->_class_name }($content, $in_channels)"; -}; - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::LayerNorm; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::NN::LayerNorm - Applies layer normalization to the n-dimensional input array. 
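(The InstanceNorm block deleted above calls the backend operator directly only for axis == 1 and transposes otherwise; its docstring alludes to a formula that was dropped from the POD. A hedged sketch of both, with illustrative shapes only:)

    # Per sample and per channel:
    #   out = gamma * (x - mean(x)) / sqrt(var(x) + epsilon) + beta
    my $nchw = nn->InstanceNorm();             # axis => 1, the NCHW default
    my $nhwc = nn->InstanceNorm(axis => 3);    # channels-last input
    $_->initialize() for ($nchw, $nhwc);
    my $y = $nchw->(mx->nd->random->uniform(shape => [2, 3, 8, 8]));  # op called directly
    my $z = $nhwc->(mx->nd->random->uniform(shape => [2, 8, 8, 3]));  # swapaxes(1, 3) around the op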
-=cut - -=head1 DESCRIPTION - - Applies layer normalization to the n-dimensional input array. - This operator takes an n-dimensional input array and normalizes - the input using the given axis: - - Parameters - ---------- - axis : int, default -1 - The axis that should be normalized. This is typically the axis of the channels. - epsilon: float, default 1e-5 - Small float added to variance to avoid dividing by zero. - center: bool, default True - If True, add offset of `beta` to normalized tensor. - If False, `beta` is ignored. - scale: bool, default True - If True, multiply by `gamma`. If False, `gamma` is not used. - beta_initializer: str or `Initializer`, default 'zeros' - Initializer for the beta weight. - gamma_initializer: str or `Initializer`, default 'ones' - Initializer for the gamma weight. - in_channels : int, default 0 - Number of channels (feature maps) in input data. If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - - References - ---------- - `Layer Normalization - `_ - - Examples - -------- - >>> # Input of shape (2, 5) - >>> $x = mx->nd->array([[1, 2, 3, 4, 5], [1, 1, 2, 2, 2]]) - >>> # Layer normalization is calculated with the above formula - >>> $layer = nn->LayerNorm() - >>> $layer->initialize(ctx=>mx->cpu(0)) - >>> $layer->($x) - [[-1.41421 -0.707105 0. 0.707105 1.41421 ] - [-1.2247195 -1.2247195 0.81647956 0.81647956 0.81647956]] - -=cut - -has 'axis' => (is => 'ro', isa => 'Int', default => -1); -has 'epsilon' => (is => 'ro', isa => 'Num', default => 1e-5); -has 'center' => (is => 'ro', isa => 'Bool', default => 1); -has 'scale' => (is => 'ro', isa => 'Bool', default => 0); -has 'beta_initializer' => (is => 'rw', isa => 'Initializer', default => 'zeros'); -has 'gamma_initializer' => (is => 'rw', isa => 'Initializer', default => 'ones'); -has 'in_channels' => (is => 'rw', isa => 'Int', default => 0); -has [qw/_kwargs - gamma beta/] => (is => 'rw', init_arg => undef); -method python_constructor_arguments() -{ - [qw/axis epsilon center scale beta_initializer gamma_initializer in_channels/]; -} - -sub BUILD -{ - my $self = shift; - $self->_kwargs(Hash::Ordered->new(eps => $self->epsilon, axis => $self->axis, center => $self->center, scale => $self->scale)); - $self->gamma( - $self->params->get( - 'gamma', grad_req => $self->scale ? 'write' :'null', - shape => [$self->in_channels], init => $self->gamma_initializer, - allow_deferred_init => 1 - ) - ); - $self->beta( - $self->params->get( - 'beta', grad_req => $self->scale ? 'write' :'null', - shape => [$self->in_channels], init => $self->beta_initializer, - allow_deferred_init => 1 - ) - ); -} - -method hybrid_forward(GluonClass $F, GluonInput $x, GluonInput :$gamma, GluonInput :$beta) -{ - return $F->LayerNorm( - $x, $gamma, $beta, - eps => $self->epsilon, axis => $self->axis - ); -} - -use overload '""' => sub { - my $self = shift; - my $in_channels = ", in_channels=${\ $self->in_channels }"; - my $content = join(', ', map { join('=', $_, $self->_kwargs->get($_)) } $self->_kwargs->keys); - return "${\ $self->_class_name }($content, $in_channels)"; -}; - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::Lambda; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::Block'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Lambda - Wraps an operator or an expression as a Block object. -=cut - -=head1 DESCRIPTION - - Wraps an operator or an expression as a Block object. 
- - Parameters - ---------- - function : str or sub - Function used in lambda must be one of the following: - 1) the name of an operator that is available in ndarray. For example - - $block = nn->Lambda('tanh') - - 2) a sub. For example - - $block = nn->Lambda(sub { my $x = shift; nd->LeakyReLU($x, slope=>0.1) }); -=cut - -has '_func_impl' => (is => 'rw', init_arg => 'function', isa => 'Str|CodeRef', required => 1); -has '_func_name' => (is => 'rw', init_arg => undef, default => 'custom_sub'); -method python_constructor_arguments() { ['function'] } - -sub BUILD -{ - my $self = shift; - if(not ref $self->_func_impl) - { - confess("Function name ${\ $self->_func_impl } is not found in ndarray.") - unless AI::MXNet::NDArray->can($self->_func_impl); - $self->_func_name($self->_func_impl); - my $f = $self->_func_impl; - $self->_func_impl(sub { return AI::MXNet::NDArray->$f(@_) }); - } -} - -method forward(@args) -{ - return $self->_func_impl->(@args); -} - -use overload '""' => sub { - my $self = shift; - return "${\ $self->_class_name }(${\ $self->_func_name })"; -}; - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::HybridLambda; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -=head1 NAME - - AI::MXNet::Gluon::NN::HybridLambda - Wraps an operator or an expression as a HybridBlock object. -=cut - -=head1 DESCRIPTION - - Wraps an operator or an expression as a HybridBlock object. - - Parameters - ---------- - function : str or sub - Function used in lambda must be one of the following: - 1) the name of an operator that is available in symbol and ndarray. For example - - $block = nn->Lambda('tanh') - - 2) a sub. For example - - $block = nn->Lambda(sub { my $F = shift; $F->LeakyReLU($x, slope=>0.1) }); -=cut - -has '_func_impl' => (is => 'rw', init_arg => 'function', isa => 'Str|CodeRef', required => 1); -has '_func_name' => (is => 'rw', init_arg => undef, default => 'custom_sub'); -method python_constructor_arguments() { ['function'] } - -sub BUILD -{ - my $self = shift; - if(not ref $self->_func_impl) - { - confess("Function name ${\ $self->_func_impl } is not found in ndarray.") - unless AI::MXNet::NDArray->can($self->_func_impl) or AI::MXNet::Symbol->can($self->_func_impl); - $self->_func_name($self->_func_impl); - my $f = $self->_func_impl; - $self->_func_impl(sub { my $F = shift; return $F->$f(@_) }); - } -} - -method hybrid_forward(@args) -{ - return $self->_func_impl->(@args); -} - -use overload '""' => sub { - my $self = shift; - return "${\ $self->_class_name }(${\ $self->_func_name })"; -}; - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN/ConvLayers.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN/ConvLayers.pm deleted file mode 100644 index 0e11714d4a41..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/NN/ConvLayers.pm +++ /dev/null @@ -1,1418 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
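(A short sketch of the Lambda and HybridLambda wrapper blocks removed above. Note that the sub passed to HybridLambda receives the ndarray or symbol module as its first argument, which the original POD example elided; values here are illustrative.)

    # Wrap the name of an ndarray operator ...
    my $tanh = nn->Lambda('tanh');

    # ... or an arbitrary sub; for HybridLambda the first argument is $F
    my $leaky = nn->HybridLambda(sub {
        my ($F, $x) = @_;
        return $F->LeakyReLU($x, slope => 0.1);
    });
    my $y = $leaky->(mx->nd->ones([2, 2]));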
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -package AI::MXNet::Gluon::NN::Conv; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Symbol; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -func _infer_weight_shape($op_name, $data_shape, $kwargs) -{ - my $sym = AI::MXNet::Symbol->$op_name( - AI::MXNet::Symbol->var('data', shape => $data_shape), %{ $kwargs } - ); - return ($sym->infer_shape_partial)[0]; -} - -=head1 NAME - - AI::MXNet::Gluon::NN::Conv -=cut - -=head1 DESCRIPTION - - Abstract nD convolution layer (private, used as implementation base). - - This layer creates a convolution kernel that is convolved - with the layer input to produce a tensor of outputs. - If `use_bias` is `True`, a bias vector is created and added to the outputs. - Finally, if `activation` is not `None`, - it is applied to the outputs as well. - - Parameters - ---------- - channels : int - The dimensionality of the output space - i.e. the number of output channels in the convolution. - kernel_size : int or tuple/list of n ints - Specifies the dimensions of the convolution window. - strides: int or tuple/list of n ints, - Specifies the strides of the convolution. - padding : int or tuple/list of n ints, - If padding is non-zero, then the input is implicitly zero-padded - on both sides for padding number of points - dilation: int or tuple/list of n ints, - Specifies the dilation rate to use for dilated convolution. - groups : int - Controls the connections between inputs and outputs. - At groups=1, all inputs are convolved to all outputs. - At groups=2, the operation becomes equivalent to having two convolution - layers side by side, each seeing half the input channels, and producing - half the output channels, and both subsequently concatenated. - layout : str, - Dimension ordering of data and weight. Can be 'NCW', 'NWC', 'NCHW', - 'NHWC', 'NCDHW', 'NDHWC', etc. 'N', 'C', 'H', 'W', 'D' stands for - batch, channel, height, width and depth dimensions respectively. - Convolution is performed over 'D', 'H', and 'W' dimensions. - in_channels : int, default 0 - The number of input channels to this layer. If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - activation : str - Activation function to use. See :func:`~mxnet.ndarray.Activation`. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: bool - Whether the layer uses a bias vector. - weight_initializer : str or `Initializer` - Initializer for the `weight` weights matrix. - bias_initializer: str or `Initializer` - Initializer for the bias vector. 
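(The `_infer_weight_shape` helper above derives the weight and bias shapes by partial shape inference on a dummy symbol; that is what makes a deferred `in_channels` work. A sketch using the Conv2D subclass documented further down:)

    # in_channels => 0 (the default): the weight shape is unknown until forward
    my $conv = nn->Conv2D(32, [3, 3]);   # channels, kernel_size
    $conv->initialize();
    my $y = $conv->(mx->nd->random->uniform(shape => [1, 3, 32, 32]));
    # the first forward infers in_channels = 3, so weight becomes [32, 3, 3, 3]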
-=cut - -has 'channels' => (is => 'rw', isa => 'Int', required => 1); -has 'in_channels' => (is => 'rw', isa => 'Int', default => 0); -has 'kernel_size' => (is => 'rw', isa => 'DimSize|Shape', required => 1); -has [qw/strides - padding - dilation/] => (is => 'rw', isa => 'DimSize|Shape'); -has 'groups' => (is => 'rw', isa => 'Int'); -has [qw/layout - activation/] => (is => 'rw', isa => 'Str'); -has 'op_name' => (is => 'rw', isa => 'Str', default => 'Convolution'); -has 'use_bias' => (is => 'rw', isa => 'Bool', default => 1); -has 'weight_initializer' => (is => 'rw', isa => 'Maybe[Initializer]'); -has 'bias_initializer' => (is => 'rw', isa => 'Maybe[Initializer]', default => 'zeros'); -has 'adj' => (is => 'rw'); -has [qw/weight bias - kwargs act/] => (is => 'rw', init_arg => undef); -method python_constructor_arguments() { [qw/channels kernel_size strides padding dilation/] } - -sub BUILD -{ - my $self = shift; - $self->_update_kernel_size; - $self->name_scope(sub { - if(not ref $self->strides) - { - $self->strides([($self->strides) x @{ $self->kernel_size }]); - } - if(not ref $self->padding) - { - $self->padding([($self->padding) x @{ $self->kernel_size }]); - } - if(not ref $self->dilation) - { - $self->dilation([($self->dilation) x @{ $self->kernel_size }]); - } - $self->kwargs({ - kernel => $self->kernel_size, stride => $self->strides, dilate => $self->dilation, - pad => $self->padding, num_filter => $self->channels, num_group => $self->groups, - no_bias => $self->use_bias ? 0 : 1, layout => $self->layout - }); - if(defined $self->adj) - { - $self->kwargs->{adj} = $self->adj; - } - - my @dshape = (0)x(@{ $self->kernel_size } + 2); - $dshape[index($self->layout, 'N')] = 1; - $dshape[index($self->layout, 'C')] = $self->in_channels; - my $wshapes = _infer_weight_shape($self->op_name, \@dshape, $self->kwargs); - $self->weight( - $self->params->get( - 'weight', shape => $wshapes->[1], - init => $self->weight_initializer, - allow_deferred_init => 1 - ) - ); - if($self->use_bias) - { - $self->bias( - $self->params->get( - 'bias', shape => $wshapes->[2], - init => $self->bias_initializer, - allow_deferred_init => 1 - ) - ); - } - if(defined $self->activation) - { - $self->act( - AI::MXNet::Gluon::NN->Activation( - activation => $self->activation, - prefix => $self->activation.'_' - ) - ); - } - }); -} - -method hybrid_forward(GluonClass $F, GluonInput $x, GluonInput :$weight, Maybe[GluonInput] :$bias=) -{ - my $op_name = $self->op_name; - my $act = $F->$op_name($x, $weight, defined $bias ? $bias : (), name => 'fwd', %{ $self->kwargs }); - if(defined $self->act) - { - $act = $self->act->($act); - } - return $act; -} - -method _alias() { 'conv' } - -use Data::Dumper; -use overload '""' => sub { - my $self = shift; - my $s = '%s(%s, kernel_size=(%s), stride=(%s)'; - my $len_kernel_size = @{ $self->kwargs->{kernel} }; - if(Dumper($self->kwargs->{pad}) ne Dumper([(0)x$len_kernel_size])) - { - $s .= ', padding=(' . join(',', @{ $self->kwargs->{pad} }) . ')'; - } - if(Dumper($self->kwargs->{dilate}) ne Dumper([(1)x$len_kernel_size])) - { - $s .= ', dilation=(' . join(',', @{ $self->kwargs->{dilate} }) . ')'; - } - if($self->can('out_pad') and Dumper($self->out_pad) ne Dumper([(0)x$len_kernel_size])) - { - $s .= ', output_padding=(' . join(',', @{ $self->kwargs->{dilate} }) . ')'; - } - if($self->kwargs->{num_group} != 1) - { - $s .= ', groups=' . 
$self->kwargs->{num_group}; - } - if(not defined $self->bias) - { - $s .= ', bias=False'; - } - $s .= ')'; - return sprintf( - $s, - $self->_class_name, - $self->in_channels - ? sprintf("%d -> %d", $self->in_channels, $self->channels) - : sprintf("%d", $self->channels), - join(',', @{ $self->kwargs->{kernel} }), - join(',', @{ $self->kwargs->{stride} }) - ); -}; - -package AI::MXNet::Gluon::NN::Conv1D; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::Conv'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Conv1D -=cut - -=head1 DESCRIPTION - - 1D convolution layer (e.g. temporal convolution). - - This layer creates a convolution kernel that is convolved - with the layer input over a single spatial (or temporal) dimension - to produce a tensor of outputs. - If `use_bias` is True, a bias vector is created and added to the outputs. - Finally, if `activation` is not `None`, - it is applied to the outputs as well. - - If `in_channels` is not specified, `Parameter` initialization will be - deferred to the first time `forward` is called and `in_channels` will be - inferred from the shape of input data. - - - Parameters - ---------- - channels : int - The dimensionality of the output space, i.e. the number of output - channels (filters) in the convolution. - kernel_size :int or tuple/list of 1 int - Specifies the dimensions of the convolution window. - strides : int or tuple/list of 1 int, - Specify the strides of the convolution. - padding : int or a tuple/list of 1 int, - If padding is non-zero, then the input is implicitly zero-padded - on both sides for padding number of points - dilation : int or tuple/list of 1 int - Specifies the dilation rate to use for dilated convolution. - groups : int - Controls the connections between inputs and outputs. - At groups=1, all inputs are convolved to all outputs. - At groups=2, the operation becomes equivalent to having two conv - layers side by side, each seeing half the input channels, and producing - half the output channels, and both subsequently concatenated. - layout: str, default 'NCW' - Dimension ordering of data and weight. Can be 'NCW', 'NWC', etc. - 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions - respectively. Convolution is applied on the 'W' dimension. - in_channels : int, default 0 - The number of input channels to this layer. If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - activation : str - Activation function to use. See :func:`~mxnet.ndarray.Activation`. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias : bool - Whether the layer uses a bias vector. - weight_initializer : str or `Initializer` - Initializer for the `weight` weights matrix. - bias_initializer : str or `Initializer` - Initializer for the bias vector. - - - Input shape: - This depends on the `layout` parameter. Input is 3D array of shape - (batch_size, in_channels, width) if `layout` is `NCW`. - - Output shape: - This depends on the `layout` parameter. Output is 3D array of shape - (batch_size, channels, out_width) if `layout` is `NCW`. 
- out_width is calculated as:: - - out_width = floor((width+2*padding-dilation*(kernel_size-1)-1)/stride)+1 -=cut - -has '+strides' => (default => 1); -has '+padding' => (default => 0); -has '+dilation' => (default => 1); -has '+groups' => (default => 1); -has '+layout' => (default => 'NCW'); - -method _update_kernel_size() -{ - if(not ref $self->kernel_size) - { - $self->kernel_size([$self->kernel_size]); - } - confess("kernel_size must be a number or an array ref of 1 ints") - unless @{ $self->kernel_size } == 1; -} - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::Conv2D; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::Conv'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Conv2D -=cut - -=head1 DESCRIPTION - - 2D convolution layer (e.g. spatial convolution over images). - - This layer creates a convolution kernel that is convolved - with the layer input to produce a tensor of - outputs. If `use_bias` is True, - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - If `in_channels` is not specified, `Parameter` initialization will be - deferred to the first time `forward` is called and `in_channels` will be - inferred from the shape of input data. - - Parameters - ---------- - channels : int - The dimensionality of the output space, i.e. the number of output - channels (filters) in the convolution. - kernel_size :int or tuple/list of 2 int - Specifies the dimensions of the convolution window. - strides : int or tuple/list of 2 int, - Specify the strides of the convolution. - padding : int or a tuple/list of 2 int, - If padding is non-zero, then the input is implicitly zero-padded - on both sides for padding number of points - dilation : int or tuple/list of 2 int - Specifies the dilation rate to use for dilated convolution. - groups : int - Controls the connections between inputs and outputs. - At groups=1, all inputs are convolved to all outputs. - At groups=2, the operation becomes equivalent to having two conv - layers side by side, each seeing half the input channels, and producing - half the output channels, and both subsequently concatenated. - layout : str, default 'NCHW' - Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc. - 'N', 'C', 'H', 'W' stands for batch, channel, height, and width - dimensions respectively. Convolution is applied on the 'H' and - 'W' dimensions. - in_channels : int, default 0 - The number of input channels to this layer. If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - activation : str - Activation function to use. See :func:`~mxnet.ndarray.Activation`. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias : bool - Whether the layer uses a bias vector. - weight_initializer : str or `Initializer` - Initializer for the `weight` weights matrix. - bias_initializer : str or `Initializer` - Initializer for the bias vector. - - - Input shape: - This depends on the `layout` parameter. Input is 4D array of shape - (batch_size, in_channels, height, width) if `layout` is `NCHW`. - - Output shape: - This depends on the `layout` parameter. Output is 4D array of shape - (batch_size, channels, out_height, out_width) if `layout` is `NCHW`. 
- - out_height and out_width are calculated as:: - - out_height = floor((height+2*padding[0]-dilation[0]*(kernel_size[0]-1)-1)/stride[0])+1 - out_width = floor((width+2*padding[1]-dilation[1]*(kernel_size[1]-1)-1)/stride[1])+1 -=cut - -has '+strides' => (default => sub { [1, 1] }); -has '+padding' => (default => sub { [0, 0] }); -has '+dilation' => (default => sub { [1, 1] }); -has '+groups' => (default => 1); -has '+layout' => (default => 'NCHW'); - -method _update_kernel_size() -{ - if(not ref $self->kernel_size) - { - $self->kernel_size([($self->kernel_size)x2]); - } - confess("kernel_size must be a number or an array ref of 2 ints") - unless @{ $self->kernel_size } == 2; -} - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::Conv3D; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::Conv'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Conv3D -=cut - -=head1 DESCRIPTION - - 3D convolution layer (e.g. spatial convolution over volumes). - - This layer creates a convolution kernel that is convolved - with the layer input to produce a tensor of - outputs. If `use_bias` is `True`, - a bias vector is created and added to the outputs. Finally, if - `activation` is not `None`, it is applied to the outputs as well. - - If `in_channels` is not specified, `Parameter` initialization will be - deferred to the first time `forward` is called and `in_channels` will be - inferred from the shape of input data. - - Parameters - ---------- - channels : int - The dimensionality of the output space, i.e. the number of output - channels (filters) in the convolution. - kernel_size :int or tuple/list of 3 int - Specifies the dimensions of the convolution window. - strides : int or tuple/list of 3 int, - Specify the strides of the convolution. - padding : int or a tuple/list of 3 int, - If padding is non-zero, then the input is implicitly zero-padded - on both sides for padding number of points - dilation : int or tuple/list of 3 int - Specifies the dilation rate to use for dilated convolution. - groups : int - Controls the connections between inputs and outputs. - At groups=1, all inputs are convolved to all outputs. - At groups=2, the operation becomes equivalent to having two conv - layers side by side, each seeing half the input channels, and producing - half the output channels, and both subsequently concatenated. - layout : str, default 'NCDHW' - Dimension ordering of data and weight. Can be 'NCDHW', 'NDHWC', etc. - 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and - depth dimensions respectively. Convolution is applied on the 'D', - 'H' and 'W' dimensions. - in_channels : int, default 0 - The number of input channels to this layer. If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - activation : str - Activation function to use. See :func:`~mxnet.ndarray.Activation`. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias : bool - Whether the layer uses a bias vector. - weight_initializer : str or `Initializer` - Initializer for the `weight` weights matrix. - bias_initializer : str or `Initializer` - Initializer for the bias vector. - - - Input shape: - This depends on the `layout` parameter. Input is 5D array of shape - (batch_size, in_channels, depth, height, width) if `layout` is `NCDHW`. - - Output shape: - This depends on the `layout` parameter. 
Output is 5D array of shape - (batch_size, channels, out_depth, out_height, out_width) if `layout` is - `NCDHW`. - - out_depth, out_height and out_width are calculated as:: - - out_depth = floor((depth+2*padding[0]-dilation[0]*(kernel_size[0]-1)-1)/stride[0])+1 - out_height = floor((height+2*padding[1]-dilation[1]*(kernel_size[1]-1)-1)/stride[1])+1 - out_width = floor((width+2*padding[2]-dilation[2]*(kernel_size[2]-1)-1)/stride[2])+1 -=cut - -has '+strides' => (default => sub { [1, 1, 1] }); -has '+padding' => (default => sub { [0, 0, 0] }); -has '+dilation' => (default => sub { [1, 1, 1] }); -has '+groups' => (default => 1); -has '+layout' => (default => 'NCDHW'); - -method _update_kernel_size() -{ - if(not ref $self->kernel_size) - { - $self->kernel_size([($self->kernel_size)x3]); - } - confess("kernel_size must be a number or an array ref of 3 ints") - unless @{ $self->kernel_size } == 3; -} - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::Conv1DTranspose; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::Conv'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Conv1DTranspose -=cut - -=head1 DESCRIPTION - - Transposed 1D convolution layer (sometimes called Deconvolution). - - The need for transposed convolutions generally arises - from the desire to use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape of the - output of some convolution to something that has the shape of its input - while maintaining a connectivity pattern that is compatible with - said convolution. - - If `in_channels` is not specified, `Parameter` initialization will be - deferred to the first time `forward` is called and `in_channels` will be - inferred from the shape of input data. - - Parameters - ---------- - channels : int - The dimensionality of the output space, i.e. the number of output - channels (filters) in the convolution. - kernel_size :int or tuple/list of 3 int - Specifies the dimensions of the convolution window. - strides : int or tuple/list of 3 int, - Specify the strides of the convolution. - padding : int or a tuple/list of 3 int, - If padding is non-zero, then the input is implicitly zero-padded - on both sides for padding number of points - dilation : int or tuple/list of 3 int - Specifies the dilation rate to use for dilated convolution. - groups : int - Controls the connections between inputs and outputs. - At groups=1, all inputs are convolved to all outputs. - At groups=2, the operation becomes equivalent to having two conv - layers side by side, each seeing half the input channels, and producing - half the output channels, and both subsequently concatenated. - layout : str, default 'NCW' - Dimension ordering of data and weight. Can be 'NCW', 'NWC', etc. - 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions - respectively. Convolution is applied on the 'W' dimension. - in_channels : int, default 0 - The number of input channels to this layer. If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - activation : str - Activation function to use. See :func:`~mxnet.ndarray.Activation`. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias : bool - Whether the layer uses a bias vector. - weight_initializer : str or `Initializer` - Initializer for the `weight` weights matrix. 
- bias_initializer : str or `Initializer` - Initializer for the bias vector. - - - Input shape: - This depends on the `layout` parameter. Input is 3D array of shape - (batch_size, in_channels, width) if `layout` is `NCW`. - - Output shape: - This depends on the `layout` parameter. Output is 3D array of shape - (batch_size, channels, out_width) if `layout` is `NCW`. - - out_width is calculated as:: - - out_width = (width-1)*strides-2*padding+kernel_size+output_padding -=cut - -has 'output_padding' => (is => 'rw', isa => 'DimSize|Shape', default => 0); -has '+adj' => (default => sub { shift->output_padding }, lazy => 1); -has '+op_name' => (default => 'Deconvolution'); -has '+strides' => (default => 1); -has '+padding' => (default => 0 ); -has '+dilation' => (default => 1); -has '+groups' => (default => 1); -has '+layout' => (default => 'NCW'); - -method _update_kernel_size() -{ - if(not ref $self->kernel_size) - { - $self->kernel_size([$self->kernel_size]); - } - if(not ref $self->output_padding) - { - $self->output_padding([$self->output_padding]); - } - confess("kernel_size must be a number or an array ref of 1 ints") - unless @{ $self->kernel_size } == 1; - confess("output_padding must be a number or an array ref of 1 ints") - unless @{ $self->output_padding } == 1; -} - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::Conv2DTranspose; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::Conv'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Conv2DTranspose -=cut - -=head1 DESCRIPTION - - Transposed 2D convolution layer (sometimes called Deconvolution). - - The need for transposed convolutions generally arises - from the desire to use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape of the - output of some convolution to something that has the shape of its input - while maintaining a connectivity pattern that is compatible with - said convolution. - - If `in_channels` is not specified, `Parameter` initialization will be - deferred to the first time `forward` is called and `in_channels` will be - inferred from the shape of input data. - - - Parameters - ---------- - channels : int - The dimensionality of the output space, i.e. the number of output - channels (filters) in the convolution. - kernel_size :int or tuple/list of 3 int - Specifies the dimensions of the convolution window. - strides : int or tuple/list of 3 int, - Specify the strides of the convolution. - padding : int or a tuple/list of 3 int, - If padding is non-zero, then the input is implicitly zero-padded - on both sides for padding number of points - dilation : int or tuple/list of 3 int - Specifies the dilation rate to use for dilated convolution. - groups : int - Controls the connections between inputs and outputs. - At groups=1, all inputs are convolved to all outputs. - At groups=2, the operation becomes equivalent to having two conv - layers side by side, each seeing half the input channels, and producing - half the output channels, and both subsequently concatenated. - layout : str, default 'NCHW' - Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc. - 'N', 'C', 'H', 'W' stands for batch, channel, height, and width - dimensions respectively. Convolution is applied on the 'H' and - 'W' dimensions. - in_channels : int, default 0 - The number of input channels to this layer. 
If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - activation : str - Activation function to use. See :func:`~mxnet.ndarray.Activation`. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias : bool - Whether the layer uses a bias vector. - weight_initializer : str or `Initializer` - Initializer for the `weight` weights matrix. - bias_initializer : str or `Initializer` - Initializer for the bias vector. - - - Input shape: - This depends on the `layout` parameter. Input is 4D array of shape - (batch_size, in_channels, height, width) if `layout` is `NCHW`. - - Output shape: - This depends on the `layout` parameter. Output is 4D array of shape - (batch_size, channels, out_height, out_width) if `layout` is `NCHW`. - - out_height and out_width are calculated as:: - - out_height = (height-1)*strides[0]-2*padding[0]+kernel_size[0]+output_padding[0] - out_width = (width-1)*strides[1]-2*padding[1]+kernel_size[1]+output_padding[1] -=cut - -has 'output_padding' => (is => 'rw', isa => 'DimSize|Shape', default => 0); -has '+adj' => (default => sub { shift->output_padding }, lazy => 1); -has '+op_name' => (default => 'Deconvolution'); -has '+strides' => (default => sub { [1, 1] }); -has '+padding' => (default => sub { [0, 0] }); -has '+dilation' => (default => sub { [1, 1] }); -has '+groups' => (default => 1); -has '+layout' => (default => 'NCHW'); - -method _update_kernel_size() -{ - if(not ref $self->kernel_size) - { - $self->kernel_size([($self->kernel_size)x2]); - } - if(not ref $self->output_padding) - { - $self->output_padding([($self->output_padding)x2]); - } - confess("kernel_size must be a number or an array ref of 2 ints") - unless @{ $self->kernel_size } == 2; - confess("output_padding must be a number or an array ref of 2 ints") - unless @{ $self->output_padding } == 2; -} - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::Conv3DTranspose; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::Conv'; - -=head1 NAME - - AI::MXNet::Gluon::NN::Conv3DTranspose -=cut - -=head1 DESCRIPTION - - Transposed 3D convolution layer (sometimes called Deconvolution). - - The need for transposed convolutions generally arises - from the desire to use a transformation going in the opposite direction - of a normal convolution, i.e., from something that has the shape of the - output of some convolution to something that has the shape of its input - while maintaining a connectivity pattern that is compatible with - said convolution. - - If `in_channels` is not specified, `Parameter` initialization will be - deferred to the first time `forward` is called and `in_channels` will be - inferred from the shape of input data. - - - Parameters - ---------- - channels : int - The dimensionality of the output space, i.e. the number of output - channels (filters) in the convolution. - kernel_size :int or tuple/list of 3 int - Specifies the dimensions of the convolution window. - strides : int or tuple/list of 3 int, - Specify the strides of the convolution. - padding : int or a tuple/list of 3 int, - If padding is non-zero, then the input is implicitly zero-padded - on both sides for padding number of points - dilation : int or tuple/list of 3 int - Specifies the dilation rate to use for dilated convolution. - groups : int - Controls the connections between inputs and outputs. 
- At groups=1, all inputs are convolved to all outputs. - At groups=2, the operation becomes equivalent to having two conv - layers side by side, each seeing half the input channels, and producing - half the output channels, and both subsequently concatenated. - layout : str, default 'NCDHW' - Dimension ordering of data and weight. Can be 'NCDHW', 'NDHWC', etc. - 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and - depth dimensions respectively. Convolution is applied on the 'D', - 'H', and 'W' dimensions. - in_channels : int, default 0 - The number of input channels to this layer. If not specified, - initialization will be deferred to the first time `forward` is called - and `in_channels` will be inferred from the shape of input data. - activation : str - Activation function to use. See :func:`~mxnet.ndarray.Activation`. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias : bool - Whether the layer uses a bias vector. - weight_initializer : str or `Initializer` - Initializer for the `weight` weights matrix. - bias_initializer : str or `Initializer` - Initializer for the bias vector. - - - Input shape: - This depends on the `layout` parameter. Input is 5D array of shape - (batch_size, in_channels, depth, height, width) if `layout` is `NCDHW`. - - Output shape: - This depends on the `layout` parameter. Output is 5D array of shape - (batch_size, channels, out_depth, out_height, out_width) if `layout` is `NCDHW`. - out_depth, out_height and out_width are calculated as:: - - out_depth = (depth-1)*strides[0]-2*padding[0]+kernel_size[0]+output_padding[0] - out_height = (height-1)*strides[1]-2*padding[1]+kernel_size[1]+output_padding[1] - out_width = (width-1)*strides[2]-2*padding[2]+kernel_size[2]+output_padding[2] -=cut - -has 'output_padding' => (is => 'rw', isa => 'DimSize|Shape', default => 0); -has '+adj' => (default => sub { shift->output_padding }, lazy => 1); -has '+op_name' => (default => 'Deconvolution'); -has '+strides' => (default => sub { [1, 1, 1] }); -has '+padding' => (default => sub { [0, 0, 0] }); -has '+dilation' => (default => sub { [1, 1, 1] }); -has '+groups' => (default => 1); -has '+layout' => (default => 'NCDHW'); - -method _update_kernel_size() -{ - if(not ref $self->kernel_size) - { - $self->kernel_size([($self->kernel_size)x3]); - } - if(not ref $self->output_padding) - { - $self->output_padding([($self->output_padding)x3]); - } - confess("kernel_size must be a number or an array ref of 3 ints") - unless @{ $self->kernel_size } == 3; - confess("output_padding must be a number or an array ref of 3 ints") - unless @{ $self->output_padding } == 3; -} - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -# Abstract class for different pooling layers. 
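(Checking the transposed-convolution output formula from the docstrings above with concrete numbers; a hedged sketch, values illustrative only:)

    # out = (in - 1)*stride - 2*padding + kernel_size + output_padding
    my $up = nn->Conv2DTranspose(16, [4, 4], [2, 2], [1, 1]);  # kernel 4, stride 2, pad 1
    $up->initialize();
    my $y = $up->(mx->nd->zeros([1, 8, 16, 16]));
    # (16 - 1)*2 - 2*1 + 4 + 0 = 32, so $y has shape [1, 16, 32, 32]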
-package AI::MXNet::Gluon::NN::Pooling; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; - -has 'pool_size' => (is => 'rw', isa => 'DimSize|Shape', required => 1); -has 'strides' => (is => 'rw', isa => 'Maybe[DimSize|Shape]'); -has 'padding' => (is => 'rw', isa => 'DimSize|Shape'); -has 'ceil_mode' => (is => 'rw', isa => 'Bool', default => 0); -has 'global_pool' => (is => 'rw', isa => 'Bool', default => 0); -has 'kwargs' => (is => 'rw', init_arg => undef); -has 'pool_type' => (is => 'rw', isa => 'PoolType'); -has 'layout' => (is => 'rw'); -has 'count_include_pad' => (is => 'rw', isa => 'Bool'); -method python_constructor_arguments() { [qw/pool_size strides padding/] } - -sub BUILD -{ - my $self = shift; - $self->_update_pool_size; - if(not defined $self->strides) - { - $self->strides($self->pool_size); - } - if(not ref $self->strides) - { - $self->strides([($self->strides)x@{ $self->pool_size }]); - } - if(not ref $self->padding) - { - $self->padding([($self->padding)x@{ $self->pool_size }]); - } - $self->kwargs({ - kernel => $self->pool_size, stride => $self->strides, pad => $self->padding, - global_pool => $self->global_pool, pool_type => $self->pool_type, - pooling_convention => $self->ceil_mode ? 'full' : 'valid', - (defined $self->count_include_pad ? (count_include_pad => $self->count_include_pad) : ()) - }); -} - -method _alias() { 'pool' } - -method hybrid_forward(GluonClass $F, GluonInput $x) -{ - return $F->Pooling($x, name=>'fwd', %{ $self->kwargs }); -} - -use overload '""' => sub { - my $self = shift; - sprintf( - '%s(size=(%s), stride=(%s), padding=(%s), ceil_mode=%d)', - $self->_class_name, - join(',', @{ $self->kwargs->{kernel} }), - join(',', @{ $self->kwargs->{stride} }), - join(',', @{ $self->kwargs->{pad} }), - $self->kwargs->{pooling_convention} eq 'full' ? 1 : 0 - ) -}; - -package AI::MXNet::Gluon::NN::MaxPool1D; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::Pooling'; -method python_constructor_arguments() { [qw/pool_size strides padding layout ceil_mode/] } - -=head1 NAME - - AI::MXNet::Gluon::NN::MaxPool1D -=cut - -=head1 DESCRIPTION - - Max pooling operation for one dimensional data. - - - Parameters - ---------- - pool_size: int - Size of the max pooling windows. - strides: int, or None - Factor by which to downscale. E.g. 2 will halve the input size. - If `None`, it will default to `pool_size`. - padding: int - If padding is non-zero, then the input is implicitly - zero-padded on both sides for padding number of points. - layout : str, default 'NCW' - Dimension ordering of data and weight. Can be 'NCW', 'NWC', etc. - 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions - respectively. Pooling is applied on the W dimension. - ceil_mode : bool, default False - When `True`, will use ceil instead of floor to compute the output shape. - - - Input shape: - This depends on the `layout` parameter. Input is 3D array of shape - (batch_size, channels, width) if `layout` is `NCW`. - - Output shape: - This depends on the `layout` parameter. Output is 3D array of shape - (batch_size, channels, out_width) if `layout` is `NCW`. - - out_width is calculated as:: - - out_width = floor((width+2*padding-pool_size)/strides)+1 - - When `ceil_mode` is `True`, ceil will be used instead of floor in this - equation. 
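(The Pooling base class above maps ceil_mode to the backend pooling_convention 'full'/'valid'. Working the MaxPool1D output formula once, as a sketch:)

    my $pool = nn->MaxPool1D(2);                # pool_size 2; strides default to pool_size
    my $y = $pool->(mx->nd->zeros([1, 3, 9]));  # NCW input
    # floor((9 + 2*0 - 2)/2) + 1 = 4   =>  $y has shape [1, 3, 4]
    # with ceil_mode => 1: ceil((9 - 2)/2) + 1 = 5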
-=cut - - -has '+pool_size' => (default => 2); -has '+padding' => (default => 0); -has '+layout' => (default => 'NCW'); -has '+pool_type' => (default => 'max'); - -method _update_pool_size() -{ - confess("Only supports NCW layout for now") - unless $self->layout eq 'NCW'; - if(not ref $self->pool_size) - { - $self->pool_size([$self->pool_size]); - } - confess("pool_size must be a number or an array ref of 1 ints") - unless @{ $self->pool_size } == 1; -} - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::MaxPool2D; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::Pooling'; - -=head1 NAME - - AI::MXNet::Gluon::NN::MaxPool2D -=cut - -=head1 DESCRIPTION - - Max pooling operation for two dimensional (spatial) data. - - - Parameters - ---------- - pool_size: int or list/tuple of 2 ints, - Size of the max pooling windows. - strides: int, list/tuple of 2 ints, or None. - Factor by which to downscale. E.g. 2 will halve the input size. - If `None`, it will default to `pool_size`. - padding: int or list/tuple of 2 ints, - If padding is non-zero, then the input is implicitly - zero-padded on both sides for padding number of points. - layout : str, default 'NCHW' - Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc. - 'N', 'C', 'H', 'W' stands for batch, channel, height, and width - dimensions respectively. padding is applied on 'H' and 'W' dimension. - ceil_mode : bool, default False - When `True`, will use ceil instead of floor to compute the output shape. - - - Input shape: - This depends on the `layout` parameter. Input is 4D array of shape - (batch_size, channels, height, width) if `layout` is `NCHW`. - - Output shape: - This depends on the `layout` parameter. Output is 4D array of shape - (batch_size, channels, out_height, out_width) if `layout` is `NCHW`. - - out_height and out_width are calculated as:: - - out_height = floor((height+2*padding[0]-pool_size[0])/strides[0])+1 - out_width = floor((width+2*padding[1]-pool_size[1])/strides[1])+1 - - When `ceil_mode` is `True`, ceil will be used instead of floor in this - equation. -=cut - -has '+pool_size' => (default => sub { [2, 2] }); -has '+padding' => (default => 0); -has '+layout' => (default => 'NCHW'); -has '+pool_type' => (default => 'max'); - -method _update_pool_size() -{ - confess("Only supports NCHW layout for now") - unless $self->layout eq 'NCHW'; - if(not ref $self->pool_size) - { - $self->pool_size([($self->pool_size)x2]); - } - confess("pool_size must be a number or an array ref of 2 ints") - unless @{ $self->pool_size } == 2; -} - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::MaxPool3D; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::Pooling'; - -=head1 NAME - - AI::MXNet::Gluon::NN::MaxPool3D -=cut - -=head1 DESCRIPTION - - Max pooling operation for 3D data (spatial or spatio-temporal). - - - Parameters - ---------- - pool_size: int or list/tuple of 3 ints, - Size of the max pooling windows. - strides: int, list/tuple of 3 ints, or None. - Factor by which to downscale. E.g. 2 will halve the input size. - If `None`, it will default to `pool_size`. - padding: int or list/tuple of 3 ints, - If padding is non-zero, then the input is implicitly - zero-padded on both sides for padding number of points. - layout : str, default 'NCDHW' - Dimension ordering of data and weight. Can be 'NCDHW', 'NDHWC', etc. - 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and - depth dimensions respectively. 
padding is applied on 'D', 'H' and 'W' - dimension. - ceil_mode : bool, default False - When `True`, will use ceil instead of floor to compute the output shape. - - - Input shape: - This depends on the `layout` parameter. Input is 5D array of shape - (batch_size, channels, depth, height, width) if `layout` is `NCDHW`. - - Output shape: - This depends on the `layout` parameter. Output is 5D array of shape - (batch_size, channels, out_depth, out_height, out_width) if `layout` - is `NCDHW`. - - out_depth, out_height and out_width are calculated as :: - - out_depth = floor((depth+2*padding[0]-pool_size[0])/strides[0])+1 - out_height = floor((height+2*padding[1]-pool_size[1])/strides[1])+1 - out_width = floor((width+2*padding[2]-pool_size[2])/strides[2])+1 - - When `ceil_mode` is `True`, ceil will be used instead of floor in this - equation. -=cut - -has '+pool_size' => (default => sub { [2, 2, 2] }); -has '+padding' => (default => 0); -has '+layout' => (default => 'NCDHW'); -has '+pool_type' => (default => 'max'); - -method _update_pool_size() -{ - confess("Only supports NCDHW layout for now") - unless $self->layout eq 'NCDHW'; - if(not ref $self->pool_size) - { - $self->pool_size([($self->pool_size)x3]); - } - confess("pool_size must be a number or an array ref of 3 ints") - unless @{ $self->pool_size } == 3; -} - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::AvgPool1D; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::MaxPool1D'; - -=head1 NAME - - AI::MXNet::Gluon::NN::AvgPool1D -=cut - -=head1 DESCRIPTION - - Average pooling operation for temporal data. - - Parameters - ---------- - pool_size: int - Size of the max pooling windows. - strides: int, or None - Factor by which to downscale. E.g. 2 will halve the input size. - If `None`, it will default to `pool_size`. - padding: int - If padding is non-zero, then the input is implicitly - zero-padded on both sides for padding number of points. - layout : str, default 'NCW' - Dimension ordering of data and weight. Can be 'NCW', 'NWC', etc. - 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions - respectively. padding is applied on 'W' dimension. - ceil_mode : bool, default False - When `True`, will use ceil instead of floor to compute the output shape. - count_include_pad : bool, default True - When 'False', will exclude padding elements when computing the average value. - - - Input shape: - This depends on the `layout` parameter. Input is 3D array of shape - (batch_size, channels, width) if `layout` is `NCW`. - - Output shape: - This depends on the `layout` parameter. Output is 3D array of shape - (batch_size, channels, out_width) if `layout` is `NCW`. - - out_width is calculated as:: - - out_width = floor((width+2*padding-pool_size)/strides)+1 - - When `ceil_mode` is `True`, ceil will be used instead of floor in this - equation. -=cut - -has '+pool_type' => (default => 'avg'); -has '+count_include_pad' => (default => 1); - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::AvgPool2D; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::MaxPool2D'; - -=head1 NAME - - AI::MXNet::Gluon::NN::AvgPool2D -=cut - -=head1 DESCRIPTION - - Average pooling operation for spatial data. - - Parameters - ---------- - pool_size: int or list/tuple of 2 ints, - Size of the max pooling windows. - strides: int, list/tuple of 2 ints, or None. - Factor by which to downscale. E.g. 2 will halve the input size. - If `None`, it will default to `pool_size`. 
- padding: int or list/tuple of 2 ints, - If padding is non-zero, then the input is implicitly - zero-padded on both sides for padding number of points. - layout : str, default 'NCHW' - Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc. - 'N', 'C', 'H', 'W' stands for batch, channel, height, and width - dimensions respectively. padding is applied on 'H' and 'W' dimension. - ceil_mode : bool, default False - When True, will use ceil instead of floor to compute the output shape. - count_include_pad : bool, default True - When 'False', will exclude padding elements when computing the average value. - - - Input shape: - This depends on the `layout` parameter. Input is 4D array of shape - (batch_size, channels, height, width) if `layout` is `NCHW`. - - Output shape: - This depends on the `layout` parameter. Output is 4D array of shape - (batch_size, channels, out_height, out_width) if `layout` is `NCHW`. - - out_height and out_width are calculated as:: - - out_height = floor((height+2*padding[0]-pool_size[0])/strides[0])+1 - out_width = floor((width+2*padding[1]-pool_size[1])/strides[1])+1 - - When `ceil_mode` is `True`, ceil will be used instead of floor in this - equation. -=cut - -has '+pool_type' => (default => 'avg'); -has '+count_include_pad' => (default => 1); - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::AvgPool3D; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::NN::MaxPool3D'; - -=head1 NAME - - AI::MXNet::Gluon::NN::AvgPool3D -=cut - -=head1 DESCRIPTION - - Average pooling operation for 3D data (spatial or spatio-temporal). - - Parameters - ---------- - pool_size: int or list/tuple of 3 ints, - Size of the max pooling windows. - strides: int, list/tuple of 3 ints, or None. - Factor by which to downscale. E.g. 2 will halve the input size. - If `None`, it will default to `pool_size`. - padding: int or list/tuple of 3 ints, - If padding is non-zero, then the input is implicitly - zero-padded on both sides for padding number of points. - layout : str, default 'NCDHW' - Dimension ordering of data and weight. Can be 'NCDHW', 'NDHWC', etc. - 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and - depth dimensions respectively. padding is applied on 'D', 'H' and 'W' - dimension. - ceil_mode : bool, default False - When True, will use ceil instead of floor to compute the output shape. - count_include_pad : bool, default True - When 'False', will exclude padding elements when computing the average value. - - - Input shape: - This depends on the `layout` parameter. Input is 5D array of shape - (batch_size, channels, depth, height, width) if `layout` is `NCDHW`. - - Output shape: - This depends on the `layout` parameter. Output is 5D array of shape - (batch_size, channels, out_depth, out_height, out_width) if `layout` - is `NCDHW`. - - out_depth, out_height and out_width are calculated as :: - - out_depth = floor((depth+2*padding[0]-pool_size[0])/strides[0])+1 - out_height = floor((height+2*padding[1]-pool_size[1])/strides[1])+1 - out_width = floor((width+2*padding[2]-pool_size[2])/strides[2])+1 - - When `ceil_mode` is `True,` ceil will be used instead of floor in this - equation. -=cut - -has '+pool_type' => (default => 'avg'); -has '+count_include_pad' => (default => 1); - -__PACKAGE__->register('AI::MXNet::Gluon::NN'); - -package AI::MXNet::Gluon::NN::GlobalMaxPool1D; - -=head1 NAME - - AI::MXNet::Gluon::NN::GlobalMaxPool1D -=cut - -=head1 DESCRIPTION - - Global max pooling operation for temporal data. 
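(The Global*Pool variants defined below are the corresponding pooling layers with global_pool fixed to 1, so every spatial dimension collapses to size 1 regardless of input size. For instance:)

    my $gap = nn->GlobalAvgPool2D();
    my $y = $gap->(mx->nd->ones([2, 32, 7, 7]));   # => shape [2, 32, 1, 1]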
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::NN::MaxPool1D';
-has '+pool_size' => (default => sub { [1] });
-has '+global_pool' => (default => 1);
-has '+ceil_mode' => (default => 1);
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::GlobalMaxPool2D;
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::GlobalMaxPool2D
-=cut
-
-=head1 DESCRIPTION
-
-    Global max pooling operation for spatial data.
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::NN::MaxPool2D';
-
-has '+pool_size' => (default => sub { [1, 1] });
-has '+global_pool' => (default => 1);
-has '+ceil_mode' => (default => 1);
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::GlobalMaxPool3D;
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::GlobalMaxPool3D
-=cut
-
-=head1 DESCRIPTION
-
-    Global max pooling operation for 3D data.
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::NN::MaxPool3D';
-has '+pool_size' => (default => sub { [1, 1, 1] });
-has '+global_pool' => (default => 1);
-has '+ceil_mode' => (default => 1);
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::GlobalAvgPool1D;
-
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::GlobalAvgPool1D
-=cut
-
-=head1 DESCRIPTION
-
-    Global average pooling operation for temporal data.
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::NN::AvgPool1D';
-has '+pool_size' => (default => sub { [1] });
-has '+global_pool' => (default => 1);
-has '+ceil_mode' => (default => 1);
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::GlobalAvgPool2D;
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::GlobalAvgPool2D
-=cut
-
-=head1 DESCRIPTION
-
-    Global average pooling operation for spatial data.
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::NN::AvgPool2D';
-
-has '+pool_size' => (default => sub { [1, 1] });
-has '+global_pool' => (default => 1);
-has '+ceil_mode' => (default => 1);
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::GlobalAvgPool3D;
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::GlobalAvgPool3D
-=cut
-
-=head1 DESCRIPTION
-
-    Global average pooling operation for 3D data.
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::NN::AvgPool3D';
-has '+pool_size' => (default => sub { [1, 1, 1] });
-has '+global_pool' => (default => 1);
-has '+ceil_mode' => (default => 1);
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-package AI::MXNet::Gluon::NN::ReflectionPad2D;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::HybridBlock';
-
-=head1 NAME
-
-    AI::MXNet::Gluon::NN::ReflectionPad2D
-=cut
-
-=head1 DESCRIPTION
-
-    Pads the input tensor using the reflection of the input boundary.
-
-    Parameters
-    ----------
-    padding: int
-        An integer padding size
-
-    Examples
-    --------
-    >>> $m = nn->ReflectionPad2D(3);
-    >>> $input = mx->nd->random->normal(shape=>[16, 3, 224, 224]);
-    >>> $output = $m->($input);
-=cut
-
-has 'padding' => (is => 'rw', isa => 'Int|ArrayRef[Int]', default => 0);
-method python_constructor_arguments() { ['padding'] }
-sub BUILD
-{
-    my $self = shift;
-    $self->padding([(0)x4, ($self->padding)x4]) if not ref $self->padding;
-    confess("padding must be an array ref of 8 integers") unless @{ $self->padding } == 8;
-}
-
-method hybrid_forward(GluonClass $F, GluonInput $x)
-{
-    return $F->pad($x, mode=>'reflect', pad_width=>$self->padding);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::NN');
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Parameter.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Parameter.pm
deleted file mode 100644
index a914c1ff68ca..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Parameter.pm
+++ /dev/null
@@ -1,1331 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use Hash::Ordered;
-package AI::MXNet::Gluon::Parameter;
-use AI::MXNet::NS;
-use AI::MXNet::Function::Parameters;
-
-=head1 NAME
-
-    AI::MXNet::Gluon::Parameter - A container holding parameters (weights) of AI::MXNet::Gluon::Block(s).
-=cut
-
-=head1 DESCRIPTION
-
-    AI::MXNet::Gluon::Parameter holds a copy of the parameter on each AI::MXNet::Context after
-    it is initialized with AI::MXNet::Gluon::Parameter->initialize(...). If grad_req is
-    not 'null', it will also hold a gradient array on each AI::MXNet::Context
-
-    $ctx = mx->gpu(0);
-    $x = mx->nd->zeros([16, 100], ctx=>$ctx);
-    $w = mx->gluon->Parameter('fc_weight', shape=>[64, 100], init=>mx->init->Xavier());
-    $b = mx->gluon->Parameter('fc_bias', shape=>[64], init=>mx->init->Zero());
-    $w->initialize(ctx=>$ctx);
-    $b->initialize(ctx=>$ctx);
-    $out = mx->nd->FullyConnected($x, $w->data($ctx), $b->data($ctx), num_hidden=>64);
-
-    Parameters
-    ----------
-    name : str
-        Name of this parameter.
-    grad_req : {'write', 'add', 'null'}, default 'write'
-        Specifies how to update gradient to grad arrays.
-
-        - 'write' means every time, gradient is written to the grad NDArray.
-        - 'add' means every time, gradient is added to the grad NDArray. You need
-          to manually call zero_grad() to clear the gradient buffer before each
-          iteration when using this option.
-        - 'null' means gradient is not requested for this parameter. Gradient arrays
-          will not be allocated.
-    shape : array ref of int or int, default undef
-        Shape of this parameter. By default shape is not specified. Parameter with
-        unknown shape can be used for `Symbol` API, but `init` will throw an error
-        when using `NDArray` API.
-    dtype : Dtype, default 'float32'
-        For example, 'float64'.
-    lr_mult : float, default 1.0
-        Learning rate multiplier. Learning rate will be multiplied by lr_mult
-        when updating this parameter with an optimizer.
-    wd_mult : float, default 1.0
-        Weight decay multiplier (L2 regularizer coefficient). Works similar to lr_mult.
-    init : Initializer, default undef
-        Initializer of this parameter. Will use the global initializer by default.
-    stype: {'default', 'row_sparse', 'csr'}, defaults to 'default'.
-        The storage type of the parameter.
-    grad_stype: {'default', 'row_sparse', 'csr'}, defaults to 'default'.
-        The storage type of the parameter's gradient.
-
-
-    Attributes
-    ----------
-    grad_req : {'write', 'add', 'null'}
-        This can be set before or after initialization. Setting grad_req to 'null'
-        with $x->grad_req('null') saves memory and computation when you don't
-        need gradient w.r.t x.
-=cut
-
-use Mouse;
-use AI::MXNet::Base;
-use overload '""' => sub {
-        my $self = shift;
-        "Parameter " . $self->name.
-        " (shape=(" . join(', ', @{ $self->shape//[] }) .")".
-        ", dtype=" . $self->dtype.
-        ", stype=" . $self->stype.")"
-    },
-    fallback => 1;
-
-around BUILDARGS => sub {
-    my $orig = shift;
-    my $class = shift;
-    if(@_ % 2)
-    {
-        my $name = shift;
-        return $class->$orig(name => $name, @_);
-    }
-    else
-    {
-        return $class->$orig(@_);
-    }
-};
-
-sub BUILD
-{
-    my $self = shift;
-    $self->grad_req($self->_grad_req);
-    $self->_shape([$self->_shape]) if defined $self->_shape and not ref $self->_shape;
-    $self->_deferred_init([]);
-}
-
-has 'name' => (is => 'ro', isa => 'Str', required => 1);
-has '_grad_req' => (is => 'rw', isa => 'GradReq', init_arg => 'grad_req', default => 'write');
-has '_shape' => (is => 'rw', isa => 'Maybe[Shape|Int]', init_arg => 'shape');
-has 'dtype' => (is => 'rw', isa => 'Dtype', default => 'float32');
-has ['stype',
-     'grad_stype'] => (is => 'rw', isa => 'Stype', default => 'default');
-has [qw/lr_mult wd_mult/] => (is => 'rw', isa => 'Num', default => 1);
-has 'init' => (is => 'rw', isa => 'Maybe[Initializer]');
-has 'allow_deferred_init' => (is => 'rw', isa => 'Bool', default => 0);
-has 'differentiable' => (is => 'rw', isa => 'Bool', default => 1);
-has [qw/_var _data _grad
-        _deferred_init _trainer
-        _ctx_list _ctx_map/] => (is => 'rw', init_arg => undef);
-
-method grad_req(Maybe[GradReq] $req=)
-{
-    return $self->_grad_req unless defined $req;
-    if(not $self->differentiable)
-    {
-        $req = 'null';
-    }
-    return if $self->_grad_req eq $req;
-    $self->_grad_req($req);
-    if($req eq 'null' and defined $self->_grad)
-    {
-        $self->_grad(undef);
-        $self->_data([map { $_->detach } @{ $self->_data }]);
-    }
-    elsif(defined $self->_data)
-    {
-        $self->_init_grad();
-    }
-}
-
-method shape(@args)
-{
-    return $self->_shape unless @args;
-    if(not defined $args[0])
-    {
-        $self->_shape(undef);
-        return undef;
-    }
-    if(not defined $self->_shape and defined $args[0])
-    {
-        $self->_shape(ref $args[0] ? $args[0] : [$args[0]]);
-        return $self->_shape;
-    }
-    my $new_shape = ref $args[0] ?
-        $args[0] : [$args[0]];
-    my $shape_validated = 0;
-    if(@{ $self->_shape } == @{ $new_shape })
-    {
-        $shape_validated = 1;
-        zip(sub {
-            my ($i, $j) = @_;
-            return unless $i;
-            return if $i == $j;
-            $shape_validated = 0;
-        }, $self->_shape, $new_shape);
-    }
-    assert($shape_validated, 'Expected shape is incompatible with given shape');
-    $self->_shape($new_shape);
-    return $self->_shape;
-}
-
-method _set_trainer($trainer)
-{
-    if($self->stype ne 'default' and $self->_trainer and $trainer and Scalar::Util::refaddr($self->_trainer) ne Scalar::Util::refaddr($trainer))
-    {
-        confess(
-            "Failed to set the trainer for Parameter '${\ $self->name }' because it was already set. ".
-            "More than one trainer for a ${\ $self->stype } Parameter is not supported."
-        );
-    }
-    $self->_trainer($trainer);
-}
-
-method _get_row_sparse($arr_list, $ctx, AI::MXNet::NDArray $row_id)
-{
-    if(not $self->_trainer)
-    {
-        confess(
-            "Cannot get row_sparse data for Parameter '${\ $self->name }' when no ".
-            "Trainer is created with it."
-        );
-    }
-    my $results = $self->_check_and_get($arr_list, $ctx);
-
-    # fetch row sparse params from the trainer
-    $self->_trainer->_row_sparse_pull($self, $results, $row_id);
-    return $results;
-}
-
-method _check_and_get($arr_list, $ctx)
-{
-    if(defined $arr_list)
-    {
-        if(ref $ctx eq 'ARRAY')
-        {
-            return $arr_list;
-        }
-        if(not defined $ctx)
-        {
-            if(@{ $arr_list } == 1)
-            {
-                return $arr_list->[0];
-            }
-            else
-            {
-                $ctx = AI::MXNet::Context->current_ctx;
-            }
-        }
-        my $ctx_list = $self->_ctx_map->[$ctx->device_type_id&1];
-        if($ctx->device_id < @{ $ctx_list })
-        {
-            my $idx = $ctx_list->[$ctx->device_id];
-            if(defined $idx)
-            {
-                return $arr_list->[$idx];
-            }
-        }
-        confess(
-            "Parameter '${\ $self->name }' was not initialized on context $ctx. ".
-            "It was only initialized on @{ $self->_ctx_list }."
-        );
-    }
-    if(@{ $self->_deferred_init })
-    {
-        confess("DeferredInitializationError: ".
-            "Parameter '${\ $self->name }' has not been initialized yet because initialization was ".
-            "deferred. Actual initialization happens during the first forward pass. ".
-            "Please pass one batch of data through the network before accessing Parameters. ".
-            "You can also avoid deferred initialization by specifying in_units, ".
-            "num_features, etc., for network layers.");
-    }
-    confess(
-        "Parameter '${\ $self->name }' has not been initialized. Note that ".
-        "you should initialize parameters and create a Trainer ".
-        "with Block->collect_params() instead of Block->params, ".
-        "because the latter does not include Parameters of ".
-        "nested child Blocks."
-    );
-}
-
-
-# (Re)initializes by loading from data.
-method _load_init($data, $ctx)
-{
-    if($self->shape)
-    {
-        for(zip($self->shape, $data->shape)) {
-            my ($self_dim, $data_dim) = @$_;
-            assert(
-                ($self_dim == 0 or $self_dim == $data_dim),
-                sprintf(
-                    "Failed loading Parameter '%s' from saved params: ".
-                    "shape incompatible expected (%s) vs saved (%s)",
-                    $self->name, "@{$self->shape}", "@{$data->shape}"
-                )
-            );
-        }
-        $self->shape([map { $_->[0] ? $_->[0] : $_->[1] } zip($self->shape, $data->shape)]);
-    }
-    if($self->dtype)
-    {
-        assert(
-            ($self->dtype eq $data->dtype),
-            sprintf(
-                "Failed loading Parameter '%s' from saved params: ".
-                "dtype incompatible expected %s vs saved %s",
-                $self->name, $self->dtype, $data->dtype
-            )
-        );
-    }
-    if($self->stype ne $data->stype)
-    {
-        $data = $data->tostype($self->stype);
-    }
-
-    if(blessed ($ctx) and $ctx->isa('AI::MXNet::Context'))
-    {
-        $ctx = [$ctx];
-    }
-    if(not defined $self->_data)
-    {
-        if(@{ $self->_deferred_init })
-        {
-            assert(
-                (not defined $ctx or join('', @{ $ctx }) eq join('', @{ $self->_deferred_init->[1] })),
-                sprintf(
-                    "Failed to load Parameter '%s' on %s because it was ".
-                    "previously initialized on %s.",
-                    $self->name, "@{$ctx//[]}", "@{$self->list_ctx}"
-                )
-            );
-            $ctx = $self->_deferred_init->[1];
-        }
-        elsif(not defined $ctx)
-        {
-            $ctx = [AI::MXNet::Context->cpu];
-        }
-        $self->_init_impl($data, $ctx);
-    }
-    else
-    {
-        assert(
-            (not defined $ctx or join('', @{ $ctx }) eq join('', @{ $self->list_ctx })),
-            sprintf(
-                "Failed to load Parameter '%s' on %s because it was ".
-                "previously initialized on %s.",
-                $self->name, "@{$ctx//[]}", "@{$self->list_ctx}"
-            )
-        );
-        $self->set_data($data);
-    }
-    $self->_deferred_init([]);
-}
-
-# Finishes deferred initialization.
-method _finish_deferred_init()
-{
-    return unless @{ $self->_deferred_init };
-    my ($init, $ctx, $default_init, $data) = @{ $self->_deferred_init };
-    $self->_deferred_init([]);
-    assert(
-        (defined($self->shape) and product(@{ $self->shape }) > 0),
-        sprintf(
-            "Cannot initialize Parameter '%s' because it has ".
-            "invalid shape: (%s). Please specify in_units, ".
-            "in_channels, etc for `Block`s.",
-            $self->name, "@{$self->shape//[]}"
-        )
-    );
-    AI::MXNet::AutoGrad->pause(sub {
-        if(not defined $data)
-        {
-            $data = AI::MXNet::NDArray->zeros(
-                $self->shape,
-                dtype => $self->dtype,
-                ctx => AI::MXNet::Context->cpu,
-                stype => $self->stype
-            );
-            AI::MXNet::Initializer->new->(
-                AI::MXNet::InitDesc->new(
-                    name => $self->name,
-                    attrs => { __init__ => defined $init ? "$init" : "$default_init" }
-                ),
-                $data
-            );
-        }
-        $self->_init_impl($data, $ctx);
-    });
-}
-
-# Sets data and grad.
-method _init_impl($data, $ctx_list)
-{
-    $self->_ctx_list([@{ $ctx_list }]);
-    $self->_ctx_map([[], []]);
-    enumerate(sub {
-        my ($i, $ctx) = @_;
-        my $dev_list = $self->_ctx_map->[$ctx->device_type_id&1];
-        while(@{ $dev_list } <= $ctx->device_id)
-        {
-            push @{ $dev_list }, undef;
-        }
-        $dev_list->[$ctx->device_id] = $i;
-    }, $self->_ctx_list);
-    $self->_data([map { $data->copyto($_) } @{ $self->_ctx_list }]);
-    $self->_init_grad;
-}
-
-# Initialize grad buffers.
-method _init_grad()
-{
-    if($self->grad_req eq 'null')
-    {
-        $self->_grad(undef);
-        return;
-    }
-    $self->_grad([
-        map {
-            AI::MXNet::NDArray->zeros(
-                $_->shape, dtype => $_->dtype,
-                ctx => $_->context, stype => $self->grad_stype
-            )
-        } @{ $self->_data }
-    ]);
-    AI::MXNet::AutoGrad->mark_variables(
-        $self->_check_and_get($self->_data, []),
-        $self->_grad,
-        grad_reqs => $self->grad_req
-    );
-}
-
-# Reduce data from multiple contexts to cpu.
-method _reduce()
-{
-    my $data;
-    my $ctx = AI::MXNet::Context->cpu;
-    if($self->stype eq 'default')
-    {
-        my $block = $self->list_data;
-        $data = AI::MXNet::NDArray->add_n(map { $_->copyto($ctx) } @{ $block }) / @{ $block };
-    }
-    else
-    {
-        my $all_row_ids = AI::MXNet::NDArray->arange(stop => $self->shape->[0], dtype=>'int64', ctx=>$ctx);
-        $data = AI::MXNet::NDArray->zeros($self->shape, stype=>'row_sparse', ctx=>$ctx);
-        $self->_trainer->_row_sparse_pull($self, $data, $all_row_ids, 1);
-    }
-    return $data;
-}
-
-=head2 initialize
-
-    Initializes parameter and gradient arrays. Only used for `NDArray` API.
- - Parameters - ---------- - :$init : Initializer - The initializer to use. Overrides AI::MXNet::Gluon::Parameter->init and default_init. - :$ctx : AI::MXNet::Context or array ref of AI::MXNet::Context, defaults to AI::MXNet::Context->current_ctx(). - Initialize Parameter on given context. If ctx is a list of Context, a - copy will be made for each context. - Copies are independent arrays. User is responsible for keeping - their values consistent when updating. Normally gluon->Trainer does this for you. - :$default_init : Initializer - Default initializer is used when both 'init' and AI::MXNet::Gluon::Parameter->init are undefined. - :$force_reinit : bool, default False - Whether to force re-initialization if parameter is already initialized. - - Examples - -------- - >>> $weight = mx->gluon->Parameter('weight', shape=>[2, 2]); - >>> $weight->initialize(ctx=>mx->cpu(0)); - >>> print $weight->data - [[-0.01068833 0.01729892] - [ 0.02042518 -0.01618656]] - - >>> print $weight->grad() - [[ 0. 0.] - [ 0. 0.]] - - >>> $weight->initialize(ctx=>[mx->gpu(0), mx->gpu(1)]); - >>> print $weight->data(mx->gpu(0)); - [[-0.00873779 -0.02834515] - [ 0.05484822 -0.06206018]] - - >>> print $weight->data(mx->gpu(1)) - [[-0.00873779 -0.02834515] - [ 0.05484822 -0.06206018]] - -=cut - -method initialize( - Maybe[Initializer] :$init=, - Maybe[AI::MXNet::Context|ArrayRef[AI::MXNet::Context]] :$ctx=AI::MXNet::Context->current_ctx, - Initializer :$default_init=AI::MXNet::Initializer->Uniform, - Bool :$force_reinit=0 -) -{ - $ctx //=AI::MXNet::Context->current_ctx; - if(defined $self->_data and not $force_reinit) - { - AI::MXNet::Logging->warning( - "Parameter '%s' is already initialized, ignoring. ". - "Set force_reinit=True to re-initialize.", $self->name - ); - return; - } - $self->_data(undef); - $self->_grad(undef); - if(blessed($ctx) and $ctx->isa('AI::MXNet::Context')) - { - $ctx = [$ctx]; - } - if(not defined $init) - { - if(defined $self->init) - { - $init = $self->init; - } - else - { - $init = $default_init; - } - } - if(not defined $self->shape or not @{ $self->shape } or product(@{ $self->shape }) <= 0) - { - if($self->allow_deferred_init) - { - $self->_deferred_init([$init, $ctx, $default_init, undef]); - return; - } - confess("Cannot initialize Parameter '${\ $self->name }' because it has ". - "invalid shape: @{$self->shape//[]}."); - } - $self->_deferred_init([$init, $ctx, $default_init, undef]); - $self->_finish_deferred_init; -} - -=head2 reset_ctx - - Re-assign Parameter to other contexts. - - :$ctx : AI::MXNet::Context or array ref of AI::MXNet::Context, default AI::MXNet::Context->current_ctx. - Assign Parameter to given context. If ctx is a list of Context, a - copy will be made for each context. -=cut - -method reset_ctx(Maybe[AI::MXNet::Context|ArrayRef[AI::MXNet::Context]] :$ctx=AI::MXNet::Context->current_ctx) -{ - if(blessed($ctx) and $ctx->isa('AI::MXNet::Context')) - { - $ctx = [$ctx]; - } - if(defined $self->_data) - { - my $data = $self->_reduce; - AI::MXNet::AutoGrad->pause(sub { - $self->_init_impl($data, $ctx); - }); - } - elsif(@{ $self->_deferred_init }) - { - my ($init, undef, $default_init, $data) = @{ $self->_deferred_init }; - $self->_deferred_init([$init, $ctx, $default_init, $data]); - } - else - { - confess("Cannot reset context for Parameter '${ \ $self->name }' because it ". - "has not been initialized."); - } -} - -=head2 set_data - - Sets this parameter's value on all contexts to data. 
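-
-    Example (a minimal sketch, reusing the 'weight' Parameter from the
-    initialize() example above):
-
-    >>> $weight->set_data(mx->nd->ones([2, 2]));
-    >>> print $weight->data
-    [[ 1.  1.]
-     [ 1.  1.]]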
-=cut
-
-method set_data($data)
-{
-    $self->shape($data->shape);
-    if(not defined $self->_data)
-    {
-        assert(
-            (@{ $self->_deferred_init }),
-            "Parameter '${\ $self->name }' has not been initialized"
-        );
-        $self->_deferred_init->[3] = $data;
-        return;
-    }
-
-    # if update_on_kvstore, we need to make sure the copy stored in kvstore is in sync
-    if($self->_trainer and $self->_trainer->_kv_initialized and $self->_trainer->update_on_kvstore)
-    {
-        if(!grep { Scalar::Util::refaddr($self) == Scalar::Util::refaddr($_) } @{ $self->_trainer->_params_to_init })
-        {
-            $self->_trainer->_reset_kvstore();
-        }
-    }
-    for my $arr (@{ $self->_check_and_get($self->_data, []) })
-    {
-        $arr .= $data;
-    }
-}
-
-=head2 row_sparse_data
-
-    Returns a copy of the 'row_sparse' parameter on the same context as row_id's.
-    The copy only retains rows whose ids occur in provided row ids.
-    The parameter must have been initialized on this context before.
-
-    Parameters
-    ----------
-    $row_id: AI::MXNet::NDArray
-        Row ids to retain for the 'row_sparse' parameter.
-
-    Returns
-    -------
-    AI::MXNet::NDArray on row_id's context
-=cut
-
-method row_sparse_data(AI::MXNet::NDArray $row_id)
-{
-    if($self->stype ne 'row_sparse')
-    {
-        confess(
-            "Cannot return a copy of Parameter '${\ $self->name }' via row_sparse_data() ".
-            "because its storage type is ${\ $self->stype }. Please use data() instead."
-        );
-    }
-    return $self->_get_row_sparse($self->_data, $row_id->context, $row_id);
-}
-
-=head2 list_row_sparse_data
-
-    Returns copies of the 'row_sparse' parameter on all contexts, in the same order
-    as creation. The copy only retains rows whose ids occur in provided row ids.
-    The parameter must have been initialized before.
-
-    Parameters
-    ----------
-    $row_id: AI::MXNet::NDArray
-        Row ids to retain for the 'row_sparse' parameter.
-
-    Returns
-    -------
-    array ref of AI::MXNet::NDArrays
-=cut
-
-method list_row_sparse_data(AI::MXNet::NDArray $row_id)
-{
-    if($self->stype ne 'row_sparse')
-    {
-        confess(
-            "Cannot return copies of Parameter '${\ $self->name }' on all contexts via ".
-            "list_row_sparse_data() because its storage type is ${\ $self->stype }. Please ".
-            "use data() instead."
-        );
-    }
-    return $self->_get_row_sparse($self->_data, [], $row_id);
-}
-
-=head2 data
-
-    Returns a copy of this parameter on one context. Must have been
-    initialized on this context before. For sparse parameters, use
-    row_sparse_data instead.
-
-    Parameters
-    ----------
-    ctx : Context
-        Desired context.
-
-    Returns
-    -------
-    NDArray on ctx
-=cut
-
-method data(Maybe[AI::MXNet::Context] $ctx=)
-{
-    if($self->stype ne 'default')
-    {
-        $ctx //= AI::MXNet::Context->current_ctx;
-        confess(
-            "Cannot return a copy of Parameter '${\ $self->name }' on ctx $ctx via data() ".
-            "because its storage type is ${\ $self->stype }. Please use row_sparse_data() ".
-            "instead."
-        );
-    }
-    return $self->_check_and_get($self->_data, $ctx);
-}
-
-=head2 list_data
-
-    Returns copies of this parameter on all contexts, in the same order
-    as creation. For sparse parameters, use list_row_sparse_data
-    instead.
-=cut
-
-method list_data()
-{
-    if($self->stype ne 'default')
-    {
-        confess(
-            "Cannot return copies of Parameter '${\ $self->name }' on all contexts via list_data() ".
-            "because its storage type is ${\ $self->stype }. Please use list_row_sparse_data() ".
-            "instead."
-        );
-    }
-    return $self->_check_and_get($self->_data, []);
-}
-
-=head2 grad
-
-    Returns a gradient buffer for this parameter on one context.
-
-    Parameters
-    ----------
-    ctx : Context
-        Desired context.
-=cut
-
-method grad(Maybe[AI::MXNet::Context] $ctx=)
-{
-    if(defined $self->_data and not defined $self->_grad)
-    {
-        confess(
-            "Cannot get gradient array for Parameter '${\ $self->name }' ".
-            "because grad_req='null'"
-        );
-    }
-    return $self->_check_and_get($self->_grad, $ctx);
-}
-
-=head2 list_grad
-
-    Returns gradient buffers on all contexts, in the same order
-    as 'values'.
-=cut
-
-method list_grad()
-{
-    if(defined $self->_data and not defined $self->_grad)
-    {
-        confess(
-            "Cannot get gradient array for Parameter '${\ $self->name }' ".
-            "because grad_req='null'"
-        );
-    }
-    return $self->_check_and_get($self->_grad, []);
-}
-
-=head2 list_ctx
-
-    Returns a list of contexts this parameter is initialized on.
-=cut
-
-method list_ctx()
-{
-    if(not defined $self->_data)
-    {
-        if(@{ $self->_deferred_init })
-        {
-            return $self->_deferred_init->[1];
-        }
-        confess("Parameter '${\ $self->name }' has not been initialized");
-    }
-    return $self->_ctx_list;
-}
-
-=head2 zero_grad
-
-    Sets gradient buffer on all contexts to 0. No action is taken if
-    parameter is uninitialized or doesn't require gradient.
-=cut
-
-method zero_grad()
-{
-    return unless defined $self->_grad;
-    AI::MXNet::NDArray->zeros_like($_, { out => $_ }) for @{ $self->_grad };
-}
-
-=head2 var
-
-    Returns a symbol representing this parameter.
-=cut
-
-method var()
-{
-    if(not defined $self->_var)
-    {
-        $self->_var(
-            AI::MXNet::Symbol->var(
-                $self->name, shape => $self->shape, dtype => $self->dtype,
-                lr_mult => $self->lr_mult, wd_mult => $self->wd_mult,
-                init => $self->init, stype => $self->stype
-            )
-        );
-    }
-    return $self->_var;
-}
-
-=head2 cast
-
-    Cast data and gradient of this Parameter to a new data type.
-
-    Parameters
-    ----------
-    $dtype : Dtype
-        The new data type.
-=cut
-
-method cast(Dtype $dtype)
-{
-    $self->dtype($dtype);
-    return unless defined $self->_data;
-    AI::MXNet::AutoGrad->pause(sub {
-        $self->_data([map { $_->astype($dtype) } @{ $self->_data }]);
-        return unless defined $self->_grad;
-        $self->_grad([map { $_->astype($dtype) } @{ $self->_grad }]);
-        AI::MXNet::AutoGrad->mark_variables($self->_data, $self->_grad, grad_reqs => $self->grad_req);
-    });
-}
-
-__PACKAGE__->AI::MXNet::NS::register('AI::MXNet::Gluon');
-
-package AI::MXNet::Gluon::Constant;
-use strict;
-use warnings;
-use Mouse;
-extends 'AI::MXNet::Gluon::Parameter';
-
-=head1 NAME
-
-    AI::MXNet::Gluon::Constant - A constant parameter for holding immutable tensors.
-=cut
-
-=head1 DESCRIPTION
-
-    A constant parameter for holding immutable tensors.
-    Constants are ignored by autograd and Trainer, thus their values
-    will not change during training. But you can still update their values
-    manually with the set_data method.
-
-    Constants can be created with either
-
-    $const = mx->gluon->Constant('const', [[1,2],[3,4]]);
-
-    or
-
-    package Block;
-    use AI::MXNet::Gluon::Mouse;
-    extends 'AI::MXNet::Gluon::Block';
-    sub BUILD
-    {
-        my $self = shift;
-        $self->const($self->params->get_constant('const', [[1,2],[3,4]]));
-    }
-
-    Constructor Attributes
-    ----------------------
-    name : str
-        Name of the parameter.
-    value : AcceptableInput (perl array, pdl, ndarray, etc)
-        Initial value for the constant.
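-
-    Example (a minimal sketch; values may still be updated manually, while
-    gradients are never computed for a Constant):
-
-    >>> $const = mx->gluon->Constant('const', [[1, 2], [3, 4]]);
-    >>> $const->initialize(ctx=>mx->cpu(0));
-    >>> $const->set_data(mx->nd->ones([2, 2]));  # manual update is allowed
-    >>> print $const->grad_req;                  # always 'null'
-    null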
-=cut - -use Mouse; -use AI::MXNet::Base; -use Scalar::Util qw(refaddr); -around BUILDARGS => \&AI::MXNet::Base::process_arguments; -method python_constructor_arguments() { ['name', 'value'] } -has 'value' => (is => 'rw', isa => 'AcceptableInput'); -has '+_grad_req' => (is => 'rw', default => 'null'); -use overload '""' => sub { - my $self = shift; - "Constant " . $self->name. - " (shape=(" . join(', ', @{ $self->shape//[] }) .")". - ", dtype=" . $self->dtype. - ", stype=" . $self->stype.")" - }, - fallback => 1; - - -sub BUILD -{ - my $self = shift; - if(not (blessed $self->value and $self->value->isa('AI::MXNet::NDArray'))) - { - $self->value(AI::MXNet::NDArray->array($self->value, dtype => $self->dtype)); - } - $self->shape($self->value->shape); - my $init = "AI::MXNet::Gluon::Constant::Init_${\ $self->name }_${\ refaddr($self) }"; - my $tmp =<<"EOP"; - package $init; - use Mouse; - extends 'AI::MXNet::Initializer'; - sub _init_weight - { - \$self->value->copyto(\$_[2]); - } - $init->register; - 1; -EOP - eval $tmp; - $self->init($init->new); -} - -method grad_req($req=) -{ - if(defined $req and $req ne 'null') - { - AI::MXNet::Logging->warning( - 'Constant parameter "%s" does not support '. - 'grad_req other than "null", and new value "%s" '. - 'is ignored.', - $self->name, $req - ); - } - return 'null'; -} - -package AI::MXNet::Gluon::ParameterDict; -use AI::MXNet::Base; -=head1 NAME - - AI::MXNet::Gluon::ParameterDict - A dictionary managing a set of parameters. -=cut - -=head1 DESCRIPTION - - Parameters - ---------- - prefix : str, default '' - The prefix to be prepended to all Parameters' names created by this dict. - shared : ParameterDict or undef - If not undef, when this dict's `get` method creates a new parameter, will - first try to retrieve it from `shared` dict. Usually used for sharing - parameters with another `Block`. -=cut - -use Mouse; -has _prefix => (is => 'ro', isa => 'Str', init_arg => 'prefix', default => ''); -has _shared => (is => 'rw', isa => 'Maybe[AI::MXNet::Gluon::ParameterDict]', init_arg => 'shared'); -has _params => (is => 'rw', init_arg => undef); - -around BUILDARGS => \&AI::MXNet::Base::process_arguments; -method python_constructor_arguments() { [qw/prefix shared/] } - -sub BUILD -{ - my $self = shift; - $self->_params(Hash::Ordered->new); -} - -use overload - '""' => sub { - my $self = shift; - my $name = $self->_prefix ? $self->_prefix." " : ''; - my $content = join("\n", map { AI::MXNet::Base::_indent(" $_", 2) } $self->values); - return "$name(\n$content\n)"; - }, - '@{}' => sub { my @tmp = shift->_params->as_list; \@tmp }, - fallback => 1; - -method items() -{ - return @{$self}; -} - -method keys() -{ - return $self->_params->keys; -} - -method values() -{ - return $self->_params->values; -} - -method prefix() -{ - $self->_prefix; -} - -method params() -{ - $self->_params; -} - -method _get_impl($name) -{ - if($self->_params->exists($name)) - { - return $self->_params->get($name); - } - if(defined $self->_shared and $self->_shared->_params->exists($name)) - { - $self->_params->set($name => $self->_shared->_params->get($name)); - return $self->_params->get($name); - } - return undef; -} - -=head2 get - - Retrieves a 'AI::MXNet::Gluon::Parameter' with name '$self->prefix.$name'. If not found, - 'get' will first try to retrieve it from 'shared' dict. If still not - found, 'get' will create a new 'AI::MXNet::Gluon::Parameter' with key-word arguments and - insert it to self. - - Parameters - ---------- - name : str - Name of the desired Parameter. 
It will be prepended with this dictionary's - prefix. - %kwargs : hash - The rest of key-word arguments for the created `Parameter`. - - Returns - ------- - Parameter - The created or retrieved `Parameter`. -=cut - -use Data::Dumper; -method get(Str $name, %kwargs) -{ - $name = $self->prefix . $name; - my $param = $self->_get_impl($name); - if(not defined $param) - { - $param = AI::MXNet::Gluon::Parameter->new($name, %kwargs); - $self->_params->set($name => $param); - } - else - { - while(my ($k, $v) = each %kwargs) - { - if($param->can($k)) - { - if(defined $param->$k) - { - my $existing = $param->$k; - if($k eq 'shape' and @{$v} == @{$existing}) - { - my @inferred_shape; - my $matched = 1; - for(zip($v, $existing)) - { - my ($dim1, $dim2) = @$_; - if($dim1 != $dim2 and $dim1 * $dim2 != 0) - { - $matched = 0; - last; - } - elsif($dim1 == $dim2) - { - push @inferred_shape, $dim1; - } - elsif($dim1 == 0) - { - push @inferred_shape, $dim2; - } - else - { - push @inferred_shape, $dim1; - } - } - if($matched) - { - $param->_shape(\@inferred_shape); - next; - } - } - elsif($k eq 'dtype' and ($v//'') eq ($existing//'')) - { - next; - } - assert( - (not defined $v or Dumper($v) eq Dumper($param->$k)), - "Cannot retrieve Parameter $name because desired attribute ". - "does not match with stored for attribute $k: ". - "desired ".Dumper($v)." vs stored ". Dumper($param->$k) - ); - } - else - { - $param->$k($v); - } - } - else - { - confess("unknown param $k, $v"); - } - } - } - return $param; -} - -=head2 update - - Copies all Parameters in $other to self. -=cut - -method update($other, Maybe[Str] $select=) -{ - my @keys = $other->keys; - for my $k (grep { not defined $select or /$select/ } @keys) - { - if($self->_params->exists($k)) - { - assert( - ($self->_params->get($k) eq $other->_params->get($k)), - "Cannot update self with other because they have different ". - "Parameters with the same name $k" - ); - } - else - { - $self->_params->set($k => $other->_params->get($k)); - } - } -} - -=head2 get_constant - - Retrieves AI::MXNet::Gluon::Constant with name $self->prefix.$name. If not found, - 'get' will first try to retrieve it from "shared" dictionary. If still not - found, 'get' will create a new Constant with key-word - arguments and insert it to self. - - Parameters - ---------- - name : str - Name of the desired Constant. It will be prepended with this dictionary's - prefix. - value : array-like - Initial value of constant. - - Returns - ------- - Constant - The created or retrieved Constant. -=cut - -method get_constant(Str $name, Maybe[AcceptableInput] $value=) -{ - $name = $self->prefix . $name; - my $param = $self->_get_impl($name); - if(not defined $param) - { - if(not defined $value) - { - confess( - "No constant named '$name'. Please specify value ". - "if you want to create a new constant." - ); - } - $param = AI::MXNet::Gluon::Constant->new($name, $value); - $self->_params->set($name, $param); - } - elsif(defined $value) - { - confess("reinit of Constant $name is not allowed"); - } - return $param; -} - -=head2 initialize - - Initializes all Parameters managed by this dictionary to be used for 'NDArray' - API. It has no effect when using 'Symbol' API. - - Parameters - ---------- - :$init : Initializer - Global default Initializer to be used when AI::MXNet::Gluon::Parameter->init is undef. - Otherwise, AI::MXNet::Gluon::Parameter->init takes precedence. - :$ctx : AI::MXNet::Context or array ref of AI::MXNet::Context objects - Keeps a copy of Parameters on one or many context(s). 
-    :$force_reinit : bool, default False
-        Whether to force re-initialization if parameter is already initialized.
-    :$verbose : bool, default False
-        Whether to verbosely print out details on initialization.
-=cut
-
-method initialize(
-    Initializer :$init=AI::MXNet::Initializer->Uniform(),
-    Maybe[AI::MXNet::Context|ArrayRef[AI::MXNet::Context]] :$ctx=,
-    Bool :$verbose=0,
-    Bool :$force_reinit=0
-)
-{
-    if($verbose)
-    {
-        $init->set_verbosity(verbose=>$verbose);
-    }
-    $_->initialize(ctx => $ctx, default_init => $init, force_reinit => $force_reinit) for $self->values;
-}
-
-=head2 zero_grad
-
-    Sets all Parameters' gradient buffer to 0.
-=cut
-
-method zero_grad()
-{
-    $_->zero_grad for $self->values;
-}
-
-=head2 reset_ctx
-
-    Re-assign all Parameters to other contexts.
-
-    $ctx : AI::MXNet::Context or array ref of AI::MXNet::Context objects, defaults to AI::MXNet::Context->current_ctx().
-        Assign Parameter to given context. If $ctx is an array ref of AI::MXNet::Context objects, a
-        copy will be made for each context.
-=cut
-
-method reset_ctx(AI::MXNet::Context|ArrayRef[AI::MXNet::Context] $ctx=AI::MXNet::Context->current_ctx)
-{
-    $_->reset_ctx($ctx) for $self->values;
-}
-
-=head2 setattr
-
-    Set an attribute to a new value for all Parameters.
-
-    For example, set grad_req to null if you don't need gradient w.r.t a
-    model's Parameters::
-
-        $model->collect_params()->setattr(grad_req => 'null');
-
-    or change the learning rate multiplier::
-
-        $model->collect_params()->setattr(lr_mult => 0.5);
-
-    Parameters
-    ----------
-    $name : str
-        Name of the attribute.
-    $value : valid type for attribute name
-        The new value for the attribute.
-=cut
-
-method setattr($name, $value)
-{
-    $_->$name($value) for $self->values;
-}
-
-
-=head2 save
-
-    Save parameters to file.
-
-    $filename : str
-        Path to parameter file.
-    $strip_prefix : str, default ''
-        Strip prefix from parameter names before saving.
-=cut
-
-method save(Str $filename, Str $strip_prefix='')
-{
-    my %arg_dict = ();
-    for my $param ($self->values())
-    {
-        my $weight = $param->_reduce();
-        if(not $param->name =~ /^$strip_prefix/)
-        {
-            confess(
-                "Prefix '$strip_prefix' is to be stripped before saving, but Parameter ".
-                "'${\ $param->name }' does not start with '$strip_prefix'. If you are using Block->save_params, ".
-                "this may be because your Block shares parameters from other ".
-                "Blocks or you forgot to use `with name_scope()` during init. ".
-                "Consider switching to Block->collect_params->save and ".
-                "Block->collect_params->load instead."
-            );
-        }
-        $arg_dict{ substr($param->name, length $strip_prefix) } = $weight;
-    }
-    AI::MXNet::NDArray->save($filename, \%arg_dict);
-}
-
-=head2 load
-
-    Load parameters from file.
-
-    $filename : str
-        Path to parameter file.
-    :$ctx : AI::MXNet::Context or array ref of AI::MXNet::Context objects
-        Context(s) to initialize loaded parameters on.
-    :$allow_missing : bool, default False
-        Whether to silently skip loading parameters not represented in the file.
-    :$ignore_extra : bool, default False
-        Whether to silently ignore parameters from the file that are not
-        present in this ParameterDict.
-    :$restore_prefix : str, default ''
-        Prepend this prefix to names of stored parameters before loading.
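-
-    Example (a sketch of a save/load round trip; $net stands for any
-    initialized Block, and 'net.params' is an illustrative filename):
-
-    >>> $params = $net->collect_params();
-    >>> $params->save('net.params');
-    >>> $params->load('net.params', ctx=>mx->cpu(0));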
-=cut - -method load( - Str $filename, - AI::MXNet::Context|ArrayRef[AI::MXNet::Context] :$ctx=AI::MXNet::Context->current_ctx, - Bool :$allow_missing=0, - Bool :$ignore_extra=0, - Str :$restore_prefix='' -) -{ - if($restore_prefix) - { - for my $name ($self->keys()) - { - assert( - ($name =~ /^$restore_prefix/), - "restore_prefix is $restore_prefix but Parameters name $name does not start ". - "with $restore_prefix" - ); - } - } - my $lprefix = length $restore_prefix; - my %orig_load = %{ AI::MXNet::NDArray->load($filename) }; - my %arg_dict = map { my $k = $_; s/^(?:arg|aux)://; ($restore_prefix.$_, $orig_load{$k}) } keys %orig_load; - if(not $allow_missing) - { - for my $name ($self->keys()) - { - assert( - (exists $arg_dict{ $name }), - sprintf("Parameter %s is missing in file %s", substr($name, $lprefix), $filename) - ); - } - } - for my $name (keys %arg_dict) - { - if(not $self->_params->exists($name)) - { - assert( - $ignore_extra, - sprintf( - "Parameter %s loaded from file %s is not present in ParameterDict", - substr($name, $lprefix), - $filename - ) - ); - next; - } - $self->_params->get($name)->_load_init($arg_dict{$name}, $ctx); - } -} - -__PACKAGE__->AI::MXNet::NS::register('AI::MXNet::Gluon'); - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN.pm deleted file mode 100644 index bf5736ccbb9e..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN.pm +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Gluon::RNN; -use strict; -use warnings; -use AI::MXNet::NS 'global'; -use AI::MXNet::Gluon::RNN::Layer; -use AI::MXNet::Gluon::RNN::Cell; - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm deleted file mode 100644 index 89493c7b8bfb..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm +++ /dev/null @@ -1,1325 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -use strict; -use warnings; -package AI::MXNet::Gluon::RNN::RecurrentCell; -use Mouse::Role; -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; - -method _cells_state_info($cells, $batch_size) -{ - return [map { @{ $_->state_info($batch_size) } } $cells->values]; -} - -method _cells_begin_state($cells, %kwargs) -{ - return [map { @{ $_->begin_state(%kwargs) } } $cells->values]; -} - -method _get_begin_state(GluonClass $F, $begin_state, GluonInput $inputs, $batch_size) -{ - if(not defined $begin_state) - { - if($F =~ /AI::MXNet::NDArray/) - { - my $ctx = blessed $inputs ? $inputs->context : $inputs->[0]->context; - { - local($AI::MXNet::current_ctx) = $ctx; - my $func = sub { - my %kwargs = @_; - my $shape = delete $kwargs{shape}; - return AI::MXNet::NDArray->zeros($shape, %kwargs); - }; - $begin_state = $self->begin_state(batch_size => $batch_size, func => $func); - } - } - else - { - $begin_state = $self->begin_state(batch_size => $batch_size, func => sub { return $F->zeros(@_) }); - } - } - return $begin_state; -} - - -method _format_sequence($length, $inputs, $layout, $merge, $in_layout=) -{ - assert( - (defined $inputs), - "unroll(inputs=None) has been deprecated. ". - "Please create input variables outside unroll." - ); - - my $axis = index($layout, 'T'); - my $batch_axis = index($layout, 'N'); - my $batch_size = 0; - my $in_axis = defined $in_layout ? index($in_layout, 'T') : $axis; - my $F; - if(blessed $inputs and $inputs->isa('AI::MXNet::Symbol')) - { - $F = 'AI::MXNet::Symbol'; - if(not $merge) - { - assert( - (@{ $inputs->list_outputs() } == 1), - "unroll doesn't allow grouped symbol as input. Please convert ". - "to list with list(inputs) first or let unroll handle splitting" - ); - $inputs = [ - AI::MXNet::Symbol->split( - $inputs, axis => $in_axis, num_outputs => $length, squeeze_axis => 1 - ) - ]; - } - } - elsif(blessed $inputs and $inputs->isa('AI::MXNet::NDArray')) - { - $F = 'AI::MXNet::NDArray'; - $batch_size = $inputs->shape->[$batch_axis]; - if(not $merge) - { - assert(not defined $length or $length == $inputs->shape->[$in_axis]); - $inputs = as_array( - AI::MXNet::NDArray->split( - $inputs, axis=>$in_axis, - num_outputs => $inputs->shape->[$in_axis], - squeeze_axis => 1 - ) - ); - } - } - else - { - assert(not defined $length or @{ $inputs } == $length); - if($inputs->[0]->isa('AI::MXNet::Symbol')) - { - $F = 'AI::MXNet::Symbol'; - } - else - { - $F = 'AI::MXNet::NDArray'; - $batch_size = $inputs->[0]->shape->[$batch_axis]; - } - if($merge) - { - $inputs = [map { $F->expand_dims($_, axis => $axis) } @{ $inputs }]; - $inputs = $F->stack(@{ $inputs }, axis => $axis); - $in_axis = $axis; - } - } - if(blessed $inputs and $axis != $in_axis) - { - $inputs = $F->swapaxes($inputs, dim1=>$axis, dim2=>$in_axis); - } - return ($inputs, $axis, $F, $batch_size); -} - -method _mask_sequence_variable_length($F, $data, $length, $valid_length, $time_axis, $merge) -{ - assert(defined $valid_length); - if(not blessed $data) - { - $data = $F->stack(@$data, axis=>$time_axis); - } - my $outputs = $F->SequenceMask($data, { sequence_length=>$valid_length, use_sequence_length=>1, - axis=>$time_axis}); - if(not $merge) - { - $outputs = $F->split($outputs, { num_outputs=>$length, axis=>$time_axis, - squeeze_axis=>1}); - if(not ref $outputs eq 'ARRAY') - { - $outputs = [$outputs]; - } - } - return $outputs; -} - -method _reverse_sequences($sequences, $unroll_step, $valid_length=) -{ - my $F; - if($sequences->[0]->isa('AI::MXNet::Symbol')) - { - $F = 'AI::MXNet::Symbol'; - } - else - 
{
-        $F = 'AI::MXNet::NDArray';
-    }
-
-    my $reversed_sequences;
-    if(not defined $valid_length)
-    {
-        $reversed_sequences = [reverse(@$sequences)];
-    }
-    else
-    {
-        $reversed_sequences = $F->SequenceReverse($F->stack(@$sequences, axis=>0),
-                                                  {sequence_length=>$valid_length,
-                                                   use_sequence_length=>1});
-        $reversed_sequences = $F->split($reversed_sequences, {axis=>0, num_outputs=>$unroll_step, squeeze_axis=>1});
-    }
-    return $reversed_sequences;
-}
-
-=head1 NAME
-
-    AI::MXNet::Gluon::RNN::RecurrentCell
-=cut
-
-=head1 DESCRIPTION
-
-    Abstract role for RNN cells
-
-    Parameters
-    ----------
-    prefix : str, optional
-        Prefix for names of `Block`s
-        (this prefix is also used for names of weights if `params` is `None`
-        i.e. if `params` are being created and not reused)
-    params : Parameter or None, optional
-        Container for weight sharing between cells.
-        A new Parameter container is created if `params` is `None`.
-=cut
-
-=head2 reset
-
-    Reset before re-using the cell for another graph.
-=cut
-
-method reset()
-{
-    $self->init_counter(-1);
-    $self->counter(-1);
-    $_->reset for $self->_children->values;
-}
-
-=head2 state_info
-
-    Shape and layout information of states
-=cut
-method state_info(Int $batch_size=0)
-{
-    confess('Not Implemented');
-}
-
-=head2 begin_state
-
-    Initial state for this cell.
-
-    Parameters
-    ----------
-    $func : CodeRef, default sub { AI::MXNet::NDArray->zeros(@_) }
-        Function for creating initial state.
-
-        For Symbol API, func can be `symbol.zeros`, `symbol.uniform`,
-        `symbol.var` etc. Use `symbol.var` if you want to directly
-        feed input as states.
-
-        For NDArray API, func can be `ndarray.zeros`, `ndarray.ones`, etc.
-    $batch_size: int, default 0
-        Only required for NDArray API. Size of the batch ('N' in layout)
-        dimension of input.
-
-    %kwargs :
-        Additional keyword arguments passed to func. For example
-        `mean`, `std`, `dtype`, etc.
-
-    Returns
-    -------
-    states : nested array ref of Symbol
-        Starting states for the first RNN step.
-=cut
-
-method begin_state(Int :$batch_size=0, CodeRef :$func=, %kwargs)
-{
-    $func //= sub {
-        my %kwargs = @_;
-        my $shape = delete $kwargs{shape};
-        return AI::MXNet::NDArray->zeros($shape, %kwargs);
-    };
-    assert(
-        (not $self->modified),
-        "After applying modifier cells (e.g. ZoneoutCell) the base ".
-        "cell cannot be called directly. Call the modifier cell instead."
-    );
-    my @states;
-    for my $info (@{ $self->state_info($batch_size) })
-    {
-        $self->init_counter($self->init_counter + 1);
-        if(defined $info)
-        {
-            %$info = (%$info, %kwargs);
-        }
-        else
-        {
-            $info = \%kwargs;
-        }
-        my $state = $func->(
-            name => "${\ $self->_prefix }begin_state_${\ $self->init_counter }",
-            %$info
-        );
-        push @states, $state;
-    }
-    return \@states;
-}
-
-=head2 unroll
-
-    Unrolls an RNN cell across time steps.
-
-    Parameters
-    ----------
-    $length : int
-        Number of steps to unroll.
-    $inputs : Symbol, list of Symbol, or None
-        If `inputs` is a single Symbol (usually the output
-        of Embedding symbol), it should have shape
-        (batch_size, length, ...) if `layout` is 'NTC',
-        or (length, batch_size, ...) if `layout` is 'TNC'.
-
-        If `inputs` is a list of symbols (usually output of
-        previous unroll), they should all have shape
-        (batch_size, ...).
-    :$begin_state : nested list of Symbol, optional
-        Input states created by `begin_state()`
-        or output state of another cell.
-        Created from `begin_state()` if `None`.
-    :$layout : str, optional
-        `layout` of input symbol. Only used if inputs
-        is a single Symbol.
-    :$merge_outputs : bool, optional
-        If `False`, returns outputs as a list of Symbols.
-        If `True`, concatenates output across time steps
-        and returns a single symbol with shape
-        (batch_size, length, ...) if layout is 'NTC',
-        or (length, batch_size, ...) if layout is 'TNC'.
-        If `None`, output whatever is faster.
-    :$valid_length : Symbol or NDArray, optional
-        Valid length of each sequence in the batch. Steps past a
-        sequence's valid length are masked in the outputs.
-
-    Returns
-    -------
-    outputs : list of Symbol or Symbol
-        Symbol (if `merge_outputs` is True) or list of Symbols
-        (if `merge_outputs` is False) corresponding to the output from
-        the RNN from this unrolling.
-
-    states : list of Symbol
-        The new state of this RNN after this unrolling.
-        The type of this symbol is same as the output of `begin_state()`.
-=cut
-
-method unroll(
-    Int $length,
-    Maybe[GluonInput] $inputs,
-    Maybe[GluonInput] :$begin_state=,
-    Str :$layout='NTC',
-    Maybe[Bool] :$merge_outputs=,
-    Maybe[GluonInput] :$valid_length=
-)
-{
-    $self->reset();
-    my ($F, $batch_size, $axis);
-    ($inputs, $axis, $F, $batch_size) = $self->_format_sequence($length, $inputs, $layout, 0);
-    $begin_state //= $self->_get_begin_state($F, $begin_state, $inputs, $batch_size);
-
-    my $states = $begin_state;
-    my $outputs = [];
-    my $all_states = [];
-    for my $i (0..$length-1)
-    {
-        my $output;
-        ($output, $states) = $self->($inputs->[$i], $states);
-        push @$outputs, $output;
-        if(defined $valid_length)
-        {
-            push @$all_states, $states;
-        }
-    }
-    if(defined $valid_length)
-    {
-        $states = [];
-        for(zip(@$all_states))
-        {
-            push @$states, $F->SequenceLast($F->stack(@$_, axis=>0),
-                                            sequence_length=>$valid_length,
-                                            use_sequence_length=>1,
-                                            axis=>0);
-        }
-        $outputs = $self->_mask_sequence_variable_length($F, $outputs, $length, $valid_length, $axis, 1);
-    }
-    ($outputs) = $self->_format_sequence($length, $outputs, $layout, $merge_outputs);
-    return ($outputs, $states);
-}
-
-method _get_activation(GluonClass $F, GluonInput $inputs, Activation $activation, %kwargs)
-{
-    if(not blessed $activation)
-    {
-        my %act = map { $_ => 1 } qw(tanh relu sigmoid softsign);
-        if(exists $act{$activation})
-        {
-            return $F->$activation($inputs, %kwargs)
-        }
-        return $F->Activation($inputs, act_type=>$activation, %kwargs);
-    }
-    elsif(ref($activation) =~ /LeakyReLU/)
-    {
-        return $F->LeakyReLU($inputs, act_type=>'leaky', slope => $activation->alpha, %kwargs);
-    }
-    else
-    {
-        return $activation->($inputs, %kwargs);
-    }
-}
-
-=head2 forward
-
-    Unrolls the recurrent cell for one time step.
-
-    Parameters
-    ----------
-    inputs : sym.Variable
-        Input symbol, 2D, of shape (batch_size * num_units).
-    states : list of sym.Variable
-        RNN state from previous step or the output of begin_state().
-
-    Returns
-    -------
-    output : Symbol
-        Symbol corresponding to the output from the RNN when unrolling
-        for a single time step.
-    states : list of Symbol
-        The new state of this RNN after this unrolling.
-        The type of this symbol is same as the output of `begin_state()`.
-        This can be used as an input state to the next time step
-        of this RNN.
-
-    See Also
-    --------
-    begin_state: This function can provide the states for the first time step.
-    unroll: This function unrolls an RNN for a given number of (>=1) time steps.
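-
-    Example (a minimal sketch of a single step; `gluon` as exported by
-    `use AI::MXNet::Gluon qw(gluon)`, with the input size inferred on the
-    first call via deferred initialization):
-
-    >>> $cell = gluon->rnn->RNNCell(20);
-    >>> $cell->initialize();
-    >>> $states = $cell->begin_state(batch_size=>4);
-    >>> $x = mx->nd->random->normal(shape=>[4, 10]);
-    >>> my ($output, $next_states) = $cell->($x, $states); # $output: (4, 20)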
-=cut - -package AI::MXNet::Gluon::RNN::HybridRecurrentCell; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::HybridBlock'; -with 'AI::MXNet::Gluon::RNN::RecurrentCell'; -has 'modified' => (is => 'rw', isa => 'Bool', default => 0); -has [qw/counter - init_counter/] => (is => 'rw', isa => 'Int', default => -1); - -sub BUILD -{ - my $self = shift; - $self->reset; -} - -use overload '""' => sub { - my $self = shift; - my $s = '%s(%s'; - if($self->can('activation')) - { - $s .= ", ${\ $self->activation }"; - } - $s .= ')'; - my $mapping = $self->input_size ? $self->input_size . " -> " . $self->hidden_size : $self->hidden_size; - return sprintf($s, $self->_class_name, $mapping); -}; - -method forward(GluonInput $inputs, Maybe[GluonInput|ArrayRef[GluonInput]] $states) -{ - $self->counter($self->counter + 1); - $self->SUPER::forward($inputs, $states); -} - -package AI::MXNet::Gluon::RNN::RNNCell; -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::RNN::HybridRecurrentCell'; - -=head1 NAME - - AI::MXNet::Gluon::RNN::RNNCell -=cut - -=head1 DESCRIPTION - - Simple recurrent neural network cell. - - Parameters - ---------- - hidden_size : int - Number of units in output symbol - activation : str or Symbol, default 'tanh' - Type of activation function. - i2h_weight_initializer : str or Initializer - Initializer for the input weights matrix, used for the linear - transformation of the inputs. - h2h_weight_initializer : str or Initializer - Initializer for the recurrent weights matrix, used for the linear - transformation of the recurrent state. - i2h_bias_initializer : str or Initializer - Initializer for the bias vector. - h2h_bias_initializer : str or Initializer - Initializer for the bias vector. - prefix : str, default 'rnn_' - Prefix for name of `Block`s - (and name of weight if params is `None`). - params : Parameter or None - Container for weight sharing between cells. - Created if `None`. 
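-
-    Example (a minimal sketch; unrolls 3 steps over an NDArray in 'NTC'
-    layout, with input_size inferred via deferred initialization):
-
-    >>> $cell = gluon->rnn->RNNCell(100, prefix=>'rnn_');
-    >>> $cell->initialize();
-    >>> $inputs = mx->nd->random->normal(shape=>[32, 3, 50]);
-    >>> my ($outputs, $states) = $cell->unroll(3, $inputs, merge_outputs=>1);
-    >>> # $outputs: (32, 3, 100), $states->[0]: (32, 100)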
-=cut
-
-has 'hidden_size' => (is => 'rw', isa => 'Int', required => 1);
-has 'activation' => (is => 'rw', isa => 'Activation', default => 'tanh');
-has [qw/
-        i2h_weight_initializer
-        h2h_weight_initializer
-    /] => (is => 'rw', isa => 'Maybe[Initializer]');
-has [qw/
-        i2h_bias_initializer
-        h2h_bias_initializer
-    /] => (is => 'rw', isa => 'Maybe[Initializer]', default => 'zeros');
-has 'input_size' => (is => 'rw', isa => 'Int', default => 0);
-has [qw/
-        i2h_weight
-        h2h_weight
-        i2h_bias
-        h2h_bias
-    /] => (is => 'rw', init_arg => undef);
-
-method python_constructor_arguments()
-{
-    [qw/
-        hidden_size activation
-        i2h_weight_initializer h2h_weight_initializer
-        i2h_bias_initializer h2h_bias_initializer
-        input_size
-    /];
-}
-
-sub BUILD
-{
-    my $self = shift;
-    $self->i2h_weight($self->params->get(
-        'i2h_weight', shape=>[$self->hidden_size, $self->input_size],
-        init => $self->i2h_weight_initializer,
-        allow_deferred_init => 1
-    ));
-    $self->h2h_weight($self->params->get(
-        'h2h_weight', shape=>[$self->hidden_size, $self->hidden_size],
-        init => $self->h2h_weight_initializer,
-        allow_deferred_init => 1
-    ));
-    $self->i2h_bias($self->params->get(
-        'i2h_bias', shape=>[$self->hidden_size],
-        init => $self->i2h_bias_initializer,
-        allow_deferred_init => 1
-    ));
-    $self->h2h_bias($self->params->get(
-        'h2h_bias', shape=>[$self->hidden_size],
-        init => $self->h2h_bias_initializer,
-        allow_deferred_init => 1
-    ));
-}
-
-method state_info(Int $batch_size=0)
-{
-    return [{ shape => [$batch_size, $self->hidden_size], __layout__ => 'NC' }];
-}
-
-method _alias() { 'rnn' }
-
-method hybrid_forward(
-    GluonClass $F, GluonInput $inputs, GluonInput $states,
-    GluonInput :$i2h_weight, GluonInput :$h2h_weight, GluonInput :$i2h_bias, GluonInput :$h2h_bias
-)
-{
-    my $prefix = "t${\ $self->counter}_";
-    my $i2h = $F->FullyConnected(
-        data => $inputs, weight => $i2h_weight, bias => $i2h_bias,
-        num_hidden => $self->hidden_size,
-        name => "${prefix}i2h"
-    );
-    my $h2h = $F->FullyConnected(
-        data => $states->[0], weight => $h2h_weight, bias => $h2h_bias,
-        num_hidden => $self->hidden_size,
-        name => "${prefix}h2h"
-    );
-    my $i2h_plus_h2h = $F->elemwise_add($i2h, $h2h, name => "${prefix}plus0");
-    my $output = $self->_get_activation($F, $i2h_plus_h2h, $self->activation, name => "${prefix}out");
-    return ($output, [$output]);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::RNN');
-
-package AI::MXNet::Gluon::RNN::LSTMCell;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::RNN::HybridRecurrentCell';
-
-=head1 NAME
-
-    AI::MXNet::Gluon::RNN::LSTMCell
-=cut
-
-=head1 DESCRIPTION
-
-    Long Short-Term Memory (LSTM) network cell.
-
-    Parameters
-    ----------
-    hidden_size : int
-        Number of units in output symbol.
-    i2h_weight_initializer : str or Initializer
-        Initializer for the input weights matrix, used for the linear
-        transformation of the inputs.
-    h2h_weight_initializer : str or Initializer
-        Initializer for the recurrent weights matrix, used for the linear
-        transformation of the recurrent state.
-    i2h_bias_initializer : str or Initializer, default 'zeros'
-        Initializer for the bias vector.
-    h2h_bias_initializer : str or Initializer
-        Initializer for the bias vector.
-    prefix : str, default 'lstm_'
-        Prefix for name of `Block`s
-        (and name of weight if params is `None`).
-    params : Parameter or None
-        Container for weight sharing between cells.
-        Created if `None`.
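-
-    Example (a minimal sketch of a single LSTM step; the state is the
-    pair [$h, $c]):
-
-    >>> $cell = gluon->rnn->LSTMCell(100);
-    >>> $cell->initialize();
-    >>> $states = $cell->begin_state(batch_size=>32);
-    >>> $x = mx->nd->random->normal(shape=>[32, 50]);
-    >>> my ($out, $new_states) = $cell->($x, $states); # @$new_states == 2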
-=cut
-
-has 'hidden_size' => (is => 'rw', isa => 'Int', required => 1);
-has [qw/
-        i2h_weight_initializer
-        h2h_weight_initializer
-    /] => (is => 'rw', isa => 'Maybe[Initializer]');
-has [qw/
-        i2h_bias_initializer
-        h2h_bias_initializer
-    /] => (is => 'rw', isa => 'Maybe[Initializer]', default => 'zeros');
-has 'input_size' => (is => 'rw', isa => 'Int', default => 0);
-has [qw/
-        i2h_weight
-        h2h_weight
-        i2h_bias
-        h2h_bias
-    /] => (is => 'rw', init_arg => undef);
-
-method python_constructor_arguments()
-{
-    [qw/
-        hidden_size
-        i2h_weight_initializer h2h_weight_initializer
-        i2h_bias_initializer h2h_bias_initializer
-        input_size
-    /];
-}
-
-
-sub BUILD
-{
-    my $self = shift;
-    $self->i2h_weight($self->params->get(
-        'i2h_weight', shape=>[4*$self->hidden_size, $self->input_size],
-        init => $self->i2h_weight_initializer,
-        allow_deferred_init => 1
-    ));
-    $self->h2h_weight($self->params->get(
-        'h2h_weight', shape=>[4*$self->hidden_size, $self->hidden_size],
-        init => $self->h2h_weight_initializer,
-        allow_deferred_init => 1
-    ));
-    $self->i2h_bias($self->params->get(
-        'i2h_bias', shape=>[4*$self->hidden_size],
-        init => $self->i2h_bias_initializer,
-        allow_deferred_init => 1
-    ));
-    $self->h2h_bias($self->params->get(
-        'h2h_bias', shape=>[4*$self->hidden_size],
-        init => $self->h2h_bias_initializer,
-        allow_deferred_init => 1
-    ));
-}
-
-method state_info(Int $batch_size=0)
-{
-    return [
-        { shape => [$batch_size, $self->hidden_size], __layout__ => 'NC' },
-        { shape => [$batch_size, $self->hidden_size], __layout__ => 'NC' }
-    ];
-}
-
-method _alias() { 'lstm' }
-
-method hybrid_forward(
-    GluonClass $F, GluonInput $inputs, GluonInput $states,
-    GluonInput :$i2h_weight, GluonInput :$h2h_weight, GluonInput :$i2h_bias, GluonInput :$h2h_bias
-)
-{
-    my $prefix = "t${\ $self->counter}_";
-    my $i2h = $F->FullyConnected(
-        $inputs, $i2h_weight, $i2h_bias,
-        num_hidden => $self->hidden_size*4,
-        name => "${prefix}i2h"
-    );
-    my $h2h = $F->FullyConnected(
-        $states->[0], $h2h_weight, $h2h_bias,
-        num_hidden => $self->hidden_size*4,
-        name => "${prefix}h2h"
-    );
-    my $gates = $F->elemwise_add($i2h, $h2h, name => "${prefix}plus0");
-    my @slice_gates = @{ $F->SliceChannel($gates, num_outputs => 4, name => "${prefix}slice") };
-    my $in_gate = $F->Activation($slice_gates[0], act_type=>"sigmoid", name => "${prefix}i");
-    my $forget_gate = $F->Activation($slice_gates[1], act_type=>"sigmoid", name => "${prefix}f");
-    my $in_transform = $F->Activation($slice_gates[2], act_type=>"tanh", name => "${prefix}c");
-    my $out_gate = $F->Activation($slice_gates[3], act_type=>"sigmoid", name => "${prefix}o");
-    my $next_c = $F->_plus(
-        $F->elemwise_mul($forget_gate, $states->[1], name => "${prefix}mul0"),
-        $F->elemwise_mul($in_gate, $in_transform, name => "${prefix}mul1"),
-        name => "${prefix}state"
-    );
-    my $next_h = $F->_mul($out_gate, $F->Activation($next_c, act_type=>"tanh", name => "${prefix}activation0"), name => "${prefix}out");
-    return ($next_h, [$next_h, $next_c]);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::RNN');
-
-package AI::MXNet::Gluon::RNN::GRUCell;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::RNN::HybridRecurrentCell';
-
-=head1 NAME
-
-    AI::MXNet::Gluon::RNN::GRUCell
-=cut
-
-=head1 DESCRIPTION
-
-    Gated Recurrent Unit (GRU) network cell.
-    Note: this is an implementation of the cuDNN version of GRUs
-    (slight modification compared to Cho et al. 2014).
-
-    Parameters
-    ----------
-    hidden_size : int
-        Number of units in output symbol.
- i2h_weight_initializer : str or Initializer - Initializer for the input weights matrix, used for the linear - transformation of the inputs. - h2h_weight_initializer : str or Initializer - Initializer for the recurrent weights matrix, used for the linear - transformation of the recurrent state. - i2h_bias_initializer : str or Initializer - Initializer for the bias vector. - h2h_bias_initializer : str or Initializer - Initializer for the bias vector. - prefix : str, default 'gru_' - prefix for name of `Block`s - (and name of weight if params is `None`). - params : Parameter or None - Container for weight sharing between cells. - Created if `None`. -=cut - -has 'hidden_size' => (is => 'rw', isa => 'Int', required => 1); -has [qw/ - i2h_weight_initializer - h2h_weight_initializer - /] => (is => 'rw', isa => 'Maybe[Initializer]'); -has [qw/ - i2h_bias_initializer - h2h_bias_initializer - /] => (is => 'rw', isa => 'Maybe[Initializer]', default => 'zeros'); -has 'input_size' => (is => 'rw', isa => 'Int', default => 0); -has [qw/ - i2h_weight - h2h_weight - i2h_bias - h2h_bias - /] => (is => 'rw', init_arg => undef); - -method python_constructor_arguments() -{ - [qw/ - hidden_size - i2h_weight_initializer h2h_weight_initializer - i2h_bias_initializer h2h_bias_initializer - input_size - /]; -} - -sub BUILD -{ - my $self = shift; - $self->i2h_weight($self->params->get( - 'i2h_weight', shape=>[3*$self->hidden_size, $self->input_size], - init => $self->i2h_weight_initializer, - allow_deferred_init => 1 - )); - $self->h2h_weight($self->params->get( - 'h2h_weight', shape=>[3*$self->hidden_size, $self->hidden_size], - init => $self->h2h_weight_initializer, - allow_deferred_init => 1 - )); - $self->i2h_bias($self->params->get( - 'i2h_bias', shape=>[3*$self->hidden_size], - init => $self->i2h_bias_initializer, - allow_deferred_init => 1 - )); - $self->h2h_bias($self->params->get( - 'h2h_bias', shape=>[3*$self->hidden_size], - init => $self->h2h_bias_initializer, - allow_deferred_init => 1 - )); -} - -method state_info(Int $batch_size=0) -{ - return [{ shape => [$batch_size, $self->hidden_size], __layout__ => 'NC' }]; -} - -method _alias() { 'gru' } - -method hybrid_forward( - GluonClass $F, GluonInput $inputs, GluonInput $states, - GluonInput :$i2h_weight, GluonInput :$h2h_weight, GluonInput :$i2h_bias, GluonInput :$h2h_bias -) -{ - my $prefix = "t${\ $self->counter}_"; - my $prev_state_h = $states->[0]; - my $i2h = $F->FullyConnected( - $inputs, $i2h_weight, $i2h_bias, - num_hidden => $self->hidden_size*3, - name => "${prefix}i2h" - ); - my $h2h = $F->FullyConnected( - $states->[0], $h2h_weight, $h2h_bias, - num_hidden => $self->hidden_size*3, - name => "${prefix}h2h" - ); - my ($i2h_r, $i2h_z, $h2h_r, $h2h_z); - ($i2h_r, $i2h_z, $i2h) = @{ $F->SliceChannel($i2h, num_outputs => 3, name => "${prefix}i2h_slice") }; - ($h2h_r, $h2h_z, $h2h) = @{ $F->SliceChannel($h2h, num_outputs => 3, name => "${prefix}h2h_slice") }; - my $reset_gate = $F->Activation($F->elemwise_add($i2h_r, $h2h_r, name => "${prefix}plus0"), act_type=>"sigmoid", name => "${prefix}r_act"); - my $update_gate = $F->Activation($F->elemwise_add($i2h_z, $h2h_z, name => "${prefix}plus1"), act_type=>"sigmoid", name => "${prefix}z_act"); - my $next_h_tmp = $F->Activation( - $F->elemwise_add( - $i2h, - $F->elemwise_mul( - $reset_gate, $h2h, name => "${prefix}mul0" - ), - name => "${prefix}plus2" - ), - act_type => "tanh", - name => "${prefix}h_act" - ); - my $ones = $F->ones_like($update_gate, name => "${prefix}ones_like0"); - my $next_h = 
$F->_plus(
- $F->elemwise_mul(
- $F->elemwise_sub($ones, $update_gate, name => "${prefix}minus0"),
- $next_h_tmp,
- name => "${prefix}mul1"
- ),
- $F->elemwise_mul($update_gate, $prev_state_h, name => "${prefix}mul2"),
- name => "${prefix}out"
- );
- return ($next_h, [$next_h]);
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::RNN');
-
-package AI::MXNet::Gluon::RNN::SequentialRNNCell;
-use AI::MXNet::Gluon::Mouse;
-use AI::MXNet::Base;
-no warnings 'redefine';
-extends 'AI::MXNet::Gluon::Block';
-with 'AI::MXNet::Gluon::RNN::RecurrentCell';
-has 'modified' => (is => 'rw', isa => 'Bool', default => 0);
-has [qw/counter
- init_counter/] => (is => 'rw', isa => 'Int', default => -1);
-
-sub BUILD
-{
- my $self = shift;
- $self->reset;
-}
-
-=head1 NAME
-
- AI::MXNet::Gluon::RNN::SequentialRNNCell
-=cut
-
-=head1 DESCRIPTION
-
- Sequentially stacking multiple RNN cells.
-=cut
-
-=head2 add
-
- Appends a cell into the stack.
-
- Parameters
- ----------
- cell : rnn cell
-=cut
-
-method add(AI::MXNet::Gluon::Block $cell)
-{
- $self->register_child($cell);
-}
-
-method state_info(Int $batch_size=0)
-{
- return $self->_cells_state_info($self->_children, $batch_size);
-}
-
-method begin_state(%kwargs)
-{
- assert(
- (not $self->modified),
- "After applying modifier cells (e.g. ZoneoutCell) the base ".
- "cell cannot be called directly. Call the modifier cell instead."
- );
- return $self->_cells_begin_state($self->_children, %kwargs);
-}
-
-method unroll(Int $length, GluonInput $inputs, Maybe[GluonInput] :$begin_state=, Str :$layout='NTC', Maybe[Bool] :$merge_outputs=)
-{
- $self->reset();
- my ($F, $batch_size);
- ($inputs, undef, $F, $batch_size) = $self->_format_sequence($length, $inputs, $layout, undef);
- my $num_cells = $self->_children->keys;
- $begin_state = $self->_get_begin_state($F, $begin_state, $inputs, $batch_size);
- my $p = 0;
- my @next_states;
- my $states;
- enumerate(sub {
- my ($i, $cell) = @_;
- my $n = @{ $cell->state_info() };
- $states = [@{ $begin_state }[$p..$p+$n-1]];
- $p += $n;
- ($inputs, $states) = $cell->unroll(
- $length, $inputs, begin_state => $states, layout => $layout,
- merge_outputs => ($i < ($num_cells - 1)) ? undef : $merge_outputs
- );
- push @next_states, @{ $states };
- }, [$self->_children->values]);
- return ($inputs, \@next_states);
-}
-
-method call($inputs, $states)
-{
- $self->counter($self->counter + 1);
- my @next_states;
- my $p = 0;
- for my $cell ($self->_children->values)
- {
- assert(not $cell->isa('AI::MXNet::Gluon::RNN::BidirectionalCell'));
- my $n = @{ $cell->state_info() };
- my $state = [@{ $states }[$p..$p+$n-1]];
- $p += $n;
- ($inputs, $state) = $cell->($inputs, $state);
- push @next_states, @{ $state };
- }
- return ($inputs, \@next_states);
-}
-
-use overload '@{}' => sub { [shift->_children->values] };
-use overload '""' => sub {
- my $self = shift;
- my $s = "%s(\n%s\n)";
- my @children;
- enumerate(sub {
- my ($i, $m) = @_;
- push @children, "($i): ". AI::MXNet::Base::_indent("$m", 2);
- }, [$self->_children->values]);
- return sprintf($s, $self->_class_name, join("\n", @children));
-};
-
-method hybrid_forward(@args)
-{
- confess('Not Implemented');
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::RNN');
-
-package AI::MXNet::Gluon::RNN::DropoutCell;
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::RNN::HybridRecurrentCell';
-
-=head1 NAME
-
- AI::MXNet::Gluon::RNN::DropoutCell
-=cut
-
-=head1 DESCRIPTION
-
- Applies dropout on input.
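 For context, a hedged sketch of how the sequential container and the dropout cell removed here
 were meant to compose; layer sizes, the `mx` alias, and the input shape are illustrative
 assumptions, not taken from this diff:

    my $stack = AI::MXNet::Gluon::RNN::SequentialRNNCell->new;
    $stack->add(AI::MXNet::Gluon::RNN::LSTMCell->new(hidden_size => 100));
    $stack->add(AI::MXNet::Gluon::RNN::DropoutCell->new(rate => 0.5));
    $stack->add(AI::MXNet::Gluon::RNN::LSTMCell->new(hidden_size => 100));
    $stack->initialize();
    # unroll 5 time steps; layout 'NTC' means (batch, time, channels)
    my $seq = mx->nd->ones([32, 5, 50]);
    my ($outputs, $states) = $stack->unroll(5, $seq, merge_outputs => 1);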
- - Parameters - ---------- - rate : float - Percentage of elements to drop out, which - is 1 - percentage to retain. -=cut - -has 'rate' => (is => 'ro', isa => 'Num', required => 1); -method python_constructor_arguments() { ['rate'] } - -method state_info(Int $batch_size=0) { [] } - -method _alias() { 'dropout' } - -method hybrid_forward(GluonClass $F, GluonInput $inputs, GluonInput $states) -{ - if($self->rate > 0) - { - $inputs = $F->Dropout($inputs, p => $self->rate, name => "t${\ $self->counter }_fwd"); - } - return ($inputs, $states); -} - -method unroll(Int $length, GluonInput $inputs, Maybe[GluonInput] :$begin_state=, Str :$layout='NTC', Maybe[Bool] :$merge_outputs=) -{ - $self->reset; - my $F; - ($inputs, undef, $F) = $self->_format_sequence($length, $inputs, $layout, $merge_outputs); - if(blessed $inputs) - { - return $self->hybrid_forward($F, $inputs, $begin_state//[]); - } - else - { - return $self->SUPER::unroll( - $length, $inputs, begin_state => $begin_state, layout => $layout, - merge_outputs => $merge_outputs - ); - } -} - -use overload '""' => sub { - my $self = shift; - return $self->_class_name.'(rate ='.$self->rate.')'; -}; - -__PACKAGE__->register('AI::MXNet::Gluon::RNN'); - -package AI::MXNet::Gluon::RNN::ModifierCell; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::RNN::HybridRecurrentCell'; -has 'base_cell' => (is => 'rw', isa => 'AI::MXNet::Gluon::RNN::HybridRecurrentCell', required => 1); - -=head1 NAME - - AI::MXNet::Gluon::RNN::ModifierCell -=cut - -=head1 DESCRIPTION - - Base class for modifier cells. A modifier - cell takes a base cell, apply modifications - on it (e.g. Zoneout), and returns a new cell. - - After applying modifiers the base cell should - no longer be called directly. The modifier cell - should be used instead. -=cut - - -sub BUILD -{ - my $self = shift; - assert( - (not $self->base_cell->modified), - "Cell ${\ $self->base_cell->name } is already modified. One cell cannot be modified twice" - ); - $self->base_cell->modified(1); -} - -method params() -{ - return $self->base_cell->params; -} - -method state_info(Int $batch_size=0) -{ - return $self->base_cell->state_info($batch_size); - -} - -method begin_state(CodeRef :$func=sub{ AI::MXNet::Symbol->zeros(@_) }, %kwargs) -{ - assert( - (not $self->modified), - "After applying modifier cells (e.g. DropoutCell) the base ". - "cell cannot be called directly. Call the modifier cell instead." - ); - $self->base_cell->modified(0); - my $begin = $self->base_cell->begin_state(func => $func, %kwargs); - $self->base_cell->modified(1); - return $begin; -} - -method hybrid_forward(GluonClass $F, GluonInput $inputs, GluonInput $states) -{ - confess('Not Implemented'); -} - -use overload '""' => sub { - my $self = shift; - return $self->_class_name.'('.$self->base_cell.')'; -}; - -package AI::MXNet::Gluon::RNN::ZoneoutCell; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::RNN::ModifierCell'; - -=head1 NAME - - AI::MXNet::Gluon::RNN::ZoneoutCell -=cut - -=head1 DESCRIPTION - - Applies Zoneout on base cell. -=cut -has [qw/zoneout_outputs - zoneout_states/] => (is => 'ro', isa => 'Num', default => 0); -has 'prev_output' => (is => 'rw', init_arg => undef); -method python_constructor_arguments() { ['base_cell', 'zoneout_outputs', 'zoneout_states'] } - -sub BUILD -{ - my $self = shift; - assert( - (not $self->base_cell->isa('AI::MXNet::Gluon::RNN::BidirectionalCell')), - "BidirectionalCell doesn't support zoneout since it doesn't support step. ". 
- "Please add ZoneoutCell to the cells underneath instead." - ); - assert( - (not $self->base_cell->isa('AI::MXNet::Gluon::RNN::SequentialRNNCel') or not $self->base_cell->bidirectional), - "Bidirectional SequentialRNNCell doesn't support zoneout. ". - "Please add ZoneoutCell to the cells underneath instead." - ); -} - -use overload '""' => sub { - my $self = shift; - return $self->_class_name.'(p_out='.$self->zoneout_outputs.', p_state='.$self->zoneout_states. - ', '.$self->base_cell.')'; -}; - -method _alias() { 'zoneout' } - -method reset() -{ - $self->SUPER::reset(); - $self->prev_output(undef); -} - -method hybrid_forward(GluonClass $F, GluonInput $inputs, GluonInput $states) -{ - my ($cell, $p_outputs, $p_states) = ($self->base_cell, $self->zoneout_outputs, $self->zoneout_states); - my ($next_output, $next_states) = $cell->($inputs, $states); - my $mask = sub { my ($p, $like) = @_; $F->Dropout($F->ones_like($like), p=>$p) }; - - my $prev_output = $self->prev_output//$F->zeros_like($next_output); - my $output = $p_outputs != 0 ? $F->where($mask->($p_outputs, $next_output), $next_output, $prev_output) : $next_output; - if($p_states != 0) - { - my @tmp; - for(zip($next_states, $states)) { - my ($new_s, $old_s) = @$_; - push @tmp, $F->where($mask->($p_states, $new_s), $new_s, $old_s); - } - $states = \@tmp; - } - else - { - $states = $next_states; - } - $self->prev_output($output); - return ($output, $states); -} - -__PACKAGE__->register('AI::MXNet::Gluon::RNN'); - -package AI::MXNet::Gluon::RNN::ResidualCell; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::RNN::ModifierCell'; -method python_constructor_arguments() { ['base_cell'] } - -=head1 NAME - - AI::MXNet::Gluon::RNN::ResidualCell -=cut - -=head1 DESCRIPTION - - Adds residual connection as described in Wu et al, 2016 - (https://arxiv.org/abs/1609.08144). - Output of the cell is output of the base cell plus input. -=cut - -method hybrid_forward(GluonClas $F, GluonInput $inputs, GluonInput $states) -{ - my $output; - ($output, $states) = $self->base_cell->($inputs, $states); - $output = $F->elemwise_add($output, $inputs, name => "t${\ $self->counter }_fwd"); - return ($output, $states); -} - -method unroll(Int $length, GluonInput $inputs, Maybe[GluonInput] :$begin_state=, Str :$layout='NTC', Maybe[Bool] :$merge_outputs=) -{ - $self->reset(); - - $self->base_cell->modified(0); - my ($outputs, $states) = $self->base_cell->unroll( - $length, $inputs, begin_state => $begin_state, layout => $layout, merge_outputs => $merge_outputs - ); - $self->base_cell->modified(1); - - $merge_outputs //= blessed $outputs ? 
1 : 0; - my $F; - ($inputs, undef, $F) = $self->_format_sequence($length, $inputs, $layout, $merge_outputs); - if($merge_outputs) - { - $outputs = $F->elemwise_add($outputs, $inputs); - } - else - { - my @tmp; - for(zip($outputs, $inputs)) { - my ($i, $j) = @$_; - push @tmp, $F->elemwise_add($i, $j); - } - $outputs = \@tmp; - } - return ($outputs, $states); -} - -__PACKAGE__->register('AI::MXNet::Gluon::RNN'); - -package AI::MXNet::Gluon::RNN::BidirectionalCell; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::RNN::HybridRecurrentCell'; -has [qw/l_cell r_cell/] => (is => 'ro', isa => 'AI::MXNet::Gluon::RNN::HybridRecurrentCell', required => 1); -has 'output_prefix' => (is => 'ro', isa => 'Str', default => 'bi_'); -method python_constructor_arguments() { ['l_cell', 'r_cell', 'output_prefix'] } - -=head1 NAME - - AI::MXNet::Gluon::RNN::BidirectionalCell -=cut - -=head1 DESCRIPTION - - Bidirectional RNN cell. - - Parameters - ---------- - l_cell : RecurrentCell - Cell for forward unrolling - r_cell : RecurrentCell - Cell for backward unrolling -=cut - -method call($inputs, $states) -{ - confess("Bidirectional cell cannot be stepped. Please use unroll"); -} - -use overload '""' => sub { - my $self = shift; - "${\ $self->_class_name }(forward=${\ $self->l_cell }, backward=${\ $self->r_cell })"; -}; - -method state_info(Int $batch_size=0) -{ - return $self->_cells_state_info($self->_children, $batch_size); -} - -method begin_state(%kwargs) -{ - assert( - (not $self->modified), - "After applying modifier cells (e.g. DropoutCell) the base ". - "cell cannot be called directly. Call the modifier cell instead." - ); - return $self->_cells_begin_state($self->_children, %kwargs); -} - -method unroll(Int $length, GluonInput $inputs, Maybe[GluonInput] :$begin_state=, Str :$layout='NTC', Maybe[Bool] :$merge_outputs=) -{ - $self->reset(); - my ($axis, $F, $batch_size); - ($inputs, $axis, $F, $batch_size) = $self->_format_sequence($length, $inputs, $layout, 0); - $begin_state //= $self->_get_begin_state($F, $begin_state, $inputs, $batch_size); - - my $states = $begin_state; - my ($l_cell, $r_cell) = $self->_children->values; - $l_cell->state_info($batch_size); - my ($l_outputs, $l_states) = $l_cell->unroll( - $length, $inputs, - begin_state => [@{ $states }[0..@{ $l_cell->state_info($batch_size) }-1]], - layout => $layout, - merge_outputs => $merge_outputs - ); - my ($r_outputs, $r_states) = $r_cell->unroll( - $length, [reverse @{$inputs}], - begin_state => [@{$states}[@{ $l_cell->state_info }..@{$states}-1]], - layout => $layout, - merge_outputs => $merge_outputs - ); - if(not defined $merge_outputs) - { - $merge_outputs = blessed $l_outputs and blessed $r_outputs; - ($l_outputs) = $self->_format_sequence(undef, $l_outputs, $layout, $merge_outputs); - ($r_outputs) = $self->_format_sequence(undef, $r_outputs, $layout, $merge_outputs); - } - my $outputs; - if($merge_outputs) - { - $r_outputs = $F->reverse($r_outputs, axis=>$axis); - $outputs = $F->concat($l_outputs, $r_outputs, dim=>2, name=>$self->output_prefix.'out'); - } - else - { - $outputs = []; - enumerate(sub { - my ($i, $l_o, $r_o) = @_; - push @$outputs, $F->concat( - $l_o, $r_o, dim=>1, - name => sprintf('%st%d', $self->output_prefix, $i) - ); - }, [@{ $l_outputs }], [reverse(@{ $r_outputs })] - ); - } - $states = [@{ $l_states }, @{ $r_states }]; - return ($outputs, $states); -} - -__PACKAGE__->register('AI::MXNet::Gluon::RNN'); - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm 
b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm deleted file mode 100644 index 08212ab20f6d..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm +++ /dev/null @@ -1,776 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -package AI::MXNet::Gluon::RNN::Layer; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Gluon::HybridBlock'; - -has 'hidden_size' => (is => 'rw', isa => 'Int'); -has 'num_layers' => (is => 'rw', isa => 'Int'); -has 'layout' => (is => 'rw', isa => 'Str'); -has 'dropout' => (is => 'rw', isa => 'Num'); -has 'bidirectional' => (is => 'rw', isa => 'Bool'); -has 'input_size' => (is => 'rw', isa => 'Int', default => 0); -has 'projection_size' => (is => 'rw', isa => 'Maybe[Int]'); -has [qw/lstm_state_clip_min - lstm_state_clip_max/] => (is => 'rw', isa => 'Maybe[Num]'); -has 'lstm_state_clip_nan' => (is => 'rw', isa => 'Bool', default => 0); -has [qw/ - i2h_weight_initializer - h2h_weight_initializer - i2h_bias_initializer - h2h_bias_initializer - h2r_weight_initializer - /] => (is => 'rw', isa => 'Maybe[Initializer]'); -has 'mode' => (is => 'rw', isa => 'Str'); -has [qw/dir gates - unfused/] => (is => 'rw', init_arg => undef); - -method python_constructor_arguments() -{ - [qw/ - hidden_size num_layers layout - dropout bidirectional input_size - i2h_weight_initializer h2h_weight_initializer - i2h_bias_initializer h2h_bias_initializer - mode projection_size h2r_weight_initializer - lstm_state_clip_min lstm_state_clip_max lstm_state_clip_nan - /]; -} - -sub BUILD -{ - my $self = shift; - assert( - ($self->layout eq 'TNC' or $self->layout eq 'NTC'), - "Invalid layout [${\ $self->layout }]; must be one of ['TNC' or 'NTC']" - ); - $self->dir($self->bidirectional ? 2 : 1); - $self->gates({qw/rnn_relu 1 rnn_tanh 1 lstm 4 gru 3/}->{$self->mode}); - my ($ng, $ni, $nh) = ($self->gates, $self->input_size, $self->hidden_size); - if(not $self->projection_size) - { - for my $i (0..$self->num_layers-1) - { - for my $j ($self->dir == 2 ? ('l', 'r') : ('l')) - { - $self->_register_param( - "$j${i}_i2h_weight", [$ng*$nh, $ni], - $self->i2h_weight_initializer - ); - $self->_register_param( - "$j${i}_h2h_weight", [$ng*$nh, $nh], - $self->h2h_weight_initializer - ); - $self->_register_param( - "$j${i}_i2h_bias", [$ng*$nh], - $self->i2h_bias_initializer, - ); - $self->_register_param( - "$j${i}_h2h_bias", [$ng*$nh], - $self->h2h_bias_initializer, - ); - } - $ni = $nh * $self->dir; - } - } - else - { - my $np = $self->projection_size; - for my $i (0..$self->num_layers-1) - { - for my $j ($self->dir == 2 ? 
('l', 'r') : ('l')) - { - $self->_register_param( - "$j${i}_i2h_weight", [$ng*$nh, $ni], - $self->i2h_weight_initializer - ); - $self->_register_param( - "$j${i}_h2h_weight", [$ng*$nh, $np], - $self->h2h_weight_initializer - ); - $self->_register_param( - "$j${i}_i2h_bias", [$ng*$nh], - $self->i2h_bias_initializer, - ); - $self->_register_param( - "$j${i}_h2h_bias", [$ng*$nh], - $self->h2h_bias_initializer, - ); - $self->_register_param( - "$j${i}_h2r_weight", [$np, $nh], - $self->h2r_weight_initializer, - ); - } - $ni = $np * $self->dir; - } - } -} - -method _register_param($name, $shape, $init) -{ - my $p = $self->params->get( - $name, shape=>$shape, init=>$init, - allow_deferred_init=>1 - ); - $self->$name($p); - return $p; -} - -use overload '""' => sub { - my $self = shift; - my $name = $self->_class_name; - my $mapping = $self->input_size ? $self->input_size.' -> '.$self->hidden_size : $self->hidden_size; - my $s = "$name($mapping, ${\ $self->layout }"; - if($self->num_layers != 1) - { - $s .= ', num_layers='.$self->num_layers; - } - if($self->dropout != 0) - { - $s .= ', dropout='.$self->dropout; - } - if($self->dir == 2) - { - $s .= ', bidirectional'; - } - $s .= ')'; - return $s; -}; - -method _collect_params_with_prefix(Str $prefix='') -{ - $prefix .= '.' if($prefix); - my $pattern = qr/(l|r)(\d+)_(i2h|h2h)_(weight|bias)$/; - my $convert_key = sub { my ($m, $bidirectional) = @_; - my ($d, $l, $g, $t) = @$m; - if($bidirectional) - { - return "_unfused.$l.${d}_cell.${g}_$t"; - } - else - { - return "_unfused.$l.${g}_$t"; - } - }; - my $bidirectional = 0; - my %params = %{ $self->_reg_params }; - for my $k (keys %params) - { - $k =~ $pattern; - $bidirectional = 1 if $1 and $1 eq 'r'; - } - my %ret; - for my $k (keys %params) - { - $k =~ $pattern; - $ret{ $prefix . $convert_key->([$1, $2, $3, $4], $bidirectional) } = $params{$k}; - } - my $iter = $self->_children->iterator; - while(my ($name, $child) = $iter->()) - { - %ret = (%ret, %{ $child->_collect_params_with_prefix("$prefix$name") }); - } - return \%ret; -} - -method state_info($batch_size=0) -{ - confess('NotImplementedError'); -} - - -method _unfuse() -{ - assert((not $self->projection_size), "_unfuse does not support projection layer yet!"); - assert( - (not $self->lstm_state_clip_min and not $self->lstm_state_clip_max), - "_unfuse does not support state clipping yet!" 
- );
- my $get_cell = {
- rnn_relu => sub {
- my %kwargs = @_;
- AI::MXNet::Gluon::RNN::RNNCell->new(
- $self->hidden_size,
- activation => 'relu',
- %kwargs
- )
- },
- rnn_tanh => sub {
- my %kwargs = @_;
- AI::MXNet::Gluon::RNN::RNNCell->new(
- $self->hidden_size,
- activation => 'tanh',
- %kwargs
- )
- },
- lstm => sub {
- my %kwargs = @_;
- AI::MXNet::Gluon::RNN::LSTMCell->new(
- $self->hidden_size,
- %kwargs
- )
- },
- gru => sub {
- my %kwargs = @_;
- AI::MXNet::Gluon::RNN::GRUCell->new(
- $self->hidden_size,
- %kwargs
- )
- }
- }->{$self->mode};
- my $stack = AI::MXNet::Gluon::RNN::SequentialRNNCell->new(prefix => $self->prefix, params => $self->params);
- $stack->name_scope(sub {
- my $ni = $self->input_size;
- for my $i (0..$self->num_layers-1)
- {
- my %kwargs = (
- input_size => $ni,
- i2h_weight_initializer => $self->i2h_weight_initializer,
- h2h_weight_initializer => $self->h2h_weight_initializer,
- i2h_bias_initializer => $self->i2h_bias_initializer,
- h2h_bias_initializer => $self->h2h_bias_initializer
- );
- if($self->dir == 2)
- {
- $stack->add(
- AI::MXNet::Gluon::RNN::BidirectionalCell->new(
- $get_cell->(prefix=> "l${i}_", %kwargs),
- $get_cell->(prefix=> "r${i}_", %kwargs),
- )
- );
- }
- else
- {
- $stack->add($get_cell->(prefix=> "l${i}_", %kwargs));
- }
- if($self->dropout > 0 and $i != ($self->num_layers - 1))
- {
- $stack->add(AI::MXNet::Gluon::RNN::DropoutCell->new($self->dropout));
- }
- $ni = $self->hidden_size * $self->dir;
- }
- });
- return $stack;
-}
-
-method begin_state(
- $batch_size=0,
- CodeRef :$func=sub { my %kwargs = @_; my $shape = delete $kwargs{shape}; AI::MXNet::NDArray->zeros($shape, %kwargs) },
- %kwargs
-)
-{
- my @states;
- enumerate(sub {
- my ($i, $info) = @_;
- if(defined $info)
- {
- %$info = (%$info, %kwargs);
- }
- else
- {
- %$info = %kwargs;
- }
- push @states, $func->(name=> $self->prefix."h0_$i", %$info);
- }, $self->state_info($batch_size));
- return \@states;
-}
-
-use Data::Dumper;
-method hybrid_forward(GluonClass $F, GluonInput $inputs, @args)
-{
- my $states;
- if(@args)
- {
- if(not defined $args[0] or ref $args[0])
- {
- $states = shift(@args);
- undef $states if(ref $states eq 'ARRAY' and not @$states);
- }
- }
-
- my $batch_size;
- if($F eq 'AI::MXNet::NDArray')
- {
- $batch_size = $inputs->shape->[index($self->layout, 'N')];
- }
- my $skip_states = not defined $states;
- if($skip_states)
- {
- if($F eq 'AI::MXNet::NDArray')
- {
- $states = $self->begin_state($batch_size, ctx=>$inputs->context, dtype=>$inputs->dtype);
- }
- else
- {
- $states = $self->begin_state(0, func=>sub { return AI::MXNet::Symbol->zeros(@_) });
- }
- }
- if(blessed $states and ($states->isa('AI::MXNet::NDArray') or $states->isa('AI::MXNet::Symbol')))
- {
- $states = [$states];
- }
- if($F eq 'AI::MXNet::NDArray')
- {
- for(zip($states, $self->state_info($batch_size)))
- {
- my ($state, $info) = @$_;
- if(Dumper($state->shape) ne Dumper($info->{shape}))
- {
- my @state_shape = @{ $state->shape };
- confess("Invalid recurrent state shape. Expecting @{$info->{shape}}, got @state_shape.");
- }
- }
- }
- my $out = $self->_forward_kernel($F, $inputs, $states, @args);
- return $skip_states ? $out->[0] : $out;
-}
-
-method _forward_kernel($F, $inputs, $states, %kwargs)
-{
- if($self->layout eq 'NTC')
- {
- $inputs = $F->swapaxes($inputs, dim1=>0, dim2=>1);
- }
- my @params;
- if(not defined $self->projection_size)
- {
- for my $t ('weight', 'bias')
- {
- for my $l (0..$self->num_layers-1)
- {
- for my $d ($self->dir == 2 ?
('l', 'r') : ('l')) - { - for my $g ('i2h', 'h2h') - { - push @params, $kwargs{"$d${l}_${g}_$t"}->reshape([-1]); - } - } - } - } - } - else - { - for my $t ('weight', 'bias') - { - for my $l (0..$self->num_layers-1) - { - for my $d ($self->dir == 2 ? ('l', 'r') : ('l')) - { - for my $g ('i2h', 'h2h', 'h2r') - { - push @params, $kwargs{"$d${l}_${g}_$t"}->reshape([-1]) - unless($g eq 'h2r' and $t eq 'bias'); - } - } - } - } - } - my $params = $F->_rnn_param_concat(@params, dim=>0); - my $rnn = $F->RNN( - $inputs, $params, @{ $states }, { state_size => $self->hidden_size, - num_layers => $self->num_layers, bidirectional => $self->dir == 2 ? 1 : 0, - p => $self->dropout, state_outputs => 1, mode => $self->mode, - (defined $self->lstm_state_clip_min ? (lstm_state_clip_min=>$self->lstm_state_clip_min) : ()), - (defined $self->lstm_state_clip_max ? (lstm_state_clip_max=>$self->lstm_state_clip_max) : ()), - (defined $self->lstm_state_clip_nan ? (lstm_state_clip_nan=>$self->lstm_state_clip_nan) : ()) - }); - my $outputs; - my @rnn = @{$rnn}; - if($self->mode eq 'lstm') - { - ($outputs, $states) = ($rnn[0], [$rnn[1], $rnn[2]]); - } - else - { - ($outputs, $states) = ($rnn[0], [$rnn[1]]); - } - if($self->layout eq 'NTC') - { - $outputs = $outputs->swapaxes(dim1 => 0, dim2 => 1); - } - return [$outputs, $states]; -} - -package AI::MXNet::Gluon::RNN::RNN; - -=head1 NAME - - AI::MXNet::Gluon::RNN::RNN -=cut - -=head1 DESCRIPTION - - Applies a multi-layer Elman RNN with `tanh` or `ReLU` non-linearity to an input sequence. - - For each element in the input sequence, each layer computes the following - function: - - .. math:: - h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh}) - - where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is the hidden - state of the previous layer at time `t` or :math:`input_t` for the first layer. - If nonlinearity='relu', then `ReLU` is used instead of `tanh`. - - Parameters - ---------- - hidden_size: int - The number of features in the hidden state h. - num_layers: int, default 1 - Number of recurrent layers. - activation: {'relu' or 'tanh'}, default 'tanh' - The activation function to use. - layout : str, default 'TNC' - The format of input and output tensors. T, N and C stand for - sequence length, batch size, and feature dimensions respectively. - dropout: float, default 0 - If non-zero, introduces a dropout layer on the outputs of each - RNN layer except the last layer. - bidirectional: bool, default False - If `True`, becomes a bidirectional RNN. - i2h_weight_initializer : str or Initializer - Initializer for the input weights matrix, used for the linear - transformation of the inputs. - h2h_weight_initializer : str or Initializer - Initializer for the recurrent weights matrix, used for the linear - transformation of the recurrent state. - i2h_bias_initializer : str or Initializer - Initializer for the bias vector. - h2h_bias_initializer : str or Initializer - Initializer for the bias vector. - input_size: int, default 0 - The number of expected features in the input x. - If not specified, it will be inferred from input. - prefix : str or None - Prefix of this `Block`. - params : ParameterDict or None - Shared Parameters for this `Block`. - - - Input shapes: - The input shape depends on `layout`. For `layout='TNC'`, the - input has shape `(sequence_length, batch_size, input_size)` - - - Output shape: - The output shape depends on `layout`. For `layout='TNC'`, the - output has shape `(sequence_length, batch_size, num_hidden)`. 
- If `bidirectional` is True, output shape will instead be
- `(sequence_length, batch_size, 2*num_hidden)`
-
- Recurrent state:
- The recurrent state is an NDArray with shape `(num_layers, batch_size, num_hidden)`.
- If `bidirectional` is True, the recurrent state shape will instead be
- `(2*num_layers, batch_size, num_hidden)`
- If input recurrent state is None, zeros are used as default begin states,
- and the output recurrent state is omitted.
-
-
- Examples
- --------
- >>> layer = mx.gluon.rnn.RNN(100, 3)
- >>> layer.initialize()
- >>> input = mx.nd.random.uniform(shape=(5, 3, 10))
- >>> # by default zeros are used as begin state
- >>> output = layer(input)
- >>> # manually specify begin state.
- >>> h0 = mx.nd.random.uniform(shape=(3, 3, 100))
- >>> output, hn = layer(input, h0)
-=cut
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::RNN::Layer';
-
-has '+num_layers' => (default => 1);
-has 'activation' => (is => 'rw', default => 'relu');
-has '+layout' => (default => 'TNC');
-has '+dropout' => (default => 0);
-has '+bidirectional' => (default => 0);
-has [qw/
- +i2h_bias_initializer
- +h2h_bias_initializer
- /] => (default => 'zeros');
-has '+mode' => (default => sub { 'rnn_' . shift->activation }, lazy => 1);
-method python_constructor_arguments()
-{
- [qw/
- hidden_size num_layers activation layout
- dropout bidirectional input_size
- i2h_weight_initializer h2h_weight_initializer
- i2h_bias_initializer h2h_bias_initializer
- /];
-}
-
-method state_info(DimSize $batch_size=0)
-{
- return [{
- shape => [$self->num_layers * $self->dir, $batch_size, $self->hidden_size],
- __layout__ => 'LNC'
- }];
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::RNN');
-
-package AI::MXNet::Gluon::RNN::LSTM;
-
-=head1 NAME
-
- AI::MXNet::Gluon::RNN::LSTM
-=cut
-
-=head1 DESCRIPTION
-
- Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence.
-
- For each element in the input sequence, each layer computes the following
- function:
-
- .. math::
- \begin{array}{ll}
- i_t = sigmoid(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
- f_t = sigmoid(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
- g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
- o_t = sigmoid(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
- c_t = f_t * c_{(t-1)} + i_t * g_t \\
- h_t = o_t * \tanh(c_t)
- \end{array}
-
- where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the
- cell state at time `t`, :math:`x_t` is the hidden state of the previous
- layer at time `t` or :math:`input_t` for the first layer, and :math:`i_t`,
- :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, and
- out gates, respectively.
-
- Parameters
- ----------
- hidden_size: int
- The number of features in the hidden state h.
- num_layers: int, default 1
- Number of recurrent layers.
- layout : str, default 'TNC'
- The format of input and output tensors. T, N and C stand for
- sequence length, batch size, and feature dimensions respectively.
- dropout: float, default 0
- If non-zero, introduces a dropout layer on the outputs of each
- RNN layer except the last layer.
- bidirectional: bool, default False
- If `True`, becomes a bidirectional RNN.
- i2h_weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- h2h_weight_initializer : str or Initializer
- Initializer for the recurrent weights matrix, used for the linear
- transformation of the recurrent state.
- i2h_bias_initializer : str or Initializer, default 'lstmbias'
- Initializer for the bias vector. By default, bias for the forget
- gate is initialized to 1 while all other biases are initialized
- to zero.
- h2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- input_size: int, default 0
- The number of expected features in the input x.
- If not specified, it will be inferred from input.
- prefix : str or None
- Prefix of this `Block`.
- params : `ParameterDict` or `None`
- Shared Parameters for this `Block`.
-
-
- Input shapes:
- The input shape depends on `layout`. For `layout='TNC'`, the
- input has shape `(sequence_length, batch_size, input_size)`
-
- Output shape:
- The output shape depends on `layout`. For `layout='TNC'`, the
- output has shape `(sequence_length, batch_size, num_hidden)`.
- If `bidirectional` is True, output shape will instead be
- `(sequence_length, batch_size, 2*num_hidden)`
-
- Recurrent state:
- The recurrent state is a list of two NDArrays. Both have shape
- `(num_layers, batch_size, num_hidden)`.
- If `bidirectional` is True, each recurrent state will instead have shape
- `(2*num_layers, batch_size, num_hidden)`.
- If input recurrent state is None, zeros are used as default begin states,
- and the output recurrent state is omitted.
-
-
- Examples
- --------
- >>> layer = mx.gluon.rnn.LSTM(100, 3)
- >>> layer.initialize()
- >>> input = mx.nd.random.uniform(shape=(5, 3, 10))
- >>> # by default zeros are used as begin state
- >>> output = layer(input)
- >>> # manually specify begin state.
- >>> h0 = mx.nd.random.uniform(shape=(3, 3, 100))
- >>> c0 = mx.nd.random.uniform(shape=(3, 3, 100))
- >>> output, hn = layer(input, [h0, c0])
-=cut
-
-use AI::MXNet::Gluon::Mouse;
-extends 'AI::MXNet::Gluon::RNN::Layer';
-
-has '+num_layers' => (default => 1);
-has '+layout' => (default => 'TNC');
-has '+dropout' => (default => 0);
-has '+bidirectional' => (default => 0);
-has [qw/
- +i2h_bias_initializer
- +h2h_bias_initializer
- /] => (default => 'zeros');
-has '+mode' => (default => 'lstm');
-
-method state_info(DimSize $batch_size=0)
-{
- return [
- {
- shape => [
- $self->num_layers * $self->dir, $batch_size,
- defined $self->projection_size ? $self->projection_size : $self->hidden_size
- ],
- __layout__ => 'LNC'
- },
- {
- shape => [$self->num_layers * $self->dir, $batch_size, $self->hidden_size],
- __layout__ => 'LNC'
- }
- ];
-}
-
-__PACKAGE__->register('AI::MXNet::Gluon::RNN');
-
-package AI::MXNet::Gluon::RNN::GRU;
-
-=head1 NAME
-
- AI::MXNet::Gluon::RNN::GRU
-=cut
-
-=head1 DESCRIPTION
-
- Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
-
- For each element in the input sequence, each layer computes the following
- function:
-
- .. math::
- \begin{array}{ll}
- r_t = sigmoid(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
- i_t = sigmoid(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
- n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
- h_t = (1 - i_t) * n_t + i_t * h_{(t-1)} \\
- \end{array}
-
- where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden
- state of the previous layer at time `t` or :math:`input_t` for the first layer,
- and :math:`r_t`, :math:`i_t`, :math:`n_t` are the reset, input, and new gates, respectively.
-
- Parameters
- ----------
- hidden_size: int
- The number of features in the hidden state h
- num_layers: int, default 1
- Number of recurrent layers.
- layout : str, default 'TNC'
- The format of input and output tensors.
T, N and C stand for - sequence length, batch size, and feature dimensions respectively. - dropout: float, default 0 - If non-zero, introduces a dropout layer on the outputs of each - RNN layer except the last layer - bidirectional: bool, default False - If True, becomes a bidirectional RNN. - i2h_weight_initializer : str or Initializer - Initializer for the input weights matrix, used for the linear - transformation of the inputs. - h2h_weight_initializer : str or Initializer - Initializer for the recurrent weights matrix, used for the linear - transformation of the recurrent state. - i2h_bias_initializer : str or Initializer - Initializer for the bias vector. - h2h_bias_initializer : str or Initializer - Initializer for the bias vector. - input_size: int, default 0 - The number of expected features in the input x. - If not specified, it will be inferred from input. - prefix : str or None - Prefix of this `Block`. - params : ParameterDict or None - Shared Parameters for this `Block`. - - - Input shapes: - The input shape depends on `layout`. For `layout='TNC'`, the - input has shape `(sequence_length, batch_size, input_size)` - - Output shape: - The output shape depends on `layout`. For `layout='TNC'`, the - output has shape `(sequence_length, batch_size, num_hidden)`. - If `bidirectional` is True, output shape will instead be - `(sequence_length, batch_size, 2*num_hidden)` - - Recurrent state: - The recurrent state is an NDArray with shape `(num_layers, batch_size, num_hidden)`. - If `bidirectional` is True, the recurrent state shape will instead be - `(2*num_layers, batch_size, num_hidden)` - If input recurrent state is None, zeros are used as default begin states, - and the output recurrent state is omitted. - - - Examples - -------- - >>> layer = mx.gluon.rnn.GRU(100, 3) - >>> layer.initialize() - >>> input = mx.nd.random.uniform(shape=(5, 3, 10)) - >>> # by default zeros are used as begin state - >>> output = layer(input) - >>> # manually specify begin state. - >>> h0 = mx.nd.random.uniform(shape=(3, 3, 100)) - >>> output, hn = layer(input, h0) -=cut - -use AI::MXNet::Gluon::Mouse; -extends 'AI::MXNet::Gluon::RNN::Layer'; - -has '+num_layers' => (default => 1); -has '+layout' => (default => 'TNC'); -has '+dropout' => (default => 0); -has '+bidirectional' => (default => 0); -has [qw/ - +i2h_bias_initializer - +h2h_bias_initializer - /] => (default => 'zeros'); -has '+mode' => (default => 'gru'); - -method state_info(DimSize $batch_size=0) -{ - return [ - { - shape => [$self->num_layers * $self->dir, $batch_size, $self->hidden_size], - __layout__ => 'LNC' - } - ]; -} - -__PACKAGE__->register('AI::MXNet::Gluon::RNN'); - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Trainer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Trainer.pm deleted file mode 100644 index 0ab484de2525..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Trainer.pm +++ /dev/null @@ -1,560 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-package AI::MXNet::Gluon::Trainer;
-use AI::MXNet::NS;
-use AI::MXNet::Base;
-use AI::MXNet::Function::Parameters;
-use IO::File;
-use Mouse;
-
-
-=head1 NAME
-
- AI::MXNet::Gluon::Trainer
-=cut
-
-=head1 DESCRIPTION
-
- Applies an `Optimizer` on a set of Parameters. Trainer should
- be used together with `autograd`.
-
- Parameters
- ----------
- params : AI::MXNet::Gluon::ParameterDict
- The set of parameters to optimize.
- optimizer : str or Optimizer
- The optimizer to use. See the AI::MXNet::Optimizer documentation
- for a list of available optimizers.
- optimizer_params : hash ref
- Keyword arguments to be passed to optimizer constructor. For example,
- {learning_rate => 0.1}. All optimizers accept learning_rate, wd (weight decay),
- clip_gradient, and lr_scheduler. See each optimizer's
- constructor for a list of additional supported arguments.
- kvstore : str or KVStore
- kvstore type for multi-gpu and distributed training. See help on
- mx->kvstore->create for more information.
- compression_params : hash ref
- Specifies type of gradient compression and additional arguments depending
- on the type of compression being used. For example, 2bit compression requires a threshold.
- Arguments would then be {type => '2bit', threshold => 0.5}
- See AI::MXNet::KVStore->set_gradient_compression method for more details on gradient compression.
- update_on_kvstore : Bool, default undef
- Whether to perform parameter updates on kvstore. If undef, then trainer will choose the more
- suitable option depending on the type of kvstore.
-
- Properties
- ----------
- learning_rate : float
- The current learning rate of the optimizer. Given an Optimizer object
- optimizer, its learning rate can be accessed as optimizer->learning_rate.
-=cut
-
-has 'params' => (is => 'rw', isa => 'HashRef|ArrayRef|AI::MXNet::Gluon::ParameterDict');
-has 'optimizer' => (is => 'ro', isa => 'Optimizer');
-has 'optimizer_params' => (is => 'ro', isa => 'Maybe[HashRef]');
-has 'compression_params' => (is => 'ro', isa => 'Maybe[HashRef]');
-has 'kvstore' => (is => 'rw', isa => 'Maybe[KVStore]', default => 'device');
-has 'update_on_kvstore' => (is => 'rw', isa => 'Maybe[Bool]');
-has [qw/_scale _contexts
- _kv_initialized
- _param2idx
- _kvstore_params
- _contains_sparse
- _params_to_init
- _updaters
- _optimizer/] => (is => 'rw', init_arg => undef);
-around BUILDARGS => \&AI::MXNet::Base::process_arguments;
-method python_constructor_arguments()
-{
- [qw/params optimizer optimizer_params kvstore compression_params update_on_kvstore/]
-}
-
-sub BUILD
-{
- my $self = shift;
- my @params;
- if(blessed $self->params)
- {
- @params = $self->params->values;
- }
- elsif(ref $self->params eq 'HASH')
- {
- @params = values %{ $self->params };
- }
- else
- {
- @params = @{ $self->params };
- }
- $self->params([]);
- $self->_contains_sparse(0);
- $self->_param2idx({});
- for(enumerate(\@params))
- {
- my ($i, $param) = @$_;
- if(not(blessed $param and $param->isa('AI::MXNet::Gluon::Parameter')))
- {
- confess(
- "First argument must be an array or hash of Parameters, ".
- "got list of [$param]."
- ); - } - $self->_param2idx->{ $param->name } = $i; - push @{ $self->params }, $param; - $param->_set_trainer($self); - if($param->stype ne 'default') - { - $self->_contains_sparse(1); - } - } - my $optimizer_params = $self->optimizer_params//{}; - $self->_scale(delete $optimizer_params->{rescale_grad}//1); - $self->_contexts($self->_check_contexts); - $self->_init_optimizer($self->optimizer, $optimizer_params); - $self->_kvstore_params({ - kvstore => $self->kvstore, - update_on_kvstore => $self->update_on_kvstore - }); - $self->_kv_initialized(0); - $self->kvstore(undef); - $self->update_on_kvstore(undef); - $self->_params_to_init([]); - $self->_reset_kvstore(); -} - -method _check_contexts() -{ - my $contexts; - for my $param (@{ $self->params }) - { - my $ctx = $param->list_ctx; - assert( - (not defined $contexts or join('', @{ $contexts }) eq join('', @{ $ctx })), - "All Parameters must be initialized on the same set of contexts, ". - "but Parameter ${\ $param->name } is initialized on @{ $ctx//[] } while previous Parameters ". - "are initialized on @{ $contexts//[] }." - ); - $contexts = $ctx; - } - return $contexts; -} - -method _init_optimizer($optimizer, $optimizer_params) -{ - my %param_dict = map { $_ => $self->params->[$_] } 0 .. @{ $self->params } - 1; - if(blessed $optimizer and $optimizer->isa('AI::MXNet::Optimizer')) - { - assert( - (not %{ $optimizer_params }), - "optimizer_params must be empty if optimizer is an instance of ". - "Optimizer instead of str" - ); - $self->_optimizer($optimizer); - $self->_optimizer->param_dict(\%param_dict); - } - else - { - $self->_optimizer( - AI::MXNet::Optimizer->create( - $optimizer, param_dict => \%param_dict, - %{ $optimizer_params } - ) - ); - } - $self->_updaters([ - map { AI::MXNet::Optimizer->get_updater($self->_optimizer) } @{ $self->_contexts } - ]); -} - -method _init_params() -{ - assert( - $self->_kv_initialized, - "Cannot initialize parameters in KVStore ". - "when KVStore is not initialized." - ); - my @params_to_init; - if($self->kvstore) - { - for my $param (@{ $self->_params_to_init }) - { - if(@{ $param->_deferred_init }) - { - push @params_to_init, $param; - } - else - { - my $param_arrays = $param->_check_and_get($param->_data, []); - my $idx = $self->_param2idx->{ $param->name }; - $self->kvstore->init($idx, $param_arrays->[0]); - if($param->stype eq 'default') - { - $self->kvstore->pull($idx, out => $param_arrays, priority=>-$idx); - } - } - } - } - $self->_params_to_init(\@params_to_init); -} - -method _reset_kvstore() -{ - if($self->kvstore and $self->kvstore->type =~ /dist/) - { - confess("Cannot reset distributed KVStore."); - } - $self->_kv_initialized(0); - $self->kvstore(undef); - $self->update_on_kvstore(undef); - $self->_params_to_init([@{ $self->params }]); -} - -method _init_kvstore() -{ - my $config = $self->_kvstore_params; - my ($kvstore, $update_on_kvstore); - if($self->_contains_sparse) - { - ($kvstore, $update_on_kvstore) = AI::MXNet::Module::_create_sparse_kvstore($config->{kvstore}); - # update_on_kvstore is set to False by the user - if(defined $config->{update_on_kvstore} and not $config->{update_on_kvstore}) - { - confess( - "Cannot set update_on_kvstore to False when sparse ". - "gradients and/or sparse weights are present." 
- ) - } - } - else - { - my %arg_arrays = map { $_->name => $_->data($self->_contexts->[0]) } @{ $self->params }; - ($kvstore, $update_on_kvstore) = AI::MXNet::Module::_create_kvstore( - $config->{kvstore}, scalar(@{$self->_contexts }), \%arg_arrays - ); - if(defined $config->{update_on_kvstore}) - { - $update_on_kvstore = $config->{update_on_kvstore}; - } - } - if($kvstore) - { - if($self->compression_params) - { - $kvstore->set_gradient_compression($self->compression_params); - } - # kv->pull(row_sparse_grad) is not supported - if($kvstore->type =~ /dist/ and not $self->_contains_sparse) - { - $update_on_kvstore = 0; - } - if($update_on_kvstore) - { - # optimizer preferably needs to be set before init for multiprecision - $kvstore->set_optimizer($self->_optimizer); - } - $self->kvstore($kvstore); - $self->update_on_kvstore($update_on_kvstore); - } - else - { - $self->kvstore(undef); - $self->update_on_kvstore(undef); - } - $self->_kv_initialized(1); -} - -# Internal method to invoke pull operations on KVStore. If $full_idx is set to 1, -# $kv->pull is preferred instead of $kv->row_sparse_pull. - -method _row_sparse_pull($parameter, $out, $row_id, $full_idx=0) -{ - # initialize kv and params if not already - $self->_init_kvstore() unless $self->_kv_initialized; - $self->_init_params() if scalar(@{ $self->_params_to_init }); - my $idx = $self->_param2idx->{ $parameter->name }; - if($full_idx and not $self->kvstore->type =~ /dist/) - { - assert($row_id->size == $out->shape->[0]); - $self->kvstore->pull($idx, out => $out, priority => -$idx, ignore_sparse => 0); - } - else - { - $self->kvstore->row_sparse_pull($idx, out => $out, row_ids => $row_id, priority => -$idx); - } -} - -=head2 step - - Makes one step of parameter update. Should be called after - `autograd->backward()` and outside of `record()` scope. - - For normal parameter updates, `step()` should be used, which internally calls - `allreduce_grads()` and then `update()`. However, if you need to get the reduced - gradients to perform certain transformation, such as in gradient clipping, then - you may want to manually call `allreduce_grads()` and `update()` separately. - - Parameters - ---------- - $batch_size : Int - Batch size of data processed. Gradient will be normalized by `1/batch_size`. - Set this to 1 if you normalized loss manually with `loss = mean(loss)`. - $ignore_stale_grad : Bool, optional, default=False - If true, ignores Parameters with stale gradient (gradient that has not - been updated by `backward` after last step) and skip update. -=cut - -method step(Int $batch_size, Bool $ignore_stale_grad=0) -{ - $self->_init_kvstore() unless $self->_kv_initialized; - $self->_init_params() if scalar(@{ $self->_params_to_init }); - $self->_optimizer->rescale_grad($self->_scale/$batch_size); - $self->_allreduce_grads(); - $self->_update($ignore_stale_grad); -} - -=head2 allreduce_grads - - For each parameter, reduce the gradients from different contexts. - - Should be called after `autograd.backward()`, outside of `record()` scope, - and before `trainer.update()`. - - For normal parameter updates, `step()` should be used, which internally calls - `allreduce_grads()` and then `update()`. However, if you need to get the reduced - gradients to perform certain transformation, such as in gradient clipping, then - you may want to manually call `allreduce_grads()` and `update()` separately. 
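 For context, a hedged sketch of the intended call order for the trainer removed here.
 `$net`, `$loss_fn`, and `@batches` are illustrative placeholders, not part of this diff:

    my $trainer = AI::MXNet::Gluon::Trainer->new(
        params           => $net->collect_params(),
        optimizer        => 'sgd',
        optimizer_params => { learning_rate => 0.1 },
    );
    for my $batch (@batches)
    {
        my ($data, $label) = @$batch;
        my $loss;
        mx->autograd->record(sub {
            $loss = $loss_fn->($net->($data), $label);
        });
        $loss->backward();
        # step() = allreduce_grads() + update(), normalized by batch size
        $trainer->step($data->shape->[0]);
    }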
-=cut
-
-method allreduce_grads()
-{
- $self->_init_kvstore() unless $self->_kv_initialized;
- $self->_init_params() if scalar(@{ $self->_params_to_init });
- assert(
- (not ($self->kvstore and $self->update_on_kvstore)),
- 'allreduce_grads() when parameters are updated on kvstore '.
- 'is not supported. Try setting `update_on_kvstore` '.
- 'to False when creating trainer.'
- );
- $self->_allreduce_grads();
-}
-
-method _allreduce_grads()
-{
- if($self->kvstore)
- {
- for(enumerate($self->params))
- {
- my ($i, $param) = @$_;
- if($param->grad_req ne 'null')
- {
- $self->kvstore->push($i, $param->list_grad(), priority=>-$i);
- if(not $self->update_on_kvstore)
- {
- $self->kvstore->pull($i, out => $param->list_grad(), priority=>-$i);
- }
- }
- }
- }
-}
-
-method learning_rate(Maybe [Num] $lr)
-{
- if(not blessed $self->_optimizer)
- {
- AI::MXNet::Logging->warning(
- "Optimizer has to be defined before its learning ".
- "rate can be accessed."
- );
- return;
- }
- else
- {
- if(defined $lr)
- {
- $self->_optimizer->lr($lr);
- }
- return $self->_optimizer->lr;
- }
-}
-
-=head2 set_learning_rate
-
- Sets a new learning rate of the optimizer.
-
- Parameters
- ----------
- lr : float
- The new learning rate of the optimizer.
-=cut
-
-method set_learning_rate(Num $lr)
-{
- $self->learning_rate($lr);
-}
-
-=head2 update
-
- Makes one step of parameter update.
-
- Should be called after autograd->backward() and outside of record() scope,
- and after trainer->allreduce_grads().
-
- For normal parameter updates, step() should be used, which internally calls
- allreduce_grads() and then update(). However, if you need to get the reduced
- gradients to perform certain transformation, such as in gradient clipping, then
- you may want to manually call allreduce_grads() and update() separately.
-
- Parameters
- ----------
- $batch_size : Int
- Batch size of data processed. Gradient will be normalized by `1/$batch_size`.
- Set this to 1 if you normalized loss manually with $loss = mean($loss).
- $ignore_stale_grad : Bool, optional, default=False
- If true, ignores Parameters with stale gradient (gradient that has not
- been updated by backward() after last step) and skip update.
-=cut
-
-method update(Int $batch_size, Bool $ignore_stale_grad=0)
-{
- $self->_init_kvstore() unless $self->_kv_initialized;
- $self->_init_params() if scalar(@{ $self->_params_to_init });
- assert(
- (not ($self->kvstore and $self->update_on_kvstore)),
- 'update() when parameters are updated on kvstore '.
- 'is not supported. Try setting `update_on_kvstore` '.
- 'to False when creating trainer.'
- );
- $self->_optimizer->rescale_grad($self->_scale/$batch_size);
- $self->_update($ignore_stale_grad);
-}
-
-method _update(Bool $ignore_stale_grad=0)
-{
- for(enumerate($self->params))
- {
- my ($i, $param) = @$_;
- next if($param->grad_req eq 'null');
-
- if(not $ignore_stale_grad)
- {
- for my $data (@{ $param->_check_and_get($param->_data, []) })
- {
- if(not $data->_fresh_grad)
- {
- AI::MXNet::Logging->warning(
- "Gradient of Parameter '%s' on context %s has not been updated ".
- "by backward since last `step`. This could mean a bug in your ".
- "model that made it only use a subset of the Parameters (Blocks) ".
- "for this iteration. If you are intentionally only using a subset, ".
- "call step with ignore_stale_grad=True to suppress this ".
- "warning and skip updating of Parameters with stale gradient",
- $param->name, $data->context
- );
- }
- }
- }
- if($self->kvstore and $self->update_on_kvstore)
- {
- if($param->stype eq 'default')
- {
- # 'row_sparse' parameters are not pulled immediately - they're pulled
- # in `SparseBlock.sparse_forward`
- $self->kvstore->pull($i, out => $param->list_data(), priority=>-$i);
- }
- next;
- }
-
- for(zip($self->_updaters, $param->list_data(), $param->list_grad()))
- {
- my ($upd, $arr, $grad) = @$_;
- if(not $ignore_stale_grad or $arr->_fresh_grad)
- {
- $upd->($i, $grad, $arr);
- $arr->_fresh_grad(0);
- }
- }
- }
-}
-
-=head2 save_states
-
- Saves trainer states (e.g. optimizer, momentum) to a file.
-
- Parameters
- ----------
- fname : str
- Path to output states file.
-=cut
-
-method save_states(Str $fname)
-{
- assert(defined $self->_optimizer);
- $self->_init_kvstore() unless $self->_kv_initialized;
- $self->_init_params() if scalar(@{ $self->_params_to_init });
-
- if($self->update_on_kvstore)
- {
- $self->kvstore->save_optimizer_states($fname, dump_optimizer=>1);
- }
- else
- {
- open(F, ">$fname") or Carp::confess("cannot open $fname: $!");
- print F $self->_updaters->[0]->get_states(dump_optimizer => 1);
- close(F);
- }
-}
-
-=head2 load_states
-
- Loads trainer states (e.g. optimizer, momentum) from a file.
-
- Parameters
- ----------
- fname : str
- Path to input states file.
-=cut
-
-method load_states(Str $fname)
-{
- $self->_init_kvstore() unless $self->_kv_initialized;
- $self->_init_params() if scalar(@{ $self->_params_to_init });
-
- if($self->update_on_kvstore)
- {
- $self->kvstore->load_optimizer_states($fname);
- $self->_optimizer($self->kvstore->_updater->optimizer);
- $self->_optimizer->param_dict({ map { $_->[0] => $_->[1] } enumerate($self->params) });
- }
- else
- {
- my $states = join('', IO::File->new($fname)->getlines);
- for my $updater (@{ $self->_updaters })
- {
- $updater->set_states($states);
- $updater->optimizer($self->_updaters->[0]->optimizer);
- }
- $self->_optimizer($self->_updaters->[0]->optimizer);
- }
-}
-
-__PACKAGE__->AI::MXNet::NS::register('AI::MXNet::Gluon');
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Utils.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Utils.pm
deleted file mode 100644
index 66d8acc6c5b1..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Utils.pm
+++ /dev/null
@@ -1,315 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::Gluon::Utils;
-use strict;
-use warnings;
-use AI::MXNet::Base;
-use AI::MXNet::Function::Parameters;
-use Digest::SHA qw(sha1_hex);
-use File::Path qw(make_path);
-use HTTP::Tiny;
-use Exporter;
-use base qw(Exporter);
-our @EXPORT_OK = qw(download check_sha1);
-
-=head1 NAME
-
- AI::MXNet::Gluon::Utils
-=cut
-
-=head1 DESCRIPTION
-
- Miscellaneous utilities.
-=cut
-
-=head2 split_data
-
- Splits an NDArray into `num_slice` slices along `batch_axis`.
- Usually used for data parallelism where each slice is sent
- to one device (i.e. GPU).
-
- Parameters
- ----------
- $data : NDArray
- A batch of data.
- $num_slice : int
- Number of desired slices.
- $batch_axis=0 : int, default 0
- The axis along which to slice.
- :$even_split=1 : bool, default True
- Whether to force all slices to have the same number of elements.
- If `True`, an error will be raised when `num_slice` does not evenly
- divide `data.shape[batch_axis]`.
-
- Returns
- -------
- array ref of NDArray
- Return value is an array ref even if `num_slice` is 1.
-=cut
-
-
-method split_data(AI::MXNet::NDArray $data, Int $num_slice, Int $batch_axis=0, Bool :$even_split=1)
-{
- my $size = $data->shape->[$batch_axis];
- if($size < $num_slice)
- {
- Carp::confess(
- sprintf(
- "Too many slices for data with shape (%s). Arguments are ".
- "num_slice=%d and batch_axis=%d.",
- join(',', @{ $data->shape }), $num_slice, $batch_axis
- )
- );
- }
- if($even_split and $size % $num_slice != 0)
- {
- Carp::confess(
- sprintf(
- "data with shape %s cannot be evenly split into %d slices along axis %d. ".
- "Use a batch size that's multiple of %d or set even_split=False to allow ".
- "uneven partitioning of data.",
- join(',', @{ $data->shape }), $num_slice, $batch_axis, $num_slice
- )
- );
- }
- my $step = int($size/$num_slice);
- my $slices = [];
- if($batch_axis == 0)
- {
- for my $i (0 .. $num_slice-1)
- {
- if($i < $num_slice-1)
- {
- push @$slices, $data->slice([$i*$step, ($i+1)*$step-1]);
- }
- else
- {
- push @$slices, $data->slice([$i*$step, $size-1]);
- }
- }
- }
- elsif($even_split)
- {
- $slices = AI::MXNet::NDArray->split($data, num_outputs => $num_slice, axis => $batch_axis);
- }
- else
- {
- for my $i (0 .. $num_slice-1)
- {
- if($i < $num_slice-1)
- {
- push @$slices, $data->slice_axis($batch_axis, $i*$step, ($i+1)*$step);
- }
- else
- {
- push @$slices, $data->slice_axis($batch_axis, $i*$step, $size);
- }
- }
- }
- return $slices;
-}
-
-=head2 split_and_load
-
- Splits an NDArray into `len(ctx_list)` slices along `batch_axis` and loads
- each slice to one context in `ctx_list`.
-
- Parameters
- ----------
- $data : AcceptableInput
- A batch of data.
- :$ctx_list : list of Context
- A list of Contexts.
- :$batch_axis : int, default 0
- The axis along which to slice.
- :$even_split : bool, default True
- Whether to force all slices to have the same number of elements.
-
- Returns
- -------
- list of NDArray
- Each corresponds to a context in `ctx_list`.
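 For context, a hedged sketch of slicing one batch across devices with this helper; the contexts,
 the batch shape, and the `mx` alias are illustrative assumptions:

    my @ctx    = (mx->gpu(0), mx->gpu(1));
    my $batch  = mx->nd->ones([8, 3, 224, 224]);
    # two slices of four samples each, one per context, along batch axis 0
    my $slices = AI::MXNet::Gluon::Utils->split_and_load(
        $batch, ctx_list => \@ctx, batch_axis => 0
    );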
-=cut
-
-method split_and_load(
-    PDL|PDL::Matrix|ArrayRef|AI::MXNet::NDArray $data,
-    ArrayRef[AI::MXNet::Context] :$ctx_list,
-    Int :$batch_axis=0,
-    Bool :$even_split=1
-)
-{
-    if(not (blessed $data and $data->isa('AI::MXNet::NDArray')))
-    {
-        $data = AI::MXNet::NDArray->array($data, ctx => $ctx_list->[0]);
-    }
-    if(@{ $ctx_list } == 1)
-    {
-        return [$data->as_in_context($ctx_list->[0])];
-    }
-    my $slices = __PACKAGE__->split_data($data, scalar(@$ctx_list), $batch_axis, even_split => $even_split);
-    my @ret;
-    for(zip($slices, $ctx_list)) {
-        my ($i, $ctx) = @$_;
-        push @ret, $i->as_in_context($ctx);
-    }
-    return \@ret;
-}
-
-=head2 clip_global_norm
-
-    Rescales NDArrays so that the sum of their 2-norm is smaller than `max_norm`.
-=cut
-
-method clip_global_norm(ArrayRef[AI::MXNet::NDArray] $arrays, Num $max_norm)
-{
-    my $_norm = sub { my ($array) = @_;
-        if($array->stype eq 'default')
-        {
-            my $x = $array->reshape([-1]);
-            return AI::MXNet::NDArray->dot($x, $x);
-        }
-        return $array->norm->square;
-    };
-    assert(@$arrays > 0);
-    my $ctx = $arrays->[0]->context;
-    my $total_norm = AI::MXNet::NDArray->add_n(map { $_norm->($_)->as_in_context($ctx) } @$arrays);
-    $total_norm = $total_norm->sqrt->asscalar;
-    if(lc($total_norm) eq 'nan' or $total_norm =~ /inf/i)
-    {
-        AI::MXNet::Logging->warning('nan or inf is detected. Clipping results will be undefined.');
-    }
-    my $scale = $max_norm / ($total_norm + 1e-8);
-    if($scale < 1.0)
-    {
-        for my $arr (@$arrays)
-        {
-            $arr *= $scale;
-        }
-    }
-    return $total_norm;
-}
-
-=head2 check_sha1
-
-    Check whether the sha1 hash of the file content matches the expected hash.
-
-    Parameters
-    ----------
-    filename : str
-        Path to the file.
-    sha1_hash : str
-        Expected sha1 hash in hexadecimal digits.
-
-    Returns
-    -------
-    bool
-        Whether the file content matches the expected hash.
-=cut
-
-func check_sha1(Str $filename, Str $sha1_hash)
-{
-    local($/) = undef;
-    open(F, $filename) or Carp::confess("can't open $filename $!");
-    my $data = <F>;
-    close(F);
-    return sha1_hex($data) eq $sha1_hash;
-}
-
-=head2 download
-
-    Download a given URL.
-
-    Parameters
-    ----------
-    $url : str
-        URL to download.
-    :$path : str, optional
-        Destination path to store downloaded file. By default stores to the
-        current directory with the same name as in the url.
-    :$overwrite : bool, optional
-        Whether to overwrite the destination file if it already exists.
-    :$sha1_hash : str, optional
-        Expected sha1 hash in hexadecimal digits. Will ignore an existing file
-        when the hash is specified but doesn't match.
-
-    Returns
-    -------
-    str
-        The file path of the downloaded file.
-=cut
-
-func download(Str $url, Maybe[Str] :$path=, Bool :$overwrite=0, Maybe[Str] :$sha1_hash=)
-{
-    my $fname;
-    $path =~ s/~/$ENV{HOME}/ if defined $path;
-    if(not defined $path)
-    {
-        $fname = (split(m[/], $url))[-1];
-    }
-    elsif(-d $path)
-    {
-        $fname = join('/', $path, (split(m[/], $url))[-1]);
-    }
-    else
-    {
-        $fname = $path;
-    }
-    if($overwrite or not -f $fname or ($sha1_hash and not check_sha1($fname, $sha1_hash)))
-    {
-        $fname =~ s/~/$ENV{HOME}/;
-        my $dirname = $fname;
-        $dirname =~ s/[^\/]+$//;
-        if(not -d $dirname)
-        {
-            make_path($dirname);
-        }
-        warn "Downloading $fname from $url ...\n";
-        my $response = HTTP::Tiny->new->get($url);
-        Carp::confess("download of $url failed! ($response->{status} $response->{reason})\n")
-            unless $response->{success};
-        open(F, ">$fname") or Carp::confess("can't open $fname: $!");
-        print F $response->{content};
-        close(F);
-    }
-    return $fname;
-}
-
-package AI::MXNet::Gluon::Utils::HookHandle;
-use Mouse;
-use AI::MXNet::Base;
-use Scalar::Util qw(refaddr);
-has [qw/_hooks_dict_ref/] => (is => 'rw', init_arg => undef, weak_ref => 1);
-has [qw/_id/]             => (is => 'rw', init_arg => undef);
-
-method attach(Hash::Ordered $hooks_dict, $hook)
-{
-    assert((not $self->_hooks_dict_ref), 'The same handle cannot be attached twice.');
-    $self->_id(refaddr($hook));
-    $hooks_dict->set($self->_id, $hook);
-    $self->_hooks_dict_ref($hooks_dict);
-}
-
-method detach()
-{
-    my $hooks_dict = $self->_hooks_dict_ref;
-    if($hooks_dict and $hooks_dict->exists($self->_id))
-    {
-        $hooks_dict->delete($self->_id);
-    }
-}
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/IO.pm b/perl-package/AI-MXNet/lib/AI/MXNet/IO.pm
deleted file mode 100644
index 4f35518bb9a8..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/IO.pm
+++ /dev/null
@@ -1,856 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::IO;
-use strict;
-use warnings;
-use AI::MXNet::NS;
-use AI::MXNet::Base;
-use AI::MXNet::Function::Parameters;
-use Scalar::Util qw/blessed/;
-
-=head1 NAME
-
-    AI::MXNet::IO - Data loading interface of MXNet
-=cut
-
-=head1 DESCRIPTION
-
-    This document summarizes supported data formats and iterator APIs to read the data, including:
-    mx->io        Data iterators for common data formats.
-    mx->recordio  Data iterators for the RecordIO data format.
-    mx->image     Image iterators and image augmentation functions.
-=cut
-
-# Convert data into canonical form.
-method init_data(
-    Maybe[AcceptableInput|HashRef[AcceptableInput]|ArrayRef[AcceptableInput]|Hash::Ordered] $data,
-    Undef|Int :$allow_empty=,
-    Str :$default_name
-)
-{
-    Carp::confess("data must be defined or allow_empty set to true value")
-        if(not defined $data and not $allow_empty);
-    $data //= [];
-    if(blessed $data and not $data->isa('Hash::Ordered'))
-    {
-        $data = [$data];
-    }
-
-    Carp::confess("data must not be empty or allow_empty set to true value")
-        if(ref($data) eq 'ARRAY' and not @{ $data } and not $allow_empty);
-
-    my @ret;
-    if(ref($data) eq 'ARRAY')
-    {
-        if(@{ $data } == 1)
-        {
-            @ret = ([$default_name, $data->[0]]);
-        }
-        else
-        {
-            my $i = -1;
-            @ret = map { $i++; ["_${i}_$default_name", $_] } @{ $data };
-        }
-    }
-    elsif(ref($data) eq 'HASH')
-    {
-        AI::MXNet::Logging->warning(
-            "Use of a raw perl hash as input is obsolete and the behaviour of the iterator is undefined.\n".
-            "Please use a Hash::Ordered object instead."
-        );
-        while(my ($k, $v) = each %{ $data })
-        {
-            push @ret, [$k, $v];
-        }
-    }
-    elsif(blessed $data and $data->isa('Hash::Ordered'))
-    {
-        for my $k ($data->keys)
-        {
-            push @ret, [$k, $data->get($k)];
-        }
-    }
-    for my $d (@ret)
-    {
-        if(not (blessed $d->[1] and $d->[1]->isa('AI::MXNet::NDArray')))
-        {
-            $d->[1] = AI::MXNet::NDArray->array($d->[1]);
-        }
-    }
-    return \@ret;
-}
-
-method DataDesc(@args)  { AI::MXNet::DataDesc->new(@args) }
-method DataBatch(@args) { AI::MXNet::DataBatch->new(@args) }
-
-package AI::MXNet::DataDesc;
-use Mouse;
-use overload '""'  => \&stringify,
-             '@{}' => \&to_nameshape;
-has 'name'   => (is => 'ro', isa => "Str",   required => 1);
-has 'shape'  => (is => 'ro', isa => "Shape", required => 1);
-has 'dtype'  => (is => 'ro', isa => "Dtype", default => 'float32');
-has 'layout' => (is => 'ro', isa => "Str",   default => 'NCHW');
-
-around BUILDARGS => sub {
-    my $orig  = shift;
-    my $class = shift;
-    if(@_ >= 2 and ref $_[1] eq 'ARRAY')
-    {
-        my $name  = shift;
-        my $shape = shift;
-        return $class->$orig(name => $name, shape => $shape, @_);
-    }
-    return $class->$orig(@_);
-};
-
-method stringify($other=, $reverse=)
-{
-    sprintf(
-        "DataDesc[%s,%s,%s,%s]",
-        $self->name,
-        join('x', @{ $self->shape }),
-        $self->dtype,
-        $self->layout
-    );
-}
-
-method to_nameshape($other=, $reverse=)
-{
-    [$self->name, $self->shape];
-}
-
-=head1 NAME
-
-    AI::MXNet::DataDesc - A container class for describing the data layout.
-=cut
-
-=head2 get_batch_axis
-
-    Get the dimension that corresponds to the batch size.
-
-    Parameters
-    ----------
-    layout : str
-        layout string. For example, "NCHW".
-
-    Returns
-    -------
-    An axis indicating the batch_size dimension. When data-parallelism is
-    used, the data will be automatically split and concatenated along the batch_size
-    dimension. Axis can be -1, which means the whole array will be copied for each
-    data-parallelism device.
-=cut
-
-method get_batch_axis(Str|Undef $layout)
-{
-    return 0 unless defined $layout;
-    return index($layout, 'N');
-}
-
-=head2 get_list
-
-    Converts the input to an array ref of AI::MXNet::DataDesc objects.
-
-    Parameters
-    ----------
-    $shapes : HashRef[Shape]
-    $types= : Maybe[HashRef[Dtype]]
-=cut
-
-method get_list(HashRef[Shape] $shapes, Maybe[HashRef[Dtype]] $types=)
-{
-    $types //= {};
-    return [
-        map {
-            AI::MXNet::DataDesc->new(
-                name  => $_,
-                shape => $shapes->{$_},
-                (exists $types->{$_} ? (dtype => $types->{$_}) : ())
-            )
-        } keys %{ $shapes }
-    ];
-}
-
-package AI::MXNet::DataBatch;
-use Mouse;
-
-=head1 NAME
-
-    AI::MXNet::DataBatch - A container for a mini-batch of the data and related information.
-=cut
-
-=head1 DESCRIPTION
-
-    Default object for holding a mini-batch of data and related information.
-=cut
-
-has 'data'          => (is => 'rw', isa => 'Maybe[ArrayRef[AI::MXNet::NDArray]]', required => 1);
-has 'label'         => (is => 'rw', isa => 'Maybe[ArrayRef[AI::MXNet::NDArray]]');
-has 'pad'           => (is => 'rw');
-has 'index'         => (is => 'rw');
-has 'bucket_key'    => (is => 'rw');
-has 'provide_data'  => (is => 'rw');
-has 'provide_label' => (is => 'rw');
-
-package AI::MXNet::DataIter;
-use Mouse;
-use overload '<>'  => sub { shift->next },
-             '@{}' => sub { shift->list };
-
-=head1 NAME
-
-    AI::MXNet::DataIter - A parent class for MXNet data iterators.
-=cut
-
-has 'batch_size' => (is => 'rw', isa => 'Int', default => 0);
-
-=head2 reset
-
-    Reset the iterator.
-=cut
-
-method reset(){}
-
-=head2 list
-
-    Returns remaining iterator items as an array ref.
-=cut
-
-method list()
-{
-    my @ret;
-    while(my $data = <$self>)
-    {
-        $data->label([map { $_->copy } @{ $data->label }]);
-        $data->data([map { $_->copy } @{ $data->data }]);
-        push @ret, $data;
-    }
-    return \@ret;
-}
-
-=head2 next
-
-    Returns the next data batch from the iterator.
-
-    Returns
-    -------
-    $data : AI::MXNet::DataBatch
-        The data of next batch.
-=cut
-
-method next()
-{
-    if($self->iter_next())
-    {
-        return AI::MXNet::DataBatch->new(
-            data  => $self->getdata,
-            label => $self->getlabel,
-            pad   => $self->getpad,
-            index => $self->getindex
-        );
-    }
-    else
-    {
-        return undef;
-    }
-}
-
-=head2 iter_next
-
-    Iterate to next batch.
-
-    Returns
-    -------
-    $has_next : Bool
-=cut
-
-method iter_next(){}
-
-=head2 getdata
-
-    The data of the current batch.
-
-    Returns
-    -------
-    data : AI::MXNet::NDArray
-=cut
-
-method getdata(){}
-
-=head2 getlabel
-
-    The label of the current batch.
-
-    Returns
-    -------
-    label : AI::MXNet::NDArray
-=cut
-
-method getlabel(){}
-
-=head2 getindex
-
-    The index of the current batch.
-
-    Returns
-    -------
-    $index : PDL
-=cut
-
-method getindex(){}
-
-=head2 getpad
-
-    The number of padding examples in the current batch.
-
-    Returns
-    -------
-    $pad : Int
-=cut
-
-method getpad(){}
-
-package AI::MXNet::ResizeIter;
-use Mouse;
-
-extends 'AI::MXNet::DataIter';
-
-=head1 NAME
-
-    AI::MXNet::ResizeIter
-=cut
-
-=head1 DESCRIPTION
-
-    Resize a DataIter to a given number of batches per epoch.
-    May produce an incomplete batch in the middle of an epoch due
-    to the padding from the internal iterator.
-
-    Parameters
-    ----------
-    data_iter : DataIter
-        Internal data iterator.
-    size : number of batches per epoch to resize to.
-    reset_internal : whether to reset the internal iterator on ResizeIter.reset
-=cut
-
-has 'data_iter'      => (is => 'ro', isa => 'AI::MXNet::DataIter', required => 1);
-has 'size'           => (is => 'ro', isa => 'Int', required => 1);
-has 'reset_internal' => (is => 'rw', isa => 'Int', default => 1);
-has 'cur'            => (is => 'rw', isa => 'Int', default => 0);
-has 'current_batch'  => (is => 'rw', isa => 'Maybe[AI::MXNet::DataBatch]');
-has [qw/provide_data
-        default_bucket_key
-        provide_label
-        batch_size/] => (is => 'rw', init_arg => undef);
-
-sub BUILD
-{
-    my $self = shift;
-    $self->provide_data($self->data_iter->provide_data);
-    $self->provide_label($self->data_iter->provide_label);
-    $self->batch_size($self->data_iter->batch_size);
-    if($self->data_iter->can('default_bucket_key'))
-    {
-        $self->default_bucket_key($self->data_iter->default_bucket_key);
-    }
-}
-
-method reset()
-{
-    $self->cur(0);
-    if($self->reset_internal)
-    {
-        $self->data_iter->reset;
-    }
-}
-
-method iter_next()
-{
-    return 0 if($self->cur == $self->size);
-    $self->current_batch($self->data_iter->next);
-    if(not defined $self->current_batch)
-    {
-        $self->data_iter->reset;
-        $self->current_batch($self->data_iter->next);
-    }
-    $self->cur($self->cur + 1);
-    return 1;
-}
-
-method getdata()
-{
-    return $self->current_batch->data;
-}
-
-method getlabel()
-{
-    return $self->current_batch->label;
-}
-
-method getindex()
-{
-    return $self->current_batch->index;
-}
-
-method getpad()
-{
-    return $self->current_batch->pad;
-}
-
-package AI::MXNet::NDArrayIter;
-use Mouse;
-use AI::MXNet::Base;
-use List::Util;
-extends 'AI::MXNet::DataIter';
-
-=head1 NAME
-
-    AI::MXNet::NDArrayIter - Predefined NDArray iterator.
-=cut
-
-=head1 DESCRIPTION
-
-    Predefined NDArray iterator. Accepts PDL or AI::MXNet::NDArray object as an input.
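-
-    A minimal usage sketch (illustrative only; shapes and names are made up):
-
-        use AI::MXNet qw(mx);
-        my $iter = mx->io->NDArrayIter(
-            data       => mx->nd->ones([100, 10]),
-            label      => mx->nd->zeros([100]),
-            batch_size => 25,
-            shuffle    => 1
-        );
-        while(my $batch = <$iter>)
-        {
-            # $batch->data->[0] has shape [25, 10]
-        }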
-
-    Parameters
-    ----------
-    data: Maybe[AcceptableInput|HashRef[AcceptableInput]|ArrayRef[AcceptableInput]].
-        NDArrayIter supports single or multiple data and label.
-    label: Maybe[AcceptableInput|HashRef[AcceptableInput]|ArrayRef[AcceptableInput]].
-        Same as data, but is not given to the model during testing.
-    batch_size=1: Int
-        Batch size.
-    shuffle=0: Bool
-        Whether to shuffle the data.
-    last_batch_handle='pad': 'pad', 'discard' or 'roll_over'
-        How to handle the last batch.
-
-    Note
-    ----
-    This iterator will pad, discard or roll over the last batch if
-    the size of data does not match batch_size. Roll over is intended
-    for training and can cause problems if used for prediction.
-=cut
-
-has 'data'              => (is => 'rw', isa => 'Maybe[AcceptableInput|HashRef[AcceptableInput]|ArrayRef[AcceptableInput]|Hash::Ordered]');
-has 'data_list'         => (is => 'rw', isa => 'ArrayRef[AI::MXNet::NDArray]');
-has 'label'             => (is => 'rw', isa => 'Maybe[AcceptableInput|HashRef[AcceptableInput]|ArrayRef[AcceptableInput]|Hash::Ordered]');
-has 'batch_size'        => (is => 'rw', isa => 'Int', default => 1);
-has 'shuffle'           => (is => 'rw', isa => 'Bool', default => 0);
-has 'last_batch_handle' => (is => 'rw', isa => 'Str', default => 'pad');
-has 'label_name'        => (is => 'rw', isa => 'Str', default => 'softmax_label');
-has 'num_source'        => (is => 'rw', isa => 'Int');
-has 'cursor'            => (is => 'rw', isa => 'Int');
-has 'num_data'          => (is => 'rw', isa => 'Int');
-
-around BUILDARGS => \&AI::MXNet::Base::process_arguments;
-method python_constructor_arguments() { ['data', 'label'] };
-
-sub BUILD
-{
-    my $self  = shift;
-    my $data  = AI::MXNet::IO->init_data($self->data,  allow_empty => 0, default_name => 'data');
-    my $label = AI::MXNet::IO->init_data($self->label, allow_empty => 1, default_name => $self->label_name);
-    if(
-        (
-            (blessed $data and $data->isa('AI::MXNet::NDArray::CSR'))
-                or
-            (blessed $label and $label->isa('AI::MXNet::NDArray::CSR'))
-        )
-            and
-        ($self->last_batch_handle ne 'discard')
-    )
-    {
-        confess("`NDArrayIter` only supports AI::MXNet::NDArray::CSR with `last_batch_handle` set to discard.");
-    }
-    my $num_data = $data->[0][1]->shape->[0];
-    confess("size of data dimension 0 $num_data < batch_size ${\ $self->batch_size }")
-        unless($num_data >= $self->batch_size);
-    if($self->shuffle)
-    {
-        my @idx = List::Util::shuffle(0..$num_data-1);
-        $_->[1] = AI::MXNet::NDArray->array(
-            pdl_shuffle($_->[1]->stype eq 'csr' ? $_->[1]->aspdlccs : $_->[1]->aspdl, \@idx),
-            ctx => $_->[1]->context
-        ) for (@$data, @$label);
-    }
-    if($self->last_batch_handle eq 'discard')
-    {
-        my $new_n = $num_data - $num_data % $self->batch_size - 1;
-        $_->[1] = $_->[1]->slice([0, $new_n]) for @$data;
-        $_->[1] = $_->[1]->slice([0, $new_n]) for @$label;
-        $num_data = $new_n + 1;
-    }
-    my $data_list  = [map { $_->[1] } (@{ $data }, @{ $label })];
-    my $num_source = @{ $data_list };
-    my $cursor = -$self->batch_size;
-    $self->data($data);
-    $self->data_list($data_list);
-    $self->label($label);
-    $self->num_source($num_source);
-    $self->cursor($cursor);
-    $self->num_data($num_data);
-}
-
-# The name and shape of data provided by this iterator
-method provide_data()
-{
-    return [map {
-        my ($k, $v) = @{ $_ };
-        my $shape = $v->shape;
-        $shape->[0] = $self->batch_size;
-        AI::MXNet::DataDesc->new(name => $k, shape => $shape, dtype => $v->dtype)
-    } @{ $self->data }];
-}
-
-# The name and shape of label provided by this iterator
-method provide_label()
-{
-    return [map {
-        my ($k, $v) = @{ $_ };
-        my $shape = $v->shape;
-        $shape->[0] = $self->batch_size;
-        AI::MXNet::DataDesc->new(name => $k, shape => $shape, dtype => $v->dtype)
-    } @{ $self->label }];
-}
-
-# Ignore roll over data and set to start
-method hard_reset()
-{
-    $self->cursor(-$self->batch_size);
-}
-
-method reset()
-{
-    if($self->last_batch_handle eq 'roll_over' and $self->cursor > $self->num_data)
-    {
-        $self->cursor(-$self->batch_size + ($self->cursor%$self->num_data)%$self->batch_size);
-    }
-    else
-    {
-        $self->cursor(-$self->batch_size);
-    }
-}
-
-method iter_next()
-{
-    $self->cursor($self->batch_size + $self->cursor);
-    return $self->cursor < $self->num_data;
-}
-
-method next()
-{
-    if($self->iter_next)
-    {
-        return AI::MXNet::DataBatch->new(
-            data  => $self->getdata,
-            label => $self->getlabel,
-            pad   => $self->getpad,
-            index => undef
-        );
-    }
-    else
-    {
-        return undef;
-    }
-}
-
-# Load data from underlying arrays, internal use only
-method _getdata($data_source)
-{
-    confess("DataIter needs reset.") unless $self->cursor < $self->num_data;
-    if(($self->cursor + $self->batch_size) <= $self->num_data)
-    {
-        return [
-            map {
-                $_->[1]->slice([$self->cursor,$self->cursor+$self->batch_size-1])
-            } @{ $data_source }
-        ];
-    }
-    else
-    {
-        my $pad = $self->batch_size - $self->num_data + $self->cursor - 1;
-        return [
-            map {
-                AI::MXNet::NDArray->concatenate(
-                    [
-                        $_->[1]->slice([$self->cursor, -1]),
-                        $_->[1]->slice([0, $pad])
-                    ]
-                )
-            } @{ $data_source }
-        ];
-    }
-}
-
-method getdata()
-{
-    return $self->_getdata($self->data);
-}
-
-method getlabel()
-{
-    return $self->_getdata($self->label);
-}
-
-method getpad()
-{
-    if( $self->last_batch_handle eq 'pad'
-            and
-        ($self->cursor + $self->batch_size) > $self->num_data
-    )
-    {
-        return $self->cursor + $self->batch_size - $self->num_data;
-    }
-    else
-    {
-        return 0;
-    }
-}
-
-package AI::MXNet::MXDataIter;
-use Mouse;
-use AI::MXNet::Base;
-
-extends 'AI::MXNet::DataIter';
-
-=head1 NAME
-
-    AI::MXNet::MXDataIter - A data iterator pre-built in the C++ layer of MXNet.
-=cut
-
-=head1 DESCRIPTION
-
-    Here is the list of currently available predefined iterators; for more custom iterators
-    please check out the examples directory and the online MXNet IO documentation.
-    mx->io->CSVIter                  Returns the CSV file iterator.
-    mx->io->LibSVMIter               Returns the LibSVM iterator which returns data with csr storage type.
-    mx->io->ImageRecordIter          Iterates on image RecordIO files.
-    mx->io->ImageRecordInt8Iter      Iterates on image RecordIO files.
-    mx->io->ImageRecordUInt8Iter     Iterates on image RecordIO files.
-    mx->io->MNISTIter                Iterates on the MNIST dataset.
-    mx->recordio->MXRecordIO         Reads/writes RecordIO data format, supporting sequential read and write.
-    mx->recordio->MXIndexedRecordIO  Reads/writes RecordIO data format, supporting random access.
-    mx->image->ImageIter             Image data iterator with a large number of augmentation choices.
-=cut
-
-has 'handle'           => (is => 'ro', isa => 'DataIterHandle', required => 1);
-has '_debug_skip_load' => (is => 'rw', isa => 'Int', default => 0);
-has '_debug_at_begin'  => (is => 'rw', isa => 'Int', default => 0);
-has 'data_name'        => (is => 'ro', isa => 'Str', default => 'data');
-has 'label_name'       => (is => 'ro', isa => 'Str', default => 'softmax_label');
-has [qw/first_batch
-        provide_data
-        provide_label
-        batch_size/]   => (is => 'rw', init_arg => undef);
-
-sub BUILD
-{
-    my $self = shift;
-    $self->first_batch($self->next);
-    my $data = $self->first_batch->data->[0];
-    $self->provide_data([
-        AI::MXNet::DataDesc->new(
-            name  => $self->data_name,
-            shape => $data->shape,
-            dtype => $data->dtype
-        )
-    ]);
-    my $label = $self->first_batch->label->[0];
-    $self->provide_label([
-        AI::MXNet::DataDesc->new(
-            name  => $self->label_name,
-            shape => $label->shape,
-            dtype => $label->dtype
-        )
-    ]);
-    $self->batch_size($data->shape->[0]);
-}
-
-sub DEMOLISH
-{
-    check_call(AI::MXNetCAPI::DataIterFree(shift->handle));
-}
-
-=head2 debug_skip_load
-
-    Sets the iterator to always simply return the first batch.
-
-    Notes
-    -----
-    This can be used to test the speed of the network without taking
-    the loading delay into account.
-=cut
-
-method debug_skip_load()
-{
-    $self->_debug_skip_load(1);
-    AI::MXNet::Logging->info('Set debug_skip_load to be true, will simply return first batch');
-}
-
-method reset()
-{
-    $self->_debug_at_begin(1);
-    $self->first_batch(undef);
-    check_call(AI::MXNetCAPI::DataIterBeforeFirst($self->handle));
-}
-
-method next()
-{
-    if($self->_debug_skip_load and not $self->_debug_at_begin)
-    {
-        return AI::MXNet::DataBatch->new(
-            data  => [$self->getdata],
-            label => [$self->getlabel],
-            pad   => $self->getpad,
-            index => $self->getindex
-        );
-    }
-    if(defined $self->first_batch)
-    {
-        my $batch = $self->first_batch;
-        $self->first_batch(undef);
-        return $batch;
-    }
-    $self->_debug_at_begin(0);
-    my $next_res = check_call(AI::MXNetCAPI::DataIterNext($self->handle));
-    if($next_res)
-    {
-        return AI::MXNet::DataBatch->new(
-            data  => [$self->getdata],
-            label => [$self->getlabel],
-            pad   => $self->getpad,
-            index => $self->getindex
-        );
-    }
-    else
-    {
-        return undef;
-    }
-}
-
-method iter_next()
-{
-    if(defined $self->first_batch)
-    {
-        return 1;
-    }
-    else
-    {
-        return scalar(check_call(AI::MXNetCAPI::DataIterNext($self->handle)));
-    }
-}
-
-method getdata()
-{
-    my $handle = check_call(AI::MXNetCAPI::DataIterGetData($self->handle));
-    return AI::MXNet::NDArray->_ndarray_cls($handle);
-}
-
-method getlabel()
-{
-    my $handle = check_call(AI::MXNetCAPI::DataIterGetLabel($self->handle));
-    return AI::MXNet::NDArray->_ndarray_cls($handle);
-}
-
-method getindex()
-{
-    return pdl(check_call(AI::MXNetCAPI::DataIterGetIndex($self->handle)));
-}
-
-method getpad()
-{
-    return scalar(check_call(AI::MXNetCAPI::DataIterGetPadNum($self->handle)));
-}
-
-package AI::MXNet::IO;
-
-sub NDArrayIter { shift; return AI::MXNet::NDArrayIter->new(@_); }
-
-my %iter_meta;
-method get_iter_meta()
-{
-    return \%iter_meta;
-}
-
-# Create an io iterator by handle.
-func _make_io_iterator($handle)
-{
-    my ($iter_name, $desc,
-        $arg_names, $arg_types, $arg_descs
-    ) = @{ check_call(AI::MXNetCAPI::DataIterGetIterInfo($handle)) };
-    my $param_str = build_param_doc($arg_names, $arg_types, $arg_descs);
-    my $doc_str = "$desc\n\n"
-                 ."$param_str\n"
-                 ."name : string, required.\n"
-                 ."    Name of the resulting data iterator.\n\n"
-                 ."Returns\n"
-                 ."-------\n"
-                 ."iterator: DataIter\n"
-                 ."    The result iterator.";
-    my $iter = sub {
-        my $class = shift;
-        my (@args, %kwargs);
-        if(@_ and ref $_[-1] eq 'HASH')
-        {
-            %kwargs = %{ pop(@_) };
-        }
-        @args = @_;
-        Carp::confess("$iter_name can only accept keyword arguments")
-            if @args;
-        for my $key (keys %kwargs)
-        {
-            $kwargs{ $key } = "(" .join(",", @{ $kwargs{ $key } }) .")"
-                if ref $kwargs{ $key } eq 'ARRAY';
-        }
-        my $handle = check_call(
-            AI::MXNetCAPI::DataIterCreateIter(
-                $handle,
-                scalar(keys %kwargs),
-                \%kwargs
-            )
-        );
-        return AI::MXNet::MXDataIter->new(handle => $handle, %kwargs);
-    };
-    $iter_meta{$iter}{__name__} = $iter_name;
-    $iter_meta{$iter}{__doc__}  = $doc_str;
-    return $iter;
-}
-
-# List and add all the data iterators to the current module.
-method _init_io_module()
-{
-    for my $creator (@{ check_call(AI::MXNetCAPI::ListDataIters()) })
-    {
-        my $data_iter = _make_io_iterator($creator);
-        {
-            my $name = $iter_meta{ $data_iter }{__name__};
-            no strict 'refs';
-            {
-                *{__PACKAGE__."::$name"} = $data_iter;
-            }
-        }
-    }
-}
-
-# Initialize the io module on startup
-__PACKAGE__->_init_io_module;
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Image.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Image.pm
deleted file mode 100644
index ab4e7964918b..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Image.pm
+++ /dev/null
@@ -1,994 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::Image;
-use strict;
-use warnings;
-use Scalar::Util qw(blessed);
-use AI::MXNet::NS;
-use AI::MXNet::Base;
-use AI::MXNet::Function::Parameters;
-use AI::MXNet::Image::NDArray;
-use AI::MXNet::Image::Symbol;
-
-=head1 NAME
-
-    AI::MXNet::Image - Read individual image files and perform augmentations.
-=cut
-
-=head2 imread
-
-    Read and decode an image to an NDArray.
-
-    Note: `imread` uses OpenCV.
-    MXNet must have been built with USE_OPENCV=1 for `imread` to work.
-
-    Parameters
-    ----------
-    $filename : str
-        Name of the image file to be loaded.
-    :$flag : int
-        0 for grayscale. 1 for colored.
-    :$to_rgb : int
-        0 for BGR format (OpenCV default). 1 for RGB format (MXNet default).
-    :$out : NDArray
-        Output buffer. Do not specify for automatic allocation.
-
-    Returns
-    -------
-    An NDArray containing the image.
-
-    Example
-    -------
-    >>> mx->img->imread("flower.jpg");
-
-    Set `flag` parameter to 0 to get grayscale output
-
-    >>> mx->img->imread("flower.jpg", flag=>0);
-
-    Set `to_rgb` parameter to 0 to get output in OpenCV format (BGR)
-
-    >>> mx->img->imread("flower.jpg", to_rgb=>0);
-
-=cut
-
-method imread(Str $filename, Int :$flag=1, Int :$to_rgb=1, Maybe[AI::MXNet::NDArray] :$out=)
-{
-    return AI::MXNet::NDArray->_cvimread($filename, { flag => $flag, to_rgb => $to_rgb, ($out ? (out => $out) : ()) });
-}
-
-=head2 imdecode
-
-    Decode an image from a string. Requires OpenCV to work.
-
-    Parameters
-    ----------
-    $buf : str, array ref, pdl, ndarray
-        Binary image data.
-    :$flag : int
-        0 for grayscale. 1 for colored.
-    :$to_rgb : int
-        0 for BGR format (OpenCV default). 1 for RGB format (MXNet default).
-    :$out : NDArray
-        Output buffer. Do not specify for automatic allocation.
-=cut
-
-method imdecode(Str|PDL $buf, Int :$flag=1, Int :$to_rgb=1, Maybe[AI::MXNet::NDArray] :$out=)
-{
-    if(not ref $buf)
-    {
-        my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{'uint8'});
-        my $len; { use bytes; $len = length $buf; }
-        my $pdl = PDL->new_from_specification($pdl_type, $len);
-        ${$pdl->get_dataref} = $buf;
-        $pdl->upd_data;
-        $buf = $pdl;
-    }
-    if(not (blessed $buf and $buf->isa('AI::MXNet::NDArray')))
-    {
-        $buf = AI::MXNet::NDArray->array($buf, dtype=>'uint8');
-    }
-    return AI::MXNet::NDArray->_cvimdecode($buf, { flag => $flag, to_rgb => $to_rgb, ($out ? (out => $out) : ()) });
-}
-
-=head2 scale_down
-
-    Scale down the crop size if it's bigger than the image size.
-
-    Parameters:
-    -----------
-    Shape $src_size
-    Shape $size
-
-    Returns:
-    --------
-    ($w, $h)
-=cut
-
-method scale_down(Shape $src_size, Shape $size)
-{
-    my ($w, $h)   = @{ $size };
-    my ($sw, $sh) = @{ $src_size };
-    if($sh < $h)
-    {
-        ($w, $h) = (($w*$sh)/$h, $sh);
-    }
-    if($sw < $w)
-    {
-        ($w, $h) = ($sw, ($h*$sw)/$w);
-    }
-    return (int($w), int($h));
-}
-
-=head2 resize_short
-
-    Resize the shorter edge to the given size.
-
-    Parameters:
-    -----------
-    AI::MXNet::NDArray $src
-    Int $size
-    Int $interp=2
-
-    Returns:
-    --------
-    AI::MXNet::NDArray $resized_image
-=cut
-
-method resize_short(AI::MXNet::NDArray $src, Int $size, Int $interp=2)
-{
-    my ($new_h, $new_w);
-    my ($h, $w) = @{ $src->shape };
-    if($h > $w)
-    {
-        ($new_h, $new_w) = ($size*$h/$w, $size);
-    }
-    else
-    {
-        ($new_h, $new_w) = ($size, $size*$w/$h);
-    }
-    return AI::MXNet::NDArray->_cvimresize($src, int $new_w, int $new_h, { interp=>$interp });
-}
-
-=head2 fixed_crop
-
-    Crop src at a fixed location, and (optionally) resize it to the given size.
-
-    Parameters:
-    -----------
-    AI::MXNet::NDArray $src
-    Int $x0
-    Int $y0
-    Int $w
-    Int $h
-    Maybe[Shape] $size=
-    Int $interp=2
-
-    Returns:
-    --------
-    AI::MXNet::NDArray $cropped_image
-=cut
-
-method fixed_crop(AI::MXNet::NDArray $src, Int $x0, Int $y0, Int $w, Int $h, Maybe[Shape] $size=, Int $interp=2)
-{
-    my $out = AI::MXNet::NDArray->crop($src, { begin=>[$y0, $x0, 0], end=>[$y0+$h, $x0+$w, $src->shape->[2]] });
-    if(defined $size and join(',', $w, $h) ne join(',', @{ $size }))
-    {
-        $out = AI::MXNet::NDArray->_cvimresize($out, (map { int } @{ $size }), { interp=>$interp });
-    }
-    return $out;
-}
-
-=head2 random_crop
-
-    Randomly crop src to the given size. Upsamples the result if src is smaller than the size.
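-
-    For example (an illustrative sketch; assumes an HWC image already loaded
-    via imread):
-
-        my ($cropped, $rect) = AI::MXNet::Image->random_crop($img, [100, 100]);
-        # $rect is [$x0, $y0, $new_w, $new_h] of the region that was cropped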
-
-    Parameters:
-    -----------
-    AI::MXNet::NDArray $src
-    Shape $size
-    Int $interp=2
-
-    Returns:
-    --------
-    ($cropped_image, [$x0, $y0, $new_w, $new_h])
-=cut
-
-method random_crop(AI::MXNet::NDArray $src, Shape $size, Int $interp=2)
-{
-    my ($h, $w) = @{ $src->shape };
-    my ($new_w, $new_h) = __PACKAGE__->scale_down([$w, $h], $size);
-
-    my $x0 = int(rand($w - $new_w + 1));
-    my $y0 = int(rand($h - $new_h + 1));
-
-    my $out = __PACKAGE__->fixed_crop($src, $x0, $y0, $new_w, $new_h, $size, $interp);
-    return ($out, [$x0, $y0, $new_w, $new_h]);
-}
-
-=head2 center_crop
-
-    Crop src at the center to the given size. Upsamples the result if src is smaller than the size.
-
-    Parameters:
-    -----------
-    AI::MXNet::NDArray $src
-    Shape $size
-    Int $interp=2
-
-    Returns:
-    --------
-    ($cropped_image, [$x0, $y0, $new_w, $new_h])
-=cut
-
-method center_crop(AI::MXNet::NDArray $src, Shape $size, Int $interp=2)
-{
-    my ($h, $w) = @{ $src->shape };
-    my ($new_w, $new_h) = __PACKAGE__->scale_down([$w, $h], $size);
-
-    my $x0 = int(($w - $new_w)/2);
-    my $y0 = int(($h - $new_h)/2);
-
-    my $out = __PACKAGE__->fixed_crop($src, $x0, $y0, $new_w, $new_h, $size, $interp);
-    return ($out, [$x0, $y0, $new_w, $new_h]);
-}
-
-=head2 color_normalize
-
-    Normalize src with mean and std.
-
-    Parameters:
-    -----------
-    AI::MXNet::NDArray $src
-    Num|AI::MXNet::NDArray $mean
-    Maybe[Num|AI::MXNet::NDArray] $std=
-
-    Returns:
-    --------
-    AI::MXNet::NDArray $normalized_image
-=cut
-
-method color_normalize(AI::MXNet::NDArray $src, Num|AI::MXNet::NDArray $mean, Maybe[Num|AI::MXNet::NDArray] $std=)
-{
-    $src -= $mean;
-    if(defined $std)
-    {
-        $src /= $std;
-    }
-    return $src;
-}
-
-=head2 random_size_crop
-
-    Randomly crop src to the given size. Randomizes the area and aspect ratio.
-
-    Parameters:
-    -----------
-    AI::MXNet::NDArray $src
-    Shape $size
-    Num $min_area
-    ArrayRef[Num] $ratio # [$from, $to]
-    Maybe[Int] $interp=2
-
-    Returns:
-    --------
-    ($cropped_image, [$x0, $y0, $new_w, $new_h])
-=cut
-
-method random_size_crop(AI::MXNet::NDArray $src, Shape $size, Num $min_area, ArrayRef[Num] $ratio, Maybe[Int] $interp=2)
-{
-    my ($h, $w) = @{ $src->shape };
-    my ($from, $to) = @{ $ratio };
-    my $new_ratio = $from + ($to-$from) * rand;
-    my $max_area;
-    if($new_ratio * $h > $w)
-    {
-        $max_area = $w*int($w/$new_ratio);
-    }
-    else
-    {
-        $max_area = $h*int($h*$new_ratio);
-    }
-
-    $min_area *= $h*$w;
-    if($max_area < $min_area)
-    {
-        return __PACKAGE__->random_crop($src, $size, $interp);
-    }
-    my $new_area = $min_area + ($max_area-$min_area) * rand;
-    my $new_w = int(sqrt($new_area*$new_ratio));
-    my $new_h = $new_w;
-
-    assert($new_w <= $w and $new_h <= $h);
-    my $x0 = int(rand($w - $new_w + 1));
-    my $y0 = int(rand($h - $new_h + 1));
-
-    my $out = __PACKAGE__->fixed_crop($src, $x0, $y0, $new_w, $new_h, $size, $interp);
-    return ($out, [$x0, $y0, $new_w, $new_h]);
-}
-
-=head2 ResizeAug
-
-    Makes "resize shorter edge to size" augmenter closure.
-
-    Parameters:
-    -----------
-    Shape $size
-    Int $interp=2
-
-    Returns:
-    --------
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns [__PACKAGE__->resize_short($src, $size, $interp)]
-=cut
-
-method ResizeAug(Shape $size, Int $interp=2)
-{
-    my $aug = sub {
-        my $src = shift;
-        return [__PACKAGE__->resize_short($src, $size, $interp)];
-    };
-    return $aug;
-}
-
-=head2 RandomCropAug
-
-    Makes "random crop" augmenter closure.
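-
-    For example (an illustrative sketch): every *Aug helper returns a plain
-    CodeRef, so augmenters can also be applied by hand:
-
-        my $aug = AI::MXNet::Image->RandomCropAug([224, 224]);
-        my ($out) = @{ $aug->($img) };   # array ref holding the augmented image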
-
-    Parameters:
-    -----------
-    Shape $size
-    Int $interp=2
-
-    Returns:
-    --------
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns [(__PACKAGE__->random_crop($src, $size, $interp))[0]]
-=cut
-
-method RandomCropAug(Shape $size, Int $interp=2)
-{
-    my $aug = sub {
-        my $src = shift;
-        return [(__PACKAGE__->random_crop($src, $size, $interp))[0]];
-    };
-    return $aug;
-}
-
-=head2 RandomSizedCropAug
-
-    Makes "random size crop" augmenter closure.
-
-    Parameters:
-    -----------
-    Shape $size
-    Num $min_area
-    ArrayRef[Num] $ratio
-    Int $interp=2
-
-    Returns:
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns [(__PACKAGE__->random_size_crop($src, $size, $min_area, $ratio, $interp))[0]]
-=cut
-
-method RandomSizedCropAug(Shape $size, Num $min_area, ArrayRef[Num] $ratio, Int $interp=2)
-{
-    my $aug = sub {
-        my $src = shift;
-        return [(__PACKAGE__->random_size_crop($src, $size, $min_area, $ratio, $interp))[0]];
-    };
-    return $aug;
-}
-
-=head2 CenterCropAug
-
-    Makes "center crop" augmenter closure.
-
-    Parameters:
-    -----------
-    Shape $size
-    Int $interp=2
-
-    Returns:
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns [(__PACKAGE__->center_crop($src, $size, $interp))[0]]
-=cut
-
-method CenterCropAug(Shape $size, Int $interp=2)
-{
-    my $aug = sub {
-        my $src = shift;
-        return [(__PACKAGE__->center_crop($src, $size, $interp))[0]];
-    };
-    return $aug;
-}
-
-=head2 RandomOrderAug
-
-    Makes "apply list of augmenters in random order" closure.
-
-    Parameters:
-    -----------
-    ArrayRef[CodeRef] $ts
-
-    Returns:
-    --------
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns ArrayRef[AI::MXNet::NDArray]
-=cut
-
-method RandomOrderAug(ArrayRef[CodeRef] $ts)
-{
-    my $aug = sub {
-        my $src = shift;
-        my @ts = List::Util::shuffle(@{ $ts });
-        my @tmp;
-        for my $t (@ts)
-        {
-            push @tmp, $t->($src);
-        }
-        return \@tmp;
-    };
-    return $aug;
-}
-
-=head2 ColorJitterAug
-
-    Makes "apply random brightness, contrast and saturation jitter in random order" closure.
-
-    Parameters:
-    -----------
-    Num $brightness
-    Num $contrast
-    Num $saturation
-
-    Returns:
-    --------
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns ArrayRef[AI::MXNet::NDArray]
-=cut
-
-method ColorJitterAug(Num $brightness, Num $contrast, Num $saturation)
-{
-    my @ts;
-    my $coef = AI::MXNet::NDArray->array([[[0.299, 0.587, 0.114]]]);
-    if($brightness > 0)
-    {
-        my $baug = sub { my $src = shift;
-            my $alpha = 1 + -$brightness + 2 * $brightness * rand;
-            $src *= $alpha;
-            return [$src];
-        };
-        push @ts, $baug;
-    }
-
-    if($contrast > 0)
-    {
-        my $caug = sub { my $src = shift;
-            my $alpha = 1 + -$contrast + 2 * $contrast * rand;
-            my $gray  = $src*$coef;
-            $gray = (3.0*(1.0-$alpha)/$gray->size)*$gray->sum;
-            $src *= $alpha;
-            $src += $gray;
-            return [$src];
-        };
-        push @ts, $caug;
-    }
-
-    if($saturation > 0)
-    {
-        my $saug = sub { my $src = shift;
-            my $alpha = 1 + -$saturation + 2 * $saturation * rand;
-            my $gray  = $src*$coef;
-            $gray = AI::MXNet::NDArray->sum($gray, { axis=>2, keepdims =>1 });
-            $gray *= (1.0-$alpha);
-            $src *= $alpha;
-            $src += $gray;
-            return [$src];
-        };
-        push @ts, $saug;
-    }
-
-    return __PACKAGE__->RandomOrderAug(\@ts);
-}
-
-=head2 LightingAug
-
-    Makes "add PCA based noise" closure.
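-
-    For example (an illustrative sketch; the eigenvalues/eigenvectors below
-    are the ImageNet PCA statistics also used by CreateAugmenter, and the
-    0.1 noise level is made up):
-
-        my $eigval = AI::MXNet::NDArray->array([55.46, 4.794, 1.148])->aspdl;
-        my $eigvec = AI::MXNet::NDArray->array([[-0.5675,  0.7192,  0.4009],
-                                                [-0.5808, -0.0045, -0.8140],
-                                                [-0.5836, -0.6948,  0.4203]])->aspdl;
-        my $aug = AI::MXNet::Image->LightingAug(0.1, $eigval, $eigvec);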
-
-    Parameters:
-    -----------
-    Num $alphastd
-    PDL $eigval
-    PDL $eigvec
-
-    Returns:
-    --------
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns ArrayRef[AI::MXNet::NDArray]
-=cut
-
-method LightingAug(Num $alphastd, PDL $eigval, PDL $eigvec)
-{
-    my $aug = sub { my $src = shift;
-        my $alpha = AI::MXNet::NDArray->zeros([3]);
-        AI::MXNet::Random->normal(0, $alphastd, { out => $alpha });
-        my $rgb = ($eigvec*$alpha->aspdl) x $eigval;
-        $src += AI::MXNet::NDArray->array($rgb);
-        return [$src];
-    };
-    return $aug;
-}
-
-=head2 ColorNormalizeAug
-
-    Makes "mean and std normalization" closure.
-
-    Parameters:
-    -----------
-    PDL $mean
-    PDL $std
-
-    Returns:
-    --------
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns [__PACKAGE__->color_normalize($src, $mean, $std)]
-=cut
-
-method ColorNormalizeAug(PDL $mean, PDL $std)
-{
-    $mean = AI::MXNet::NDArray->array($mean);
-    $std  = AI::MXNet::NDArray->array($std);
-    my $aug = sub { my $src = shift;
-        return [__PACKAGE__->color_normalize($src, $mean, $std)];
-    };
-    return $aug;
-}
-
-=head2 HorizontalFlipAug
-
-    Makes "random horizontal flipping" closure.
-
-    Parameters:
-    -----------
-    Num $p # probability of a flip, 0 < $p < 1
-
-    Returns:
-    --------
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns [$p > rand() ? AI::MXNet::NDArray->flip($src, { axis=>1 }) : $src]
-=cut
-
-method HorizontalFlipAug(Num $p)
-{
-    my $aug = sub { my $src = shift;
-        return [$p > rand() ? AI::MXNet::NDArray->flip($src, { axis=>1 }) : $src];
-    };
-    return $aug;
-}
-
-=head2 CastAug
-
-    Makes "cast to float32" closure.
-
-    Returns:
-    --------
-    CodeRef that accepts AI::MXNet::NDArray $src as input
-    and returns [$src->astype('float32')]
-=cut
-
-method CastAug()
-{
-    my $aug = sub { my $src = shift;
-        return [$src->astype('float32')];
-    };
-    return $aug;
-}
-
-=head2 CreateAugmenter
-
-    Create augmenter list.
-
-    Parameters:
-    -----------
-    Shape :$data_shape,
-    Int :$resize=0,
-    Bool :$rand_crop=0,
-    Bool :$rand_resize=0,
-    Bool :$rand_mirror=0,
-    Maybe[Num|PDL] :$mean=,
-    Maybe[Num|PDL] :$std=,
-    Num :$brightness=0,
-    Num :$contrast=0,
-    Num :$saturation=0,
-    Num :$pca_noise=0,
-    Int :$inter_method=2
-=cut
-
-method CreateAugmenter(
-    Shape :$data_shape,
-    Int :$resize=0,
-    Bool :$rand_crop=0,
-    Bool :$rand_resize=0,
-    Bool :$rand_mirror=0,
-    Maybe[Num|PDL] :$mean=,
-    Maybe[Num|PDL] :$std=,
-    Num :$brightness=0,
-    Num :$contrast=0,
-    Num :$saturation=0,
-    Num :$pca_noise=0,
-    Int :$inter_method=2
-)
-{
-    my @auglist;
-    if($resize > 0)
-    {
-        push @auglist, __PACKAGE__->ResizeAug($resize, $inter_method);
-    }
-
-    my $crop_size = [$data_shape->[2], $data_shape->[1]];
-    if($rand_resize)
-    {
-        assert($rand_crop);
-        push @auglist, __PACKAGE__->RandomSizedCropAug($crop_size, 0.3, [3.0/4.0, 4.0/3.0], $inter_method);
-    }
-    elsif($rand_crop)
-    {
-        push @auglist, __PACKAGE__->RandomCropAug($crop_size, $inter_method);
-    }
-    else
-    {
-        push @auglist, __PACKAGE__->CenterCropAug($crop_size, $inter_method);
-    }
-
-    if($rand_mirror)
-    {
-        push @auglist, __PACKAGE__->HorizontalFlipAug(0.5);
-    }
-
-    push @auglist, __PACKAGE__->CastAug;
-
-    if($brightness or $contrast or $saturation)
-    {
-        push @auglist, __PACKAGE__->ColorJitterAug($brightness, $contrast, $saturation);
-    }
-    if($pca_noise > 0)
-    {
-        my $eigval = AI::MXNet::NDArray->array([55.46, 4.794, 1.148])->aspdl;
-        my $eigvec = AI::MXNet::NDArray->array([[-0.5675,  0.7192,  0.4009],
-                                                [-0.5808, -0.0045, -0.8140],
-                                                [-0.5836, -0.6948,  0.4203]])->aspdl;
-        push @auglist, __PACKAGE__->LightingAug($pca_noise, $eigval, $eigvec);
-    }
-
-    # a true scalar value requests the default ImageNet mean/std statistics;
-    # an explicitly supplied PDL is kept as-is
-    if($mean and not ref $mean)
-    {
-        $mean = AI::MXNet::NDArray->array([123.68, 116.28, 103.53])->aspdl;
-    }
-    if($std and not ref $std)
-    {
-        $std = AI::MXNet::NDArray->array([58.395, 57.12, 57.375])->aspdl;
-    }
-    if(defined $mean)
-    {
-        assert(defined $std);
-        push @auglist, __PACKAGE__->ColorNormalizeAug($mean, $std);
-    }
-
-    return \@auglist;
-}
-
-method imresize(AI::MXNet::NDArray $src, Int $w, Int $h, Int $interp=2)
-{
-    return AI::MXNet::NDArray->_cvimresize($src, int $w, int $h, { interp=>$interp });
-}
-
-method ImageIter(@args) { AI::MXNet::ImageIter->new(@args) }
-
-package AI::MXNet::ImageIter;
-use Mouse;
-use AI::MXNet::Base;
-extends 'AI::MXNet::DataIter';
-
-=head1 NAME
-
-    AI::MXNet::ImageIter - Image data iterator.
-=cut
-
-=head1 DESCRIPTION
-
-    Image data iterator with a large number of augmentation choices.
-    Supports reading from both .rec files and raw image files with an image list.
-
-    To load from .rec files, please specify path_imgrec. Also specify path_imgidx
-    to use data partitioning (for distributed training) or shuffling.
-
-    To load from raw image files, specify path_imglist and path_root.
-
-    Parameters
-    ----------
-    batch_size : Int
-        Number of examples per batch.
-    data_shape : Shape
-        Data shape in (channels, height, width).
-        For now, only RGB images with 3 channels are supported.
-    label_width : Int
-        dimension of label
-    path_imgrec : str
-        path to image record file (.rec).
-        Created with tools/im2rec.py or bin/im2rec.
-    path_imglist : str
-        path to image list (.lst).
-        Created with tools/im2rec.py or with a custom script.
-        Format: index\t[one or more label separated by \t]\trelative_path_from_root
-    imglist: array ref
-        a list of images with the label(s).
-        each item is a list [imagelabel: float or array ref of float, imgpath]
-    path_root : str
-        Root folder of image files.
-    path_imgidx : str
-        Path to image index file. Needed for partitioning and shuffling when using a .rec source.
-    shuffle : bool
-        Whether to shuffle all images at the start of each iteration.
-        Can be slow for HDD.
-    part_index : int
-        Partition index.
-    num_parts : int
-        Total number of partitions.
-    data_name='data' : Str
-    label_name='softmax_label' : Str
-    kwargs : hash ref with any additional arguments for augmenters
-=cut
-
-has 'batch_size'  => (is => 'ro', isa => 'Int', required => 1);
-has 'data_shape'  => (is => 'ro', isa => 'Shape', required => 1);
-has 'label_width' => (is => 'ro', isa => 'Int', default => 1);
-has 'data_name'   => (is => 'ro', isa => 'Str', default => 'data');
-has 'label_name'  => (is => 'ro', isa => 'Str', default => 'softmax_label');
-has [qw/path_imgrec
-        path_imglist
-        path_root
-        path_imgidx
-    /]            => (is => 'ro', isa => 'Str');
-has 'shuffle'     => (is => 'ro', isa => 'Bool', default => 0);
-has 'part_index'  => (is => 'ro', isa => 'Int', default => 0);
-has 'num_parts'   => (is => 'ro', isa => 'Int', default => 0);
-has 'aug_list'    => (is => 'rw', isa => 'ArrayRef[CodeRef]');
-has 'imglist'     => (is => 'rw', isa => 'ArrayRef|HashRef');
-has 'kwargs'      => (is => 'ro', isa => 'HashRef');
-has [qw/imgidx
-        imgrec
-        seq
-        cur
-        provide_data
-        provide_label
-    /]            => (is => 'rw', init_arg => undef);
-
-sub BUILD
-{
-    my $self = shift;
-    assert($self->path_imgrec or $self->path_imglist or ref $self->imglist eq 'ARRAY');
-    if($self->path_imgrec)
-    {
-        print("loading recordio...\n");
-        if($self->path_imgidx)
-        {
-            $self->imgrec(
-                AI::MXNet::IndexedRecordIO->new(
-                    idx_path => $self->path_imgidx,
-                    uri      => $self->path_imgrec,
-                    flag     => 'r'
-                )
-            );
-            $self->imgidx([@{ $self->imgrec->keys }]);
-        }
-        else
-        {
-            $self->imgrec(AI::MXNet::RecordIO->new(uri => $self->path_imgrec, flag => 'r'));
-        }
-    }
-    my %imglist;
-    my @imgkeys;
-    if($self->path_imglist)
-    {
-        print("loading image list...\n");
-        open(my $f, $self->path_imglist) or confess("can't open ${\ $self->path_imglist } : $!");
-        while(my $line = <$f>)
-        {
-            chomp($line);
-            my @line  = split(/\t/, $line);
-            my $label = AI::MXNet::NDArray->array([@line[1..@line-2]]);
-            my $key   = $line[0];
-            $imglist{$key} = [$label, $line[-1]];
-            push @imgkeys, $key;
-        }
-        $self->imglist(\%imglist);
-    }
-    elsif(ref $self->imglist eq 'ARRAY')
-    {
-        print("loading image list...\n");
-        my %result;
-        my $index = 1;
-        for my $img (@{ $self->imglist })
-        {
-            my $key = $index++;
-            my $label;
-            if(not ref $img->[0])
-            {
-                $label = AI::MXNet::NDArray->array([$img->[0]]);
-            }
-            else
-            {
-                $label = AI::MXNet::NDArray->array($img->[0]);
-            }
-            $result{$key} = [$label, $img->[1]];
-            push @imgkeys, $key;
-        }
-        $self->imglist(\%result);
-    }
-    assert(@{ $self->data_shape } == 3 and $self->data_shape->[0] == 3);
-    $self->provide_data([
-        AI::MXNet::DataDesc->new(
-            name  => $self->data_name,
-            shape => [$self->batch_size, @{ $self->data_shape }]
-        )
-    ]);
-    if($self->label_width > 1)
-    {
-        $self->provide_label([
-            AI::MXNet::DataDesc->new(
-                name  => $self->label_name,
-                shape => [$self->batch_size, $self->label_width]
-            )
-        ]);
-    }
-    else
-    {
-        $self->provide_label([
-            AI::MXNet::DataDesc->new(
-                name  => $self->label_name,
-                shape => [$self->batch_size]
-            )
-        ]);
-    }
-    if(not defined $self->imgrec)
-    {
-        $self->seq(\@imgkeys);
-    }
-    elsif($self->shuffle or $self->num_parts > 1)
-    {
-        assert(defined $self->imgidx);
-        $self->seq($self->imgidx);
-    }
-    if($self->num_parts > 1)
-    {
-        assert($self->part_index < $self->num_parts);
-        my $N = @{ $self->seq };
-        my $C = $N/$self->num_parts;
-        $self->seq([@{ $self->seq }[$self->part_index*$C..($self->part_index+1)*$C-1]]);
-    }
-    # a user-supplied aug_list wins; otherwise build one from kwargs
-    if(not defined $self->aug_list)
-    {
-        if(defined $self->kwargs)
-        {
-            $self->aug_list(AI::MXNet::Image->CreateAugmenter(data_shape => $self->data_shape, %{ $self->kwargs }));
-        }
-        else
-        {
-            $self->aug_list([]);
-        }
-    }
-    $self->cur(0);
-    $self->reset();
-}
-
-method reset()
-{
-    if($self->shuffle)
-    {
-        @{ $self->seq } = List::Util::shuffle(@{ $self->seq });
-    }
-    if(defined $self->imgrec)
-    {
-        $self->imgrec->reset;
-    }
-    $self->cur(0);
-}
-
-method next_sample()
-{
-    if(defined $self->seq)
-    {
-        return undef if($self->cur >= @{ $self->seq });
-        my $idx = $self->seq->[$self->cur];
-        $self->cur($self->cur + 1);
-        if(defined $self->imgrec)
-        {
-            my $s = $self->imgrec->read_idx($idx);
-            my ($header, $img) = AI::MXNet::RecordIO->unpack($s);
-            if(not defined $self->imglist)
-            {
-                return ($header->label, $img);
-            }
-            else
-            {
-                return ($self->imglist->{$idx}[0], $img);
-            }
-        }
-        else
-        {
-            my ($label, $fname) = @{ $self->imglist->{$idx} };
-            open(F, $self->path_root . "/$fname") or confess("can't open $fname $!");
-            my $img;
-            { local $/ = undef; $img = <F> };
-            close(F);
-            return ($label, $img);
-        }
-    }
-    else
-    {
-        my $s = $self->imgrec->read;
-        return undef if(not defined $s);
-        my ($header, $img) = AI::MXNet::RecordIO->unpack($s);
-        return ($header->label, $img);
-    }
-}
-
-method next()
-{
-    my $batch_size = $self->batch_size;
-    my ($c, $h, $w) = @{ $self->data_shape };
-    my $batch_data  = AI::MXNet::NDArray->empty([$batch_size, $c, $h, $w]);
-    my $batch_label = AI::MXNet::NDArray->empty(@{$self->provide_label->[0]}[1]);
-    my $i = 0;
-    while ($i < $batch_size)
-    {
-        my ($label, $s) = $self->next_sample;
-        last if not defined $label;
-        my $data = [AI::MXNet::Image->imdecode($s)];
-        if(@{ $data->[0]->shape } == 0)
-        {
-            AI::MXNet::Logging->debug('Invalid image, skipping.');
-            next;
-        }
-        for my $aug (@{ $self->aug_list })
-        {
-            $data = [map { @{ $aug->($_) } } @$data];
-        }
-        for my $d (@$data)
-        {
-            assert(($i < $batch_size), 'Batch size must be a multiple of augmenter output length');
-            $batch_data->at($i)  .= AI::MXNet::NDArray->transpose($d, { axes=>[2, 0, 1] });
-            $batch_label->at($i) .= $label;
-            $i++;
-        }
-    }
-    return undef if not $i;
-    return AI::MXNet::DataBatch->new(data=>[$batch_data], label=>[$batch_label], pad => $batch_size-$i);
-}
-
-package AI::MXNet::Image;
-sub sym     { 'AI::MXNet::Image::Symbol' }
-sub symbol  { 'AI::MXNet::Image::Symbol' }
-sub nd      { 'AI::MXNet::Image::NDArray' }
-sub ndarray { 'AI::MXNet::Image::NDArray' }
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Image/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Image/NDArray.pm
deleted file mode 100644
index aad1782b0b53..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Image/NDArray.pm
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::Image::NDArray;
-use strict;
-use warnings;
-use parent 'AI::MXNet::AutoLoad';
-sub config { ('image', 'AI::MXNet::NDArray') }
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Image/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Image/Symbol.pm
deleted file mode 100644
index 60416418b1ad..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Image/Symbol.pm
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::Image::Symbol;
-use strict;
-use warnings;
-use parent 'AI::MXNet::AutoLoad';
-sub config { ('image', 'AI::MXNet::Symbol') }
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
deleted file mode 100644
index 089731f16eee..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
+++ /dev/null
@@ -1,875 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::InitDesc;
-use Mouse;
-use AI::MXNet::Function::Parameters;
-
-=head1 NAME
-
-    AI::MXNet::InitDesc - A container for the initialization pattern serialization.
-
-=head2 new
-
-    Parameters
-    ----------
-    name : str
-        name of variable
-    attrs : hash ref of str to str
-        attributes of this variable taken from AI::MXNet::Symbol->attr_dict
-=cut
-has 'name'  => (is => 'ro', isa => 'Str', required => 1);
-has 'attrs' => (is => 'rw', isa => 'HashRef[Str]', lazy => 1, default => sub { +{} });
-use overload '""' => sub { shift->name };
-around BUILDARGS => sub {
-    my $orig  = shift;
-    my $class = shift;
-    return $class->$orig(name => $_[0]) if @_ == 1;
-    return $class->$orig(@_);
-};
-
-# Base class for Initializers
-package AI::MXNet::Initializer;
-use Mouse;
-use AI::MXNet::NS;
-use AI::MXNet::Base qw(:DEFAULT pzeros pceil);
-use AI::MXNet::NDArray;
-use JSON::PP;
-use overload "&{}" => sub { my $self = shift; sub { $self->call(@_) } },
-             '""'  => sub {
-                    my $self = shift;
-                    my ($name) = ref($self) =~ /::(\w+)$/;
-                    encode_json(
-                        [lc $name,
-                            $self->kwargs//{ map { $_ => "".$self->$_ } $self->meta->get_attribute_list }
-                    ]);
-             },
-             fallback => 1;
-has 'kwargs'      => (is => 'rw', init_arg => undef, isa => 'HashRef');
-has '_verbose'    => (is => 'rw', isa => 'Bool', lazy => 1, default => 0);
-has '_print_func' => (is => 'rw', isa => 'CodeRef', lazy => 1,
-    default => sub {
-        return sub {
-            my $x = shift;
-            return ($x->norm/sqrt($x->size))->asscalar;
-        };
-    }
-);
-
-=head1 NAME
-
-    AI::MXNet::Initializer - Base class for all Initializers
-
-=head1 DESCRIPTION
-
-    The base class AI::MXNet::Initializer defines the default behaviors to initialize various parameters,
-    such as set bias to 1, except for the weight. Other classes then define how to initialize the weights.
-    Currently the following classes are available:
-    mx->init->Uniform    Initializes weights with random values uniformly sampled from a given range.
-    mx->init->Normal     Initializes weights with random values sampled from a normal distribution with a mean of zero and standard deviation of sigma.
-    mx->init->Load       Initializes variables by loading data from a file or a hash ref.
-    mx->init->Mixed      Initializes parameters using multiple initializers.
-    mx->init->Zero       Initializes weights to zero.
-    mx->init->One        Initializes weights to one.
-    mx->init->Constant   Initializes the weights to a given value.
-    mx->init->Orthogonal Initializes weights as an orthogonal matrix.
-    mx->init->Xavier     Returns an initializer performing Xavier initialization for weights.
-    mx->init->MSRAPrelu  Initializes weights according to a MSRA paper.
-    mx->init->Bilinear   Initializes weights for upsampling layers.
-    mx->init->FusedRNN   Initializes parameters for fused rnn layers.
-
-=head2 register
-
-    Register an initializer class to the AI::MXNet::Initializer factory.
-=cut
-
-=head2 set_verbosity
-
-    Switch on/off verbose mode
-
-    Parameters
-    ----------
-    $verbose : bool
-        switch on/off verbose mode
-    $print_func : CodeRef
-        A function that computes statistics of initialized arrays.
-        Takes an AI::MXNet::NDArray and returns a scalar. Defaults to mean
-        absolute value |x|/size(x).
-=cut
-
-method set_verbosity(Bool $verbose=0, CodeRef $print_func=)
-{
-    $self->_verbose($verbose);
-    $self->_print_func($print_func) if defined $print_func;
-}
-
-method _verbose_print($desc, $init, $arr)
-{
-    if($self->_verbose and defined $self->_print_func)
-    {
-        AI::MXNet::Logging->info('Initialized %s as %s: %s', $desc, $init, $self->_print_func->($arr));
-    }
-}
-
-my %init_registry;
-method get_init_registry()
-{
-    return \%init_registry;
-}
-
-method register()
-{
-    my ($name) = $self =~ /::(\w+)$/;
-    my $orig_name = $name;
-    $name = lc $name;
-    if(exists $init_registry{ $name })
-    {
-        my $existing = $init_registry{ $name };
-        warn(
-            "WARNING: New initializer $self.$name "
-            ."is overriding existing initializer $existing.$name"
-        );
-    }
-    $init_registry{ $name } = $self;
-    {
-        no strict 'refs';
-        no warnings 'redefine';
-        *{"$orig_name"} = sub { shift; $self->new(@_) };
-        *InitDesc = sub { shift; AI::MXNet::InitDesc->new(@_) };
-    }
-}
-
-=head2 init
-
-    Parameters
-    ----------
-    $desc : AI::MXNet::InitDesc|str
-        a name of the corresponding ndarray
-        or the object that describes the initializer.
-
-    $arr : AI::MXNet::NDArray
-        an ndarray to be initialized.
-=cut
-method call(Str|AI::MXNet::InitDesc $desc, AI::MXNet::NDArray $arr)
-{
-    return $self->_legacy_init($desc, $arr) unless blessed $desc;
-    my $init = $desc->attrs->{ __init__ };
-    if($init)
-    {
-        my ($klass, $kwargs);
-        if(exists $self->get_init_registry->{ lc $init })
-        {
-            $klass  = $init;
-            $kwargs = {};
-        }
-        else
-        {
-            ($klass, $kwargs) = @{ decode_json($init) };
-        }
-        $self->get_init_registry->{ lc $klass }->new(%{ $kwargs })->_init_weight("$desc", $arr);
-        $self->_verbose_print($desc, $init, $arr);
-    }
-    else
-    {
-        $desc = "$desc";
-        if($desc =~ /(weight|bias|gamma|beta)$/)
-        {
-            my $method = "_init_$1";
-            $self->$method($desc, $arr);
-            $self->_verbose_print($desc, $1, $arr);
-        }
-        elsif($desc =~ /min$/)
-        {
-            $self->_init_zero($desc, $arr);
-            $self->_verbose_print($desc, 'min', $arr);
-        }
-        elsif($desc =~ /max$/)
-        {
-            $self->_init_one($desc, $arr);
-            $self->_verbose_print($desc, 'max', $arr);
-        }
-        else
-        {
-            $self->_init_default($desc, $arr)
-        }
-    }
-}
-
-method _legacy_init(Str $name, AI::MXNet::NDArray $arr)
-{
-    warnings::warnif(
-        'deprecated',
-        'Calling initializer with init($str, $NDArray) has been deprecated. '.
-        'Please use init(mx->init->InitDesc(...), NDArray) instead.'
- ); - if($name =~ /^upsampling/) - { - $self->_init_bilinear($name, $arr); - } - elsif($name =~ /^stn_loc/ and $name =~ /weight$/) - { - $self->_init_zero($name, $arr); - } - elsif($name =~ /^stn_loc/ and $name =~ /bias$/) - { - $self->_init_loc_bias($name, $arr); - } - elsif($name =~ /bias$/) - { - $self->_init_bias($name, $arr); - } - elsif($name =~ /gamma$/) - { - $self->_init_gamma($name, $arr); - } - elsif($name =~ /beta$/) - { - $self->_init_beta($name, $arr); - } - elsif($name =~ /weight$/) - { - $self->_init_weight($name, $arr); - } - elsif($name =~ /moving_mean$/) - { - $self->_init_zero($name, $arr); - } - elsif($name =~ /moving_var$/) - { - $self->_init_one($name, $arr); - } - elsif($name =~ /moving_inv_var$/) - { - $self->_init_zero($name, $arr); - } - elsif($name =~ /moving_avg$/) - { - $self->_init_zero($name, $arr); - } - elsif($name =~ /min$/) - { - $self->_init_zero($name, $arr); - } - elsif($name =~ /max$/) - { - $self->_init_one($name, $arr); - } - else - { - $self->_init_default($name, $arr); - } -} - -*slice = *call; - -method _init_bilinear($name, $arr) -{ - my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{ 'float32' }); - my $weight = pzeros( - PDL::Type->new(DTYPE_MX_TO_PDL->{ 'float32' }), - $arr->size - ); - my $shape = $arr->shape; - my $size = $arr->size; - my $f = pceil($shape->[3] / 2)->at(0); - my $c = (2 * $f - 1 - $f % 2) / (2 * $f); - for my $i (0..($size-1)) - { - my $x = $i % $shape->[3]; - my $y = ($i / $shape->[3]) % $shape->[2]; - $weight->index($i) .= (1 - abs($x / $f - $c)) * (1 - abs($y / $f - $c)); - } - $arr .= $weight->reshape(reverse @{ $shape }); -} - -method _init_loc_bias($name, $arr) -{ - confess("assert error shape[0] == 6") - unless $arr->shape->[0] == 6; - $arr .= [1.0, 0, 0, 0, 1.0, 0]; -} - -method _init_zero($name, $arr) -{ - $arr .= 0; -} - -method _init_one($name, $arr) -{ - $arr .= 1; -} - -method _init_bias($name, $arr) -{ - $arr .= 0; -} - -method _init_gamma($name, $arr) -{ - $arr .= 1; -} - -method _init_beta($name, $arr) -{ - $arr .= 0; -} - -method _init_weight($name, $arr) -{ - confess("Virtual method, subclass must override it"); -} - -method _init_default($name, $arr) -{ - confess( - "Unknown initialization pattern for $name. " - .'Default initialization is now limited to ' - .'"weight", "bias", "gamma" (1.0), and "beta" (0.0).' - .'Please use mx.sym.Variable(init=mx.init.*) to set initialization pattern' - ); -} - -=head1 NAME - - AI::MXNet::Load - Initialize by loading a pretrained param from a hash ref. -=cut - -=head2 new - - Parameters - ---------- - param: HashRef[AI::MXNet::NDArray] - default_init: Initializer - default initializer when a name is not found in the param hash ref. - verbose: bool - log the names when initializing. 
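    For reference, a minimal usage sketch of the loader described above. The
    file name and parameter name are illustrative, and the snippet assumes a
    params file previously written with mx->nd->save (entries such as
    'arg:fc1_weight' have their 'arg:'/'aux:' prefixes stripped on load):

        use AI::MXNet qw(mx);
        my $init = AI::MXNet::Load->new(
            param        => mx->nd->load('model.params'),   # illustrative file name
            default_init => AI::MXNet::Uniform->new(0.07),  # fallback for unknown names
            verbose      => 1
        );
        # copied from the file if present, otherwise default-initialized
        $init->('fc1_weight', mx->nd->zeros([64, 128]));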
-=cut
-
-package AI::MXNet::Load;
-use Mouse;
-extends 'AI::MXNet::Initializer';
-
-has 'param'        => (is => "rw", isa => 'HashRef[AI::MXNet::NDArray]', required => 1);
-has 'default_init' => (is => "rw", isa => "AI::MXNet::Initializer");
-has 'verbose'      => (is => "rw", isa => "Int", default => 0);
-
-sub BUILD
-{
-    my $self = shift;
-    $self->param(AI::MXNet::NDArray->load($self->param)) unless ref $self->param;
-    my %self_param;
-    while(my ($name, $arr) = each %{ $self->param })
-    {
-        $name =~ s/^(?:arg|aux)://;
-        $self_param{ $name } = $arr;
-    }
-    $self->param(\%self_param);
-}
-
-method call(Str $name, AI::MXNet::NDArray $arr)
-{
-    if(exists $self->param->{ $name })
-    {
-        my $target_shape = join(',', @{ $arr->shape });
-        my $param_shape  = join(',', @{ $self->param->{ $name }->shape });
-        confess(
-            "Parameter $name cannot be initialized from loading. "
-            ."Shape mismatch, target $target_shape vs loaded $param_shape"
-        ) unless $target_shape eq $param_shape;
-        $arr .= $self->param->{ $name };
-        AI::MXNet::Logging->info("Initialized $name by loading") if $self->verbose;
-    }
-    else
-    {
-        confess(
-            "Cannot initialize $name. Not found in loaded param "
-            ."and no default Initializer is provided."
-        ) unless defined $self->default_init;
-        $self->default_init->($name, $arr);
-        AI::MXNet::Logging->info("Initialized $name by default") if $self->verbose;
-    }
-}
-
-*slice = *call;
-
-=head1 NAME
-
-    AI::MXNet::Mixed - A container with multiple initializer patterns.
-=cut
-
-=head2 new
-
-    patterns: array ref of str
-        array ref of regular expression patterns to match parameter names.
-    initializers: array ref of AI::MXNet::Initializer objects.
-        array ref of Initializers corresponding to the patterns.
-=cut
-
-package AI::MXNet::Mixed;
-use Mouse;
-extends 'AI::MXNet::Initializer';
-
-has "map"          => (is => "rw", init_arg => undef);
-has "patterns"     => (is => "ro", isa => 'ArrayRef[Str]');
-has "initializers" => (is => "ro", isa => 'ArrayRef[AI::MXNet::Initializer]');
-
-sub BUILD
-{
-    my $self = shift;
-    confess("patterns count != initializers count")
-        unless (@{ $self->patterns } == @{ $self->initializers });
-    my %map;
-    @map{ @{ $self->patterns } } = @{ $self->initializers };
-    $self->map(\%map);
-}
-
-method call(Str $name, AI::MXNet::NDArray $arr)
-{
-    for my $pattern (keys %{ $self->map })
-    {
-        if($name =~ /$pattern/)
-        {
-            $self->map->{$pattern}->($name, $arr);
-            return;
-        }
-    }
-    confess(
-        "Parameter name $name did not match any pattern. Consider "
-        ."adding a \".*\" pattern at the end with a default Initializer."
-    );
-}
-
-package AI::MXNet::Zero;
-use Mouse;
-extends 'AI::MXNet::Initializer';
-method _init_weight(Str $name, AI::MXNet::NDArray $arr)
-{
-    $arr .= 0;
-}
-
-__PACKAGE__->register;
-
-package AI::MXNet::Zeros;
-use Mouse;
-extends 'AI::MXNet::Zero';
-
-__PACKAGE__->register;
-
-package AI::MXNet::One;
-use Mouse;
-extends 'AI::MXNet::Initializer';
-method _init_weight(Str $name, AI::MXNet::NDArray $arr)
-{
-    $arr .= 1;
-}
-
-__PACKAGE__->register;
-
-package AI::MXNet::Ones;
-use Mouse;
-extends 'AI::MXNet::One';
-
-__PACKAGE__->register;
-
-package AI::MXNet::Constant;
-use Mouse;
-extends 'AI::MXNet::Initializer';
-has 'value' => (is => 'ro', isa => 'Num', required => 1);
-around BUILDARGS => sub {
-    my $orig  = shift;
-    my $class = shift;
-    return $class->$orig(value => $_[0]) if @_ == 1;
-    return $class->$orig(@_);
-};
-
-method _init_weight(Str $name, AI::MXNet::NDArray $arr)
-{
-    $arr .= $self->value;
-}
-
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::Uniform - Initialize the weight with uniform random values.
-=cut
-
-=head1 DESCRIPTION
-
-    Initializes the weight with random values uniformly sampled from [-scale, scale].
-
-    Parameters
-    ----------
-    scale : float, optional
-        The scale of the uniform distribution.
-=cut
-
-package AI::MXNet::Uniform;
-use Mouse;
-extends 'AI::MXNet::Initializer';
-has "scale" => (is => "ro", isa => "Num", default => 0.07);
-around BUILDARGS => sub {
-    my $orig  = shift;
-    my $class = shift;
-    return $class->$orig(scale => $_[0]) if @_ == 1;
-    return $class->$orig(@_);
-};
-
-method _init_weight(Str $name, AI::MXNet::NDArray $arr)
-{
-    AI::MXNet::Random->uniform(-$self->scale, $self->scale, { out => $arr });
-}
-
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::Normal - Initialize the weight with Gaussian random values.
-=cut
-
-=head1 DESCRIPTION
-
-    Initializes the weight with random values sampled from a Gaussian distribution
-    with a mean of zero and a standard deviation of sigma.
-
-    Parameters
-    ----------
-    sigma : float, optional
-        Standard deviation of the Gaussian distribution.
-=cut
-
-package AI::MXNet::Normal;
-use Mouse;
-extends 'AI::MXNet::Initializer';
-has "sigma" => (is => "ro", isa => "Num", default => 0.01);
-around BUILDARGS => sub {
-    my $orig  = shift;
-    my $class = shift;
-    return $class->$orig(sigma => $_[0]) if @_ == 1;
-    return $class->$orig(@_);
-};
-
-method _init_weight(Str $name, AI::MXNet::NDArray $arr)
-{
-    AI::MXNet::Random->normal(0, $self->sigma, { out => $arr });
-}
-
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::Orthogonal - Initialize the weight as an orthogonal matrix.
-=cut
-
-=head1 DESCRIPTION
-
-    Initializes the weight as an orthogonal matrix.
-
-    Parameters
-    ----------
-    scale : float, optional
-        scaling factor of the weight
-
-    rand_type: string, optional
-        use "uniform" or "normal" random numbers to initialize the weight
-
-    Reference
-    ---------
-    Exact solutions to the nonlinear dynamics of learning in deep linear neural networks
-    arXiv preprint arXiv:1312.6120 (2013).
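    A small usage sketch of this initializer, based only on the class as shown
    below (the parameter name 'proj_weight' is illustrative):

        # fill a square weight with a (scaled) orthogonal matrix
        my $init = AI::MXNet::Orthogonal->new(scale => 1.0, rand_type => 'normal');
        my $w    = AI::MXNet::NDArray->zeros([4, 4]);
        $init->(AI::MXNet::InitDesc->new('proj_weight'), $w);
        # rows of $w are now approximately orthonormal, times 'scale'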
-=cut
-
-package AI::MXNet::Orthogonal;
-use AI::MXNet::Base;
-use Mouse;
-use AI::MXNet::Types;
-extends 'AI::MXNet::Initializer';
-has "scale"     => (is => "ro", isa => "Num", default => 1.414);
-has "rand_type" => (is => "ro", isa => enum([qw/uniform normal/]), default => 'uniform');
-
-method _init_weight(Str $name, AI::MXNet::NDArray $arr)
-{
-    my @shape = @{ $arr->shape };
-    my $nout = $shape[0];
-    my $nin  = AI::MXNet::NDArray->size([@shape[1..$#shape]]);
-    my $tmp  = AI::MXNet::NDArray->zeros([$nout, $nin]);
-    if($self->rand_type eq 'uniform')
-    {
-        AI::MXNet::Random->uniform(-1, 1, { out => $tmp });
-    }
-    else
-    {
-        AI::MXNet::Random->normal(0, 1, { out => $tmp });
-    }
-    $tmp = $tmp->aspdl;
-    my ($u, $s, $v) = svd($tmp);
-    my $q;
-    if(join(',', @{ $u->shape->unpdl }) eq join(',', @{ $tmp->shape->unpdl }))
-    {
-        $q = $u;
-    }
-    else
-    {
-        $q = $v;
-    }
-    $q = $self->scale * $q->reshape(reverse(@shape));
-    $arr .= $q;
-}
-
-*slice = *call;
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::Xavier - Initialize the weight with Xavier or similar initialization scheme.
-=cut
-
-=head1 DESCRIPTION
-
-    Parameters
-    ----------
-    rnd_type: str, optional
-        Use gaussian or uniform.
-    factor_type: str, optional
-        Use avg, in, or out.
-    magnitude: float, optional
-        The scale of the random number range.
-=cut
-
-package AI::MXNet::Xavier;
-use Mouse;
-use AI::MXNet::Types;
-extends 'AI::MXNet::Initializer';
-has "magnitude"   => (is => "rw", isa => "Num", default => 3);
-has "rnd_type"    => (is => "ro", isa => enum([qw/uniform gaussian/]), default => 'uniform');
-has "factor_type" => (is => "ro", isa => enum([qw/avg in out/]), default => 'avg');
-
-method _init_weight(Str $name, AI::MXNet::NDArray $arr)
-{
-    my @shape = @{ $arr->shape };
-    confess(__PACKAGE__." initializer cannot be applied to a tensor with fewer than 2 dimensions")
-        if @shape < 2;
-    my $hw_scale = 1;
-    if(@shape > 2)
-    {
-        $hw_scale = AI::MXNet::NDArray->size([@shape[2..$#shape]]);
-    }
-    my ($fan_in, $fan_out) = ($shape[1] * $hw_scale, $shape[0] * $hw_scale);
-    my $factor;
-    if($self->factor_type eq "avg")
-    {
-        $factor = ($fan_in + $fan_out) / 2;
-    }
-    elsif($self->factor_type eq "in")
-    {
-        $factor = $fan_in;
-    }
-    else
-    {
-        $factor = $fan_out;
-    }
-    my $scale = sqrt($self->magnitude / $factor);
-    if($self->rnd_type eq "uniform")
-    {
-        AI::MXNet::Random->uniform(-$scale, $scale, { out => $arr });
-    }
-    else
-    {
-        AI::MXNet::Random->normal(0, $scale, { out => $arr });
-    }
-}
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::MSRAPrelu - Custom initialization scheme.
-=cut
-
-=head1 DESCRIPTION
-
-    Initializes the weight with the initialization scheme from
-    Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification.
-
-    Parameters
-    ----------
-    factor_type: str, optional
-        Use avg, in, or out.
-    slope: float, optional
-        initial slope of any PReLU (or similar) nonlinearities.
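    To make the two schemes concrete, the scale arithmetic works out as
    follows (shapes and values are illustrative):

        # Xavier with the defaults (factor_type 'avg', magnitude 3) on a
        # 2D weight of shape [128, 256], i.e. fan_out 128, fan_in 256:
        my $factor = (256 + 128) / 2;      # avg -> 192
        my $scale  = sqrt(3 / $factor);    # 0.125, so weights ~ U(-0.125, 0.125)
        # MSRAPrelu (below) only swaps in magnitude = 2 / (1 + slope**2);
        # the default slope of 0.25 gives ~1.88, sampled as gaussian.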
-=cut - -package AI::MXNet::MSRAPrelu; -use Mouse; -extends 'AI::MXNet::Xavier'; - -has '+rnd_type' => (default => "gaussian"); -has '+factor_type' => (default => "avg"); -has 'slope' => (is => 'ro', isa => 'Num', default => 0.25); - -sub BUILD -{ - my $self = shift; - my $magnitude = 2 / (1 + $self->slope ** 2); - $self->magnitude($magnitude); - $self->kwargs({ slope => $self->slope, factor_type => $self->factor_type }); -} -__PACKAGE__->register; - -package AI::MXNet::Bilinear; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::Initializer'; - -method _init_weight($name, $arr) -{ - my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{ 'float32' }); - my $weight = pzeros( - PDL::Type->new(DTYPE_MX_TO_PDL->{ 'float32' }), - $arr->size - ); - my $shape = $arr->shape; - my $size = $arr->size; - my $f = pceil($shape->[3] / 2)->at(0); - my $c = (2 * $f - 1 - $f % 2) / (2 * $f); - for my $i (0..($size-1)) - { - my $x = $i % $shape->[3]; - my $y = ($i / $shape->[3]) % $shape->[2]; - $weight->index($i) .= (1 - abs($x / $f - $c)) * (1 - abs($y / $f - $c)); - } - $arr .= $weight->reshape(reverse @{ $shape }); -} - -__PACKAGE__->register; - -package AI::MXNet::LSTMBias; - -=head1 NAME - - AI::MXNet::LSTMBias - Custom initializer for LSTM cells. -=cut - -=head1 DESCRIPTION - - Initializes all biases of an LSTMCell to 0.0 except for - the forget gate's bias that is set to a custom value. - - Parameters - ---------- - forget_bias: float,a bias for the forget gate. - Jozefowicz et al. 2015 recommends setting this to 1.0. -=cut - -use Mouse; -extends 'AI::MXNet::Initializer'; -has 'forget_bias' => (is => 'ro', isa => 'Num', required => 1); -around BUILDARGS => \&AI::MXNet::Base::process_arguments; -method python_constructor_arguments() { ['forget_bias'] } - -method _init_weight(Str $name, AI::MXNet::NDArray $arr) -{ - $arr .= 0; - # in the case of LSTMCell the forget gate is the second - # gate of the 4 LSTM gates, we modify the according values. - my $num_hidden = int($arr->shape->[0] / 4); - $arr->slice([$num_hidden, 2*$num_hidden-1]) .= $self->forget_bias; -} - -__PACKAGE__->register; - -package AI::MXNet::FusedRNN; -use Mouse; -use JSON::PP; -extends 'AI::MXNet::Initializer'; - -=head1 NAME - - AI::MXNet::FusedRNN - Custom initializer for fused RNN cells. -=cut - -=head1 DESCRIPTION - - Initializes parameters for fused rnn layer. - - Parameters - ---------- - init : Initializer - initializer applied to unpacked weights. - All parameters below must be exactly the same as ones passed to the - FusedRNNCell constructor. 
- - num_hidden : int - num_layers : int - mode : str - bidirectional : bool - forget_bias : float -=cut - -has 'init' => (is => 'rw', isa => 'Str|AI::MXNet::Initializer', required => 1); -has 'forget_bias' => (is => 'ro', isa => 'Num', default => 1); -has [qw/num_hidden - num_layers/] => (is => 'ro', isa => 'Int', required => 1); -has 'mode' => (is => 'ro', isa => 'Str', required => 1); -has 'bidirectional' => (is => 'ro', isa => 'Bool', default => 0); - -sub BUILD -{ - my $self = shift; - if(not blessed $self->init) - { - my ($klass, $kwargs); - eval { - ($klass, $kwargs) = @{ decode_json($self->init) }; - }; - confess("FusedRNN failed to init $@") if $@; - $self->init($self->get_init_registry->{ lc $klass }->new(%$kwargs)); - } -} - -method _init_weight($name, $arr) -{ - my $cell = AI::MXNet::RNN::FusedCell->new( - num_hidden => $self->num_hidden, - num_layers => $self->num_layers, - mode => $self->mode, - bidirectional => $self->bidirectional, - forget_bias => $self->forget_bias, - prefix => '' - ); - - my $args = $cell->unpack_weights({ parameters => $arr }); - for my $name (keys %{ $args }) - { - my $desc = AI::MXNet::InitDesc->new(name => $name); - # for lstm bias, we use a custom initializer - # which adds a bias to the forget gate - if($self->mode eq 'lstm' and $name =~ /f_bias$/) - { - $args->{$name} .= $self->forget_bias; - } - else - { - $self->init->($desc, $args->{$name}); - } - } - - $arr .= $cell->pack_weights($args)->{parameters}; -} - -__PACKAGE__->register; - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/KVStore.pm b/perl-package/AI-MXNet/lib/AI/MXNet/KVStore.pm deleted file mode 100644 index 259ad0df5191..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/KVStore.pm +++ /dev/null @@ -1,656 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::KVStore; -use strict; -use warnings; -use AI::MXNet::NS; -use AI::MXNet::Base; -use AI::MXNet::NDArray; -use AI::MXNet::Optimizer; -use MIME::Base64; -use Storable; -use Mouse; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::KVStore - Key value store interface of MXNet. - -=head1 DESCRIPTION - - Key value store interface of MXNet for parameter synchronization, over multiple devices. -=cut - -has 'handle' => (is => 'ro', isa => 'KVStoreHandle', required => 1); -has '_updater' => (is => 'rw', isa => 'AI::MXNet::Updater'); - -sub DEMOLISH -{ - check_call(AI::MXNetCAPI::KVStoreFree(shift->handle)); -} - -=head2 init - - Initialize a single or a sequence of key-value pairs into the store. - For each key, one must init it before push and pull. - Only worker 0's (rank == 0) data are used. - This function returns after data have been initialized successfully - - Parameters - ---------- - $key : Str|ArrayRef[Str] - The keys. 
-    $value : AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]]
-        The values.
-
-    Examples
-    --------
-    >>> # init a single key-value pair
-    >>> $shape = [2,3]
-    >>> $kv = mx->kv->create('local')
-    >>> $kv->init(3, mx->nd->ones($shape)*2)
-    >>> $a = mx->nd->zeros($shape)
-    >>> $kv->pull(3, out=>$a)
-    >>> print $a->aspdl
-    [[ 2 2 2]
-    [ 2 2 2]]
-
-    >>> # init a list of key-value pairs
-    >>> $keys = [5, 7, 9]
-    >>> $kv->init($keys, [map { mx->nd->ones($shape) } 0..@$keys-1])
-=cut
-
-method init(
-    Str|ArrayRef[Str] $key,
-    AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]] $value
-)
-{
-    my ($keys, $vals) = _key_value($key, $value);
-    check_call(
-        AI::MXNetCAPI::KVStoreInitEx(
-            $self->handle, scalar(@{ $keys }), $keys, $vals
-        )
-    );
-}
-
-=head2 push
-
-    Push a single or a sequence of key-value pairs into the store.
-    Data consistency:
-    1. this function returns after adding an operator to the engine.
-    2. push is always called after all previous push and pull on the same
-       key are finished.
-    3. there is no synchronization between workers. One can use _barrier()
-       to sync all workers.
-
-    Parameters
-    ----------
-    $key : Str|ArrayRef[Str]
-    $value : AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]]
-    :$priority=0 : Int, optional
-        The priority of the push operation.
-        The higher the priority, the faster this action is likely
-        to be executed before other push actions.
-
-    Examples
-    --------
-    >>> # push a single key-value pair
-    >>> $kv->push(3, mx->nd->ones($shape)*8)
-    >>> $kv->pull(3, out=>$a) # pull out the value
-    >>> print $a->aspdl()
-    [[ 8. 8. 8.]
-    [ 8. 8. 8.]]
-
-    >>> # aggregate the values and then push
-    >>> $gpus = [map { mx->gpu($_) } 0..3]
-    >>> $b = [map { mx->nd->ones($shape, ctx => $_) } @$gpus]
-    >>> $kv->push(3, $b)
-    >>> $kv->pull(3, out=>$a)
-    >>> print $a->aspdl
-    [[ 4. 4. 4.]
-    [ 4. 4. 4.]]
-
-    >>> # push a list of keys.
-    >>> # single device
-    >>> $kv->push($keys, [map { mx->nd->ones($shape) } 0..@$keys-1])
-    >>> $b = [map { mx->nd->zeros($shape) } 0..@$keys-1]
-    >>> $kv->pull($keys, out=>$b)
-    >>> print $b->[1]->aspdl
-    [[ 1. 1. 1.]
-    [ 1. 1. 1.]]
-
-    >>> # multiple devices:
-    >>> $b = [map { [map { mx->nd->ones($shape, ctx => $_) } @$gpus] } 0..@$keys-1]
-    >>> $kv->push($keys, $b)
-    >>> $kv->pull($keys, out=>$b)
-    >>> print $b->[1][1]->aspdl()
-    [[ 4. 4. 4.]
-    [ 4. 4. 4.]]
-=cut
-
-method push(
-    Str|ArrayRef[Str] $key,
-    AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]] $value,
-    Int :$priority=0
-)
-{
-    my ($keys, $vals) = _key_value($key, $value);
-    check_call(
-        AI::MXNetCAPI::KVStorePushEx(
-            $self->handle, scalar(@{ $keys }), $keys, $vals, $priority
-        )
-    );
-}
-
-=head2 pull
-
-    Pull a single value or a sequence of values from the store.
-
-    Data consistency:
-
-    1. this function returns after adding an operator to the engine. But any
-       further read on out will be blocked until it is finished.
-    2. pull is always called after all previous push and pull on the same
-       key are finished.
-    3. It pulls the newest value from the store.
-
-    Parameters
-    ----------
-    $key : Str|ArrayRef[Str]
-        Keys
-    :$out: AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]]
-        Corresponding values
-
-    :$priority=0 : Int, optional
-        The priority of the pull operation.
-        The higher the priority, the faster this action is likely
-        to be executed before other pull actions.
- - Examples - -------- - >>> # pull a single key-value pair - >>> $a = mx->nd->zeros($shape) - >>> $kv->pull(3, out=>$a) - >>> print $a->aspdl - [[ 2. 2. 2.] - [ 2. 2. 2.]] - - >>> # pull into multiple devices - >>> $b = [map { mx->nd->ones($shape, $_) } @$gpus] - >>> $kv->pull(3, out=>$b) - >>> print $b->[1]->aspdl() - [[ 2. 2. 2.] - [ 2. 2. 2.]] - - >>> # pull a list of key-value pairs. - >>> # On single device - >>> $keys = [5, 7, 9] - >>> $b = [map { mx->nd->zeros($shape) } 0..@$keys-1] - >>> $kv->pull($keys, out=>$b) - >>> print $b->[1]->aspdl() - [[ 2. 2. 2.] - [ 2. 2. 2.]] - >>> # On multiple devices - >>> $b = [map { [map { mx->nd->ones($shape, ctx => $_) } @$gpus ] } 0..@$keys-1] - >>> $kv->pull($keys, out=>$b) - >>> print $b->[1][1]->aspdl() - [[ 2. 2. 2.] - [ 2. 2. 2.]] -=cut - -method pull( - Str|ArrayRef[Str] $key, - AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]] :$out, - Int :$priority=0, - Bool :$ignore_sparse=1 -) -{ - my ($keys, $vals) = _key_value($key, $out); - check_call( - AI::MXNetCAPI::KVStorePullWithSparseEx( - $self->handle, scalar(@{ $keys }), $keys, $vals, $priority, $ignore_sparse - ) - ); -} - -=head2 row_sparse_pull - - Pulls a single AI::MXNet::NDArray::RowSparse value or an array ref of AI::MXNet::NDArray::RowSparse values - from the store with specified row_ids. When there is only one row_id, KVStoreRowSparsePull - is invoked just once and the result is broadcast to all the rest of outputs. - - `row_sparse_pull` is executed asynchronously after all previous - `pull`/`row_sparse_pull` calls and the last `push` call for the - same input key(s) are finished. - - The returned values are guaranteed to be the latest values in the store. - - Parameters - ---------- - $key : Str|ArrayRef[Str] $key - Keys. - - :$out: AI::MXNet::NDArray::RowSparse|ArrayRef[AI::MXNet::NDArray::RowSparse]|ArrayRef[ArrayRef[AI::MXNet::NDArray::RowSparse]] - Values corresponding to the keys. The stype is expected to be row_sparse - - :$priority=0 : Int, optional - The priority of the pull operation. - Higher priority pull operations are likely to be executed before - other pull actions. - - :$row_ids : AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]] - The row_ids for which to pull for each value. Each row_id is an 1D NDArray - whose values don't have to be unique nor sorted. - - Examples - -------- - >>> $shape = [3, 3] - >>> $kv->init('3', mx->nd->ones($shape)->tostype('row_sparse')) - >>> $a = mx->nd->sparse->zeros('row_sparse', $shape) - >>> $row_ids = mx->nd->array([0, 2], dtype=>'int64') - >>> $kv->row_sparse_pull('3', out=>$a, row_ids=>$row_ids) - >>> print $a->aspdl - [[ 1. 1. 1.] - [ 0. 0. 0.] - [ 1. 1. 1.]] - >>> $duplicate_row_ids = mx->nd->array([2, 2], dtype=>'int64') - >>> $kv->row_sparse_pull('3', out=>$a, row_ids=>$duplicate_row_ids) - >>> print $a->aspdl - [[ 0. 0. 0.] - [ 0. 0. 0.] - [ 1. 1. 1.]] - >>> $unsorted_row_ids = mx->nd->array([1, 0], dtype=>'int64') - >>> $kv->row_sparse_pull('3', out=>$a, row_ids=>$unsorted_row_ids) - >>> print $a->aspdl - [[ 1. 1. 1.] - [ 1. 1. 1.] - [ 0. 0. 
0.]] -=cut - - -method row_sparse_pull( - Str|ArrayRef[Str] $key, - AI::MXNet::NDArray::RowSparse|ArrayRef[AI::MXNet::NDArray::RowSparse]|ArrayRef[ArrayRef[AI::MXNet::NDArray::RowSparse]] :$out, - Int :$priority=0, - AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]] :$row_ids -) -{ - if(blessed $row_ids) - { - $row_ids = [$row_ids]; - } - my $first_out = $out; - # whether row_ids are the same - my $single_rowid = 0; - if(@$row_ids == 1 and ref $out eq 'ARRAY') - { - $single_rowid = 1; - $first_out = [$out->[0]]; - } - my ($ckeys, $cvals) = _key_value($key, $first_out); - my (undef, $crow_ids) = _key_value($key, $row_ids); - assert( - (@$crow_ids == @$cvals), - "the number of row_ids doesn't match the number of values" - ); - check_call( - AI::MXNetCAPI::KVStorePullRowSparseEx( - $self->handle, scalar(@$ckeys), $ckeys, $cvals, $crow_ids, $priority - ) - ); - # the result can be copied to other devices without invoking row_sparse_pull - # if the indices are the same - if($single_rowid) - { - for my $out_i (@{ $out } [1..@{ $out }-1]) - { - $out->[0]->copyto($out_i); - } - } -} - -=head2 set_gradient_compression - - Specifies type of low-bit quantization for gradient compression \ - and additional arguments depending on the type of compression being used. - - 2bit Gradient Compression takes a positive float `threshold`. - The technique works by thresholding values such that positive values in the - gradient above threshold will be set to threshold. Negative values whose absolute - values are higher than threshold, will be set to the negative of threshold. - Values whose absolute values are less than threshold will be set to 0. - By doing so, each value in the gradient is in one of three states. 2bits are - used to represent these states, and every 16 float values in the original - gradient can be represented using one float. This compressed representation - can reduce communication costs. The difference between these thresholded values and - original values is stored at the sender's end as residual and added to the - gradient in the next iteration. - - When kvstore is 'local', gradient compression is used to reduce communication - between multiple devices (gpus). Gradient is quantized on each GPU which - computed the gradients, then sent to the GPU which merges the gradients. This - receiving GPU dequantizes the gradients and merges them. Note that this - increases memory usage on each GPU because of the residual array stored. - - When kvstore is 'dist', gradient compression is used to reduce communication - from worker to sender. Gradient is quantized on each worker which - computed the gradients, then sent to the server which dequantizes - this data and merges the gradients from each worker. Note that this - increases CPU memory usage on each worker because of the residual array stored. - Only worker to server communication is compressed in this setting. - If each machine has multiple GPUs, currently this GPU to GPU or GPU to CPU communication - is not compressed. Server to worker communication (in the case of pull) - is also not compressed. - - To use 2bit compression, we need to specify `type` as `2bit`. - Only specifying `type` would use default value for the threshold. 
- To completely specify the arguments for 2bit compression, we would need to pass - a dictionary which includes `threshold` like: - {'type': '2bit', 'threshold': 0.5} - - Parameters - ---------- - $compression_params : HashRef[Str] - A dictionary specifying the type and parameters for gradient compression. - The key `type` in this dictionary is a - required string argument and specifies the type of gradient compression. - Currently `type` can be only `2bit` - Other keys in this dictionary are optional and specific to the type - of gradient compression. -=cut - -method set_gradient_compression(HashRef[Str] $compression_params) -{ - if($self->type =~ /(?:device|dist)/) - { - check_call( - AI::MXNetCAPI::KVStoreSetGradientCompression( - $self->handle, - scalar(keys %$compression_params), - $compression_params - ) - ); - } - else - { - confess('Gradient compression is not supported for this type of kvstore'); - } -} - -=head2 set_optimizer - - Register an optimizer to the store - - If there are multiple machines, this process (should be a worker node) - will pack this optimizer and send it to all servers. It returns after - this action is done. - - Parameters - ---------- - $optimizer : AI::MXNet::Optimizer - the optimizer -=cut - -method set_optimizer(AI::MXNet::Optimizer $optimizer) -{ - my $is_worker = check_call(AI::MXNetCAPI::KVStoreIsWorkerNode()); - if($self->type =~ /dist/ and $is_worker) - { - my $optim_str = MIME::Base64::encode_base64(Storable::freeze($optimizer), ""); - $self->_send_command_to_servers(0, $optim_str); - } - else - { - $self->_updater(AI::MXNet::Optimizer->get_updater($optimizer)); - $self->_set_updater($self->_updater); - } -} - -=head2 type - - Get the type of this kvstore - - Returns - ------- - $type : Str - the string type -=cut - -method type() -{ - return scalar(check_call(AI::MXNetCAPI::KVStoreGetType($self->handle))); -} - -=head2 rank - - Get the rank of this worker node - - Returns - ------- - $rank : Int - The rank of this node, which is in [0, get_num_workers()) -=cut - -method rank() -{ - return scalar(check_call(AI::MXNetCAPI::KVStoreGetRank($self->handle))); -} - -=head2 num_workers - - Get the number of worker nodes - - Returns - ------- - $size : Int - The number of worker nodes -=cut - -method num_workers() -{ - return scalar(check_call(AI::MXNetCAPI::KVStoreGetGroupSize($self->handle))); -} - -=head2 save_optimizer_states - - Save optimizer (updater) state to file - - Parameters - ---------- - $fname : Str - Path to output states file. - :$dump_optimizer=0 : Bool, default False - Whether to also save the optimizer itself. This would also save optimizer - information such as learning rate and weight decay schedules. -=cut - -method save_optimizer_states(Str $fname, Bool :$dump_optimizer=0) -{ - confess("Cannot save states for distributed training") - unless defined $self->_updater; - open(F, ">:raw", "$fname") or confess("can't open $fname for writing: $!"); - print F $self->_updater->get_states($dump_optimizer); - close(F); -} - -=head2 load_optimizer_states - - Load optimizer (updater) state from file. - - Parameters - ---------- - $fname : Str - Path to input states file. 
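    A rough end-to-end sketch of the pieces documented above. The file name is
    illustrative; it assumes the usual AI::MXNet::Optimizer->create factory and
    a 'device' store, since set_gradient_compression rejects a plain 'local'
    kvstore:

        my $kv = mx->kv->create('device');
        $kv->set_gradient_compression({ type => '2bit', threshold => 0.5 });
        $kv->set_optimizer(AI::MXNet::Optimizer->create('sgd', learning_rate => 0.1));
        # ... training pushes/pulls happen here ...
        $kv->save_optimizer_states('model.states');   # and later:
        $kv->load_optimizer_states('model.states');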
-=cut
-
-method load_optimizer_states(Str $fname)
-{
-    confess("Cannot load states for distributed training")
-        unless defined $self->_updater;
-    open(F, "<:raw", "$fname") or confess("can't open $fname for reading: $!");
-    my $data;
-    { local($/) = undef; $data = <F>; }
-    close(F);
-    $self->_updater->set_states($data);
-}
-
-=head2 _set_updater
-
-    Set a push updater into the store.
-
-    This function only changes the local store. Use set_optimizer for
-    multi-machines.
-
-    Parameters
-    ----------
-    $updater : Updater
-        the updater function
-
-    Examples
-    --------
-    >>> my $update = sub { my ($key, $input, $stored) = @_;
-    ...    print "update on key: $key\n";
-    ...    $stored += $input * 2; };
-    >>> $kv->_set_updater($update)
-    >>> $kv->pull(3, out=>$a)
-    >>> print $a->aspdl()
-    [[ 4. 4. 4.]
-    [ 4. 4. 4.]]
-    >>> $kv->push(3, mx->nd->ones($shape))
-    update on key: 3
-    >>> $kv->pull(3, out=>$a)
-    >>> print $a->aspdl()
-    [[ 6. 6. 6.]
-    [ 6. 6. 6.]]
-=cut
-
-method _set_updater(Updater $updater_func)
-{
-    check_call(
-        AI::MXNetCAPI::KVStoreSetUpdater(
-            $self->handle,
-            sub {
-                my ($index, $input_handle, $storage_handle) = @_;
-                $updater_func->(
-                    $index,
-                    AI::MXNet::NDArray->_ndarray_cls($input_handle),
-                    AI::MXNet::NDArray->_ndarray_cls($storage_handle)
-                );
-            }
-        )
-    );
-}
-
-=head2 _barrier
-
-    Global barrier between all worker nodes.
-
-    For example, assume there are n machines. We want to let machine 0 first
-    init the values and then pull the initialized values to all machines.
-    Before pulling, we can place a barrier to guarantee that the
-    initialization is finished.
-=cut
-
-method _barrier()
-{
-    check_call(AI::MXNetCAPI::KVStoreBarrier($self->handle));
-}
-
-=head2 _send_command_to_servers
-
-    Send a command to all server nodes, which makes each server node run
-    KVStoreServer.controller.
-    This function returns after the command has been executed on all server
-    nodes.
-
-    Parameters
-    ----------
-    $head : Int
-        the head of the command
-    $body : Str
-        the body of the command
-=cut
-
-method _send_command_to_servers(Int $head, Str $body)
-{
-    check_call(
-        AI::MXNetCAPI::KVStoreSendCommmandToServers(
-            $self->handle,
-            $head,
-            $body
-        )
-    );
-}
-
-=head2 create
-
-    Create a new KVStore.
- - Parameters - ---------- - $name='local' : Str - The type of KVStore - - local works for multiple devices on a single machine (single process) - - dist works for multi-machines (multiple processes) - Returns - ------- - kv : KVStore - The created AI::MXNet::KVStore -=cut - -method create(Str $name='local') -{ - my $handle = check_call(AI::MXNetCAPI::KVStoreCreate($name)); - return __PACKAGE__->new(handle => $handle); -} - -sub _key_value -{ - my ($keys, $vals) = @_; - if(not ref $keys) - { - if(blessed $vals) - { - return ([$keys], [$vals->handle]); - } - else - { - for my $value (@{ $vals }) - { - assert(blessed($value) and $value->isa('AI::MXNet::NDArray')); - return ([($keys)x@$vals], [map { $_->handle } @$vals]); - } - } - } - else - { - assert(not blessed($vals) and @$keys == @$vals); - my @c_keys; - my @c_vals; - for(zip($keys, $vals)) { - my ($key, $val) = @$_; - my ($c_key, $c_val) = _key_value($key, $val); - push @c_keys, @$c_key; - push @c_vals, @$c_val; - } - return (\@c_keys, \@c_vals); - } -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/KVStoreServer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/KVStoreServer.pm deleted file mode 100644 index 39e152a6d641..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/KVStoreServer.pm +++ /dev/null @@ -1,98 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::KVStoreServer; -use strict; -use warnings; -use AI::MXNet::Base; -use AI::MXNet::KVStore; -use Storable; -use MIME::Base64; -use Mouse; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::KVStoreServer - The key-value store server. -=cut - -=head2 new - - Initialize a new KVStoreServer. - - Parameters - ---------- - kvstore : AI::MXNet::KVStore -=cut - -has 'kvstore' => (is => 'ro', isa => 'AI::MXNet::KVStore', required => 1); -has 'handle' => (is => 'ro', isa => 'KVStoreHandle', default => sub { shift->kvstore->handle }, lazy => 1); -has 'init_logging' => (is => 'rw', isa => 'Int', default => 0); - - -# return the server controller -method _controller() -{ - return sub { - my ($cmd_id, $cmd_body) = @_; - if (not $self->init_logging) - { - ## TODO write logging - $self->init_logging(1); - } - if($cmd_id == 0) - { - my $optimizer = Storable::thaw(MIME::Base64::decode_base64($cmd_body)); - $self->kvstore->set_optimizer($optimizer); - } - else - { - my $rank = $self->kvstore->rank; - print("server $rank, unknown command ($cmd_id, $cmd_body)\n"); - } - } -} - -=head2 run - - run the server, whose behavior is like - >>> while receive(x): - ... if is_command x: controller(x) - ... 
else if is_key_value x: updater(x) -=cut - -method run() -{ - check_call(AI::MXNetCAPI::KVStoreRunServer($self->handle, $self->_controller)); -} - -# Start server/scheduler -func _init_kvstore_server_module() -{ - my $is_worker = check_call(AI::MXNetCAPI::KVStoreIsWorkerNode()); - if($is_worker == 0) - { - my $kvstore = AI::MXNet::KVStore->create('dist'); - my $server = __PACKAGE__->new(kvstore => $kvstore); - $server->run(); - exit(0); - } -} - -_init_kvstore_server_module(); - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/LRScheduler.pm b/perl-package/AI-MXNet/lib/AI/MXNet/LRScheduler.pm deleted file mode 100644 index 5575e37f75fe..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/LRScheduler.pm +++ /dev/null @@ -1,193 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::LRScheduler; -use strict; -use warnings; -use Mouse; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Logging; -use overload "&{}" => sub { my $self = shift; sub { $self->call(@_) } }, - fallback => 1; - -=head1 NAME - - AI::MXNet::LRScheduler - The adaptive scheduler of the learning rate. -=cut - -=head1 DESCRIPTION - - Learning rate scheduler, which adaptively changes the learning rate based on the - progress. -=cut - -=head2 new - - base_lr : float (optional, default 0.01) - the initial learning rate -=cut - -has 'base_lr' => (is => 'rw', isa => 'Num', default => 0.01); - -=head2 call - - Call to schedule current learning rate - - The training progress is presented by num_update, which can be roughly - viewed as the number of minibatches executed so far. Its value is - non-decreasing, and increases at most by one. - - The exact value is the upper bound of the number of updates applied to - a weight/index - - See more details in https://github.com/dmlc/mxnet/issues/625 - - Parameters - ---------- - $num_update: Int - the maximal number of updates applied to a weight. -=cut - -package AI::MXNet::FactorScheduler; - -=head1 NAME - - AI::MXNet::FactorScheduler - Reduces the learning rate by a factor. - -=head1 DESCRIPTION - - Reduces the learning rate by a factor each step. - Assume the weight has been updated by n times, then the learning rate will - be base_lr * factor^(floor(n/step)) - - Parameters - ---------- - step: Int - schedule the learning rate update after n updates - factor: Num - the factor by which to reduce the learning rate. 
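    A worked example of the decay rule above, using only the class as shown
    below (values are illustrative):

        # base_lr 0.1 halved every 3000 updates
        my $sched = AI::MXNet::FactorScheduler->new(
            base_lr => 0.1, step => 3000, factor => 0.5
        );
        my $lr = $sched->(7500);   # 0.1 * 0.5**floor(7500/3000) = 0.025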
-=cut -use Mouse; -extends 'AI::MXNet::LRScheduler'; - -has 'step' => (is => 'ro', isa => 'Int', required => 1); -has 'factor' => (is => 'ro', isa => 'Num', default => 1); -has 'count' => (is => 'rw', isa => 'Int', default => 1); -has 'stop_factor_lr' => (is => 'ro', isa => 'Num', default => 1e-8); - -sub BUILD -{ - my $self = shift; - confess("Schedule step must be greater or equal than 1") - if $self->step < 1; - confess("Factor must be no more than 1 to make lr reduce") - if $self->factor > 1; -} - -method call(Int $num_update) -{ - # NOTE: use while rather than if (for continuing training via load_epoch) - while($num_update > $self->count + $self->step) - { - $self->count($self->count + $self->step); - $self->base_lr($self->base_lr * $self->factor); - if($self->base_lr < $self->stop_factor_lr) - { - $self->base_lr($self->stop_factor_lr); - AI::MXNet::Logging->info( - "Update[%d]: now learning rate arrived at %0.5e, will not " - ."change in the future", $num_update, $self->base_lr - ); - } - else - { - AI::MXNet::Logging->info( - "Update[%d]: Changed learning rate to %0.5e", - $num_update, $self->base_lr - ); - } - } - return $self->base_lr; -} - -package AI::MXNet::MultiFactorScheduler; - -=head1 NAME - - AI::MXNet::MultiFactorScheduler - Reduces the learning rate by an array ref of factors. - -=head1 DESCRIPTION - - Reduces a learning rate in factor at steps specified in an array ref. - Assume the weight has been updated by n times, then the learning rate will - be base_lr * factor^(sum((step/n)<=1)) # step is an array. - - Parameters - ---------- - step: ArrayRef[Int] - schedule learning rate after n updates - factor: Num - the factor for reducing the learning rate -=cut - -use Mouse; -extends 'AI::MXNet::LRScheduler'; -has 'step' => (is => 'ro', isa => 'ArrayRef[Int]', required => 1); -has 'factor' => (is => 'ro', isa => 'Num', default => 1); -has 'cur_step_ind' => (is => 'rw', isa => 'Int', default => 0); -has 'count' => (is => 'rw', isa => 'Int', default => 0); - -sub BUILD -{ - my $self = shift; - confess("step array must have at least one member") - unless @{ $self->step } >=1 ; - for (my $i = 0; $i < @{ $self->step }; $i++) - { - confess("Schedule step must be an increasing integer list") - if($i and $self->step->[$i] <= $self->step->[$i-1]); - confess("Schedule step must be greater or equal than 1") - if $self->step->[$i] < 1; - } - confess("Factor must be no more than 1 to make lr reduce") - if $self->factor > 1; -} - -method call(Int $num_update) -{ - # NOTE: use while rather than if (for continuing training via load_epoch) - while($self->cur_step_ind < @{ $self->step }) - { - if($num_update > $self->step->[$self->cur_step_ind]) - { - $self->count($self->step->[$self->cur_step_ind]); - $self->cur_step_ind($self->cur_step_ind + 1); - $self->base_lr($self->base_lr * $self->factor); - AI::MXNet::Logging->info( - "Update[%d]: Changed learning rate to %0.5e", - $num_update, $self->base_lr - ); - } - else - { - return $self->base_lr; - } - } - return $self->base_lr; -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg.pm b/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg.pm deleted file mode 100644 index 50077e1a04be..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg.pm +++ /dev/null @@ -1,78 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::LinAlg;
-use strict;
-use warnings;
-use AI::MXNet::NS;
-use AI::MXNet::LinAlg::Symbol;
-use AI::MXNet::LinAlg::NDArray;
-
-=head1 NAME
-
-    AI::MXNet::LinAlg - Linear Algebra routines for NDArray and Symbol.
-=cut
-
-=head1 DESCRIPTION
-
-    The Linear Algebra API provides imperative/symbolic linear algebra tensor operations on CPU/GPU.
-
-    mx->linalg->gemm        Performs general matrix multiplication and accumulation.
-    mx->linalg->gemm2       Performs general matrix multiplication.
-    mx->linalg->potrf       Performs Cholesky factorization of a symmetric positive-definite matrix.
-    mx->linalg->potri       Performs matrix inversion from a Cholesky factorization.
-    mx->linalg->trmm        Performs multiplication with a lower triangular matrix.
-    mx->linalg->trsm        Solves a matrix equation involving a lower triangular matrix.
-    mx->linalg->sumlogdiag  Computes the sum of the logarithms of the diagonal elements of a square matrix.
-    mx->linalg->syrk        Multiplication of a matrix with its transpose.
-    mx->linalg->gelqf       LQ factorization for a general matrix.
-    mx->linalg->syevd       Eigendecomposition for a symmetric matrix.
-
-    Examples:
-
-    ## NDArray
-    my $A = mx->nd->array([[1.0, 1.0], [1.0, 1.0]]);
-    my $B = mx->nd->array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]);
-    ok(almost_equal(
-        mx->nd->linalg->gemm2($A, $B, transpose_b=>1, alpha=>2.0)->aspdl,
-        pdl([[4.0, 4.0, 4.0], [4.0, 4.0, 4.0]])
-    ));
-
-    ## Symbol
-    my $sym_gemm2 = mx->sym->linalg->gemm2(
-        mx->sym->var('A'),
-        mx->sym->var('B'),
-        transpose_b => 1,
-        alpha => 2.0
-    );
-    my $A = mx->nd->array([[1.0, 1.0], [1.0, 1.0]]);
-    my $B = mx->nd->array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]);
-    ok(almost_equal(
-        $sym_gemm2->eval(args => { A => $A, B => $B })->[0]->aspdl,
-        pdl([[4.0, 4.0, 4.0], [4.0, 4.0, 4.0]])
-    ));
-
-=cut
-
-sub sym     { 'AI::MXNet::LinAlg::Symbol' }
-sub symbol  { 'AI::MXNet::LinAlg::Symbol' }
-sub nd      { 'AI::MXNet::LinAlg::NDArray' }
-sub ndarray { 'AI::MXNet::LinAlg::NDArray' }
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg/NDArray.pm
deleted file mode 100644
index 3d3b3e5818b5..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg/NDArray.pm
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.
See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::LinAlg::NDArray; -use strict; -use warnings; -use parent 'AI::MXNet::AutoLoad'; -sub config { ('linalg', 'AI::MXNet::NDArray') } - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg/Symbol.pm deleted file mode 100644 index 6ebdadf23b47..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/LinAlg/Symbol.pm +++ /dev/null @@ -1,24 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::LinAlg::Symbol; -use strict; -use warnings; -use parent 'AI::MXNet::AutoLoad'; -sub config { ('linalg', 'AI::MXNet::Symbol') } - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Logging.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Logging.pm deleted file mode 100644 index 839b456e1ef1..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Logging.pm +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Logging; -## TODO -use strict; -use warnings; -use Mouse; -our $silent = 0; -sub warning { return if $silent; shift; warn sprintf(shift, @_) . "\n" }; -*debug = *info = *warning; -sub get_logger { __PACKAGE__->new } - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Metric.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Metric.pm deleted file mode 100644 index cf98458d61ab..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Metric.pm +++ /dev/null @@ -1,1192 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Metric; -use strict; -use warnings; -use AI::MXNet::NS; -use AI::MXNet::Function::Parameters; -use Scalar::Util qw/blessed/; -use JSON::PP; - -=encoding utf-8 - -=head1 NAME - - AI::MXNet::Metric - Evaluation Metric API. -=head1 DESCRIPTION - - This module hosts all the evaluation metrics available to evaluate the performance of a learned model. - L -=cut - -# Check to see if the two arrays are the same size. -sub _calculate_shape -{ - my $input = shift; - my ($shape); - if(blessed($input)) - { - if($input->isa('PDL')) - { - $shape = $input->shape->at(-1); - } - else - { - $shape = $input->shape->[0]; - } - } - else - { - $shape = @{ $input }; - } - return $shape; -} -func check_label_shapes( - ArrayRef|AI::MXNet::NDArray|PDL $labels, - ArrayRef|AI::MXNet::NDArray|PDL $preds -) -{ - my ($label_shape, $pred_shape) = (_calculate_shape($labels), _calculate_shape($preds)); - Carp::confess( - "Shape of labels $label_shape does not " - ."match shape of predictions $pred_shape" - ) unless $pred_shape == $label_shape; -} - -package AI::MXNet::EvalMetric; -use Mouse; -use overload '""' => sub { - return "EvalMetric: " - .Data::Dumper->new( - [shift->get_name_value()] - )->Purity(1)->Deepcopy(1)->Terse(1)->Dump -}, fallback => 1; -has 'name' => (is => 'rw', isa => 'Str'); -has 'num' => (is => 'rw', isa => 'Int'); -has 'num_inst' => (is => 'rw', isa => 'Maybe[Int|ArrayRef[Int]]'); -has 'sum_metric' => (is => 'rw', isa => 'Maybe[Num|ArrayRef[Num]]'); -has '_kwargs' => (is => 'rw', init_arg => undef); -around BUILDARGS => \&AI::MXNet::Base::process_arguments; - -sub BUILD -{ - my ($self, $kwargs) = @_; - $self->reset; - $self->_kwargs($kwargs); -} - -method _class_name() -{ - my $class = ref $self || $self; - $class =~ s/^.+:://; - $class; -} - -=head2 get_config - - Save configurations of metric. 
Can be recreated - from configs with mx->metric->create(%{ $config }) -=cut - -method get_config() -{ - my %config = %{ $self->_kwargs }; - %config = (%config, - metric => $self->_class_name, - name => $self->name - ); - return \%config; -} - -method update($label, $pred) -{ - confess('NotImplemented'); -} - -method reset() -{ - if(not defined $self->num) - { - $self->num_inst(0); - $self->sum_metric(0); - } - else - { - $self->num_inst([(0) x $self->num]); - $self->sum_metric([(0) x $self->num]); - } -} - -method get() -{ - if(not defined $self->num) - { - if($self->num_inst == 0) - { - return ($self->name, 'nan'); - } - else - { - return ($self->name, $self->sum_metric / $self->num_inst); - } - } - else - { - my $names = [map { sprintf('%s_%d', $self->name, $_) } 0..$self->num-1]; - my $values = []; - for (my $i = 0; $i < @{ $self->sum_metric }; $i++) - { - my ($x, $y) = ($self->sum_metric->[$i], $self->num_inst->[$i]); - if($y != 0) - { - push (@$values, $x/$y); - } - else - { - push (@$values, 'nan'); - } - } - return ($names, $values); - } -} - -method get_name_value() -{ - my ($name, $value) = $self->get; - $name = [$name] unless ref $name; - $value = [$value] unless ref $value; - my %ret; - @ret{ @$name } = @$value; - return \%ret; -} - -package AI::MXNet::CompositeEvalMetric; -use Mouse; - -extends 'AI::MXNet::EvalMetric'; -has 'metrics' => (is => 'rw', isa => 'ArrayRef[AI::MXNet::EvalMetric]', default => sub { [] }); -has '+name' => (default => 'composite'); -method python_constructor_arguments() { ['metrics'] } - -# Add a child metric. -method add(AI::MXNet::EvalMetric $metric) -{ - push @{ $self->metrics }, $metric; -} - -# Get a child metric. -method get_metric(int $index) -{ - my $max = @{ $self->metrics } - 1; - confess("Metric index $index is out of range 0 and $max") - if $index > $max; - return $self->metrics->[$index]; -} - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - for my $metric (@{ $self->metrics }) - { - $metric->update($labels, $preds); - } -} - -method reset() -{ - for my $metric (@{ $self->metrics }) - { - $metric->reset; - } -} - -method get() -{ - my $names = []; - my $results = []; - for my $metric (@{ $self->metrics }) - { - my ($name, $result) = $metric->get; - $name = [$name] unless ref $name; - $result = [$result] unless ref $result; - push @$names, @$name; - push @$results, @$result; - } - return ($names, $results); -} - - -######################## -# CLASSIFICATION METRICS -######################## - -=head1 NAME - - AI::MXNet::Accuracy - Computes accuracy classification score. -=cut - -=head1 DESCRIPTION - - The accuracy score is defined as - - accuracy(y, y^) = (1/n) * sum(i=0..n-1) { y^(i)==y(i) } - - Parameters: - axis (Int, default=1) – The axis that represents classes. - name (Str, default='accuracy') – Name of this metric instance for display. 
- - pdl> use AI::MXNet qw(mx) - pdl> $predicts = [mx->nd->array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - pdl> $labels = [mx->nd->array([[0, 1, 1]])] - pdl> $acc = mx->metric->Accuracy() - pdl> $acc->update($labels, $predicts) - pdl> use Data::Dumper - pdl> print Dumper([$acc->get]) - $VAR1 = [ - 'accuracy', - '0.666666666666667' - ]; - -=cut - -package AI::MXNet::Accuracy; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has '+name' => (default => 'accuracy'); -has 'axis' => (is => 'ro', isa => 'Int', default => 1); - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - AI::MXNet::Metric::check_label_shapes($labels, $preds); - for(zip($labels, $preds)) { - my ($label, $pred_label) = @$_; - if(join(',', @{$pred_label->shape}) ne join(',', @{$label->shape})) - { - $pred_label = AI::MXNet::NDArray->argmax_channel($pred_label, { axis => $self->axis }); - } - my $sum = ($pred_label->aspdl->flat == $label->aspdl->flat)->sum; - $self->sum_metric($self->sum_metric + $sum); - $self->num_inst($self->num_inst + $pred_label->size); - } -} - -=head1 NAME - - AI::MXNet::TopKAccuracy - Computes top k predictions accuracy. -=cut - -=head1 DESCRIPTION - - TopKAccuracy differs from Accuracy in that it considers the prediction - to be True as long as the ground truth label is in the top K predicated labels. - - If top_k = 1, then TopKAccuracy is identical to Accuracy. - - Parameters: - top_k(Int, default 1) – Whether targets are in top k predictions. - name (Str, default 'top_k_accuracy') – Name of this metric instance for display. - - use AI::MXNet qw(mx); - $top_k = 3; - $predicts = [mx->nd->array( - [[0.80342804, 0.5275223 , 0.11911147, 0.63968144, 0.09092526, - 0.33222568, 0.42738095, 0.55438581, 0.62812652, 0.69739294], - [0.78994969, 0.13189035, 0.34277045, 0.20155961, 0.70732423, - 0.03339926, 0.90925004, 0.40516066, 0.76043547, 0.47375838], - [0.28671892, 0.75129249, 0.09708994, 0.41235779, 0.28163896, - 0.39027778, 0.87110921, 0.08124512, 0.55793117, 0.54753428], - [0.33220307, 0.97326881, 0.2862761 , 0.5082575 , 0.14795074, - 0.19643398, 0.84082001, 0.0037532 , 0.78262101, 0.83347772], - [0.93790734, 0.97260166, 0.83282304, 0.06581761, 0.40379256, - 0.37479349, 0.50750135, 0.97787696, 0.81899021, 0.18754124], - [0.69804812, 0.68261077, 0.99909815, 0.48263116, 0.73059268, - 0.79518236, 0.26139168, 0.16107376, 0.69850315, 0.89950917], - [0.91515562, 0.31244902, 0.95412616, 0.7242641 , 0.02091039, - 0.72554552, 0.58165923, 0.9545687 , 0.74233195, 0.19750339], - [0.94900651, 0.85836332, 0.44904621, 0.82365038, 0.99726878, - 0.56413064, 0.5890016 , 0.42402702, 0.89548786, 0.44437266], - [0.57723744, 0.66019353, 0.30244304, 0.02295771, 0.83766937, - 0.31953292, 0.37552193, 0.18172362, 0.83135182, 0.18487429], - [0.96968683, 0.69644561, 0.60566253, 0.49600661, 0.70888438, - 0.26044186, 0.65267488, 0.62297362, 0.83609334, 0.3572364 ]] - )]; - $labels = [mx->nd->array([2, 6, 9, 2, 3, 4, 7, 8, 9, 6])]; - $acc = mx->metric->TopKAccuracy(top_k=>$top_k); - $acc->update($labels, $predicts); - use Data::Dumper; - print Dumper([$acc->get]); - $VAR1 = [ - 'top_k_accuracy_3', - '0.3' - ]; - - -=cut - -package AI::MXNet::TopKAccuracy; -use Mouse; -use List::Util qw/min/; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has '+name' => (default => 'top_k_accuracy'); -has 'top_k' => (is => 'rw', isa => 'Int', default => 1); -method python_constructor_arguments() { ['top_k'] } - -sub BUILD -{ - my $self = shift; - confess("Please use Accuracy if top_k is 
no more than 1") - unless $self->top_k > 1; - $self->name($self->name . "_" . $self->top_k); -} - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - AI::MXNet::Metric::check_label_shapes($labels, $preds); - for(zip($labels, $preds)) { - my ($label, $pred_label) = @$_; - confess('Predictions should be no more than 2 dims') - unless @{ $pred_label->shape } <= 2; - $pred_label = $pred_label->aspdl->qsorti; - $label = $label->astype('int32')->aspdl; - AI::MXNet::Metric::check_label_shapes($label, $pred_label); - my $num_samples = $pred_label->shape->at(-1); - my $num_dims = $pred_label->ndims; - if($num_dims == 1) - { - my $sum = ($pred_label->flat == $label->flat)->sum; - $self->sum_metric($self->sum_metric + $sum); - } - elsif($num_dims == 2) - { - my $num_classes = $pred_label->shape->at(0); - my $top_k = min($num_classes, $self->top_k); - for my $j (0..$top_k-1) - { - my $sum = ($pred_label->slice($num_classes -1 - $j, 'X')->flat == $label->flat)->sum; - $self->sum_metric($self->sum_metric + $sum); - } - } - $self->num_inst($self->num_inst + $num_samples); - } -} - -package _BinaryClassificationMetrics { - use Mouse; - #Private container class for classification metric statistics. True/false positive and - # true/false negative counts are sufficient statistics for various classification metrics. - #This class provides the machinery to track those statistics across mini-batches of - #(label, prediction) pairs. - has [qw/true_positives - false_negatives - false_positives - true_negatives/] => (is => 'rw', isa => 'Int', default => 0); - - method update_binary_stats(AI::MXNet::NDArray $label, AI::MXNet::NDArray $pred) - { - $pred = AI::MXNet::NDArray->argmax($pred, { axis => 1 })->aspdl; - $label = $label->astype('int32')->aspdl; - - AI::MXNet::Metric::check_label_shapes($label, $pred); - if($label->uniq->len > 2) - { - confess("Currently only support binary classification."); - } - - my $pred_true = ($pred == 1); - my $pred_false = 1 - $pred_true; - my $label_true = ($label == 1); - my $label_false = 1 - $label_true; - - $self->true_positives($self->true_positives + ($pred_true * $label_true)->sum); - $self->false_positives($self->false_positives + ($pred_true * $label_false)->sum); - $self->false_negatives($self->false_negatives + ($pred_false * $label_true)->sum); - $self->true_negatives($self->true_negatives + ($pred_false * $label_false)->sum); - } - - method precision() - { - if($self->true_positives + $self->false_positives > 0) - { - return $self->true_positives / ($self->true_positives + $self->false_positives); - } - else - { - return 0; - } - } - - method recall() - { - if($self->true_positives + $self->false_negatives > 0) - { - return $self->true_positives / ($self->true_positives + $self->false_negatives); - } - else - { - return 0; - } - } - - method fscore() - { - if($self->precision + $self->recall > 0) - { - return 2 * $self->precision * $self->recall / ($self->precision + $self->recall); - } - else - { - return 0; - } - } - - method matthewscc() - { - if(not $self->total_examples) - { - return 0; - } - my @terms = ( - $self->true_positives + $self->false_positives, - $self->true_positives + $self->false_negatives, - $self->true_negatives + $self->false_positives, - $self->true_negatives + $self->false_negatives - ); - my $denom = 1; - for my $t (grep { $_ } @terms) - { - $denom *= $t; - } - return (($self->true_positives * $self->true_negatives) - ($self->false_positives * $self->false_negatives)) / sqrt($denom); - } - - method 
total_examples()
-    {
-        return $self->false_negatives + $self->false_positives +
-               $self->true_negatives + $self->true_positives;
-    }
-
-    method reset_stats()
-    {
-        $self->false_positives(0);
-        $self->false_negatives(0);
-        $self->true_positives(0);
-        $self->true_negatives(0);
-    }
-};
-
-=head1 NAME
-
-    AI::MXNet::F1 - Calculate the F1 score of a binary classification problem.
-=cut
-
-=head1 DESCRIPTION
-
-    The F1 score is the harmonic mean of precision and recall,
-    where the best value is 1.0 and the worst value is 0.0. The formula for the F1 score is:
-
-    F1 = 2 * (precision * recall) / (precision + recall)
-
-    The formulas for precision and recall are:
-
-    precision = true_positives / (true_positives + false_positives)
-    recall = true_positives / (true_positives + false_negatives)
-
-    Note:
-
-    This F1 score only supports binary classification.
-
-    Parameters:
-    name (Str, default 'f1') – Name of this metric instance for display.
-    average (Str, default 'macro') –
-        Strategy to be used for aggregating across mini-batches.
-        "macro": average the F1 scores for each batch. "micro": compute a single F1 score across all batches.
-
-    $predicts = [mx->nd->array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])];
-    $labels = [mx->nd->array([0., 1., 1.])];
-    $f1 = mx->metric->F1();
-    $f1->update($labels, $predicts);
-    print $f1->get;
-    f1 0.8
-
-=cut
-
-package AI::MXNet::F1;
-use Mouse;
-use AI::MXNet::Base;
-extends 'AI::MXNet::EvalMetric';
-has '+name'   => (default => 'f1');
-has 'average' => (is => 'ro', isa => 'Str', default => 'macro');
-has 'metrics' => (is => 'rw', init_arg => undef, default => sub { _BinaryClassificationMetrics->new });
-has 'method'  => (is => 'ro', init_arg => undef, default => 'fscore');
-method python_constructor_arguments() { [qw/name average/] }
-
-method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds)
-{
-    my $method = $self->method;
-    AI::MXNet::Metric::check_label_shapes($labels, $preds);
-    for(zip($labels, $preds)) {
-        my ($label, $pred) = @$_;
-        $self->metrics->update_binary_stats($label, $pred);
-        if($self->average eq "macro")
-        {
-            $self->sum_metric($self->sum_metric + $self->metrics->$method);
-            $self->num_inst($self->num_inst + 1);
-            $self->metrics->reset_stats();
-        }
-        else
-        {
-            $self->sum_metric($self->metrics->fscore * $self->metrics->total_examples);
-            $self->num_inst($self->metrics->total_examples);
-        }
-    }
-}
-
-method reset()
-{
-    $self->sum_metric(0);
-    $self->num_inst(0);
-    $self->metrics->reset_stats();
-}
-
-=head1 NAME
-
-    AI::MXNet::MCC - Computes the Matthews Correlation Coefficient of a binary classification problem.
-=cut
-
-=head1 DESCRIPTION
-
-    While slower to compute than F1, the MCC can give insight that F1 or Accuracy cannot.
-    For instance, if the network always predicts the same result,
-    then the MCC will immediately show this. The MCC is also symmetric with respect
-    to positive and negative categorization; however, there need to be both
-    positive and negative examples in the labels, or it will always return 0.
-    An MCC of 0 is uncorrelated, 1 is completely correlated, and -1 is negatively correlated.
-
-    MCC = (TP * TN - FP * FN)/sqrt( (TP + FP)*( TP + FN )*( TN + FP )*( TN + FN ) )
-
-    where 0 terms in the denominator are replaced by 1.
-
-    This version of MCC only supports binary classification.
-
-    Parameters
-    ----------
-    name : str, default 'mcc'
-        Name of this metric instance for display.
-    average : str, default 'macro'
-        Strategy to be used for aggregating across mini-batches.
-        "macro": average the MCC for each batch.
-        "micro": compute a single MCC across all batches.
-
-    Examples
-    --------
-    In this example the network almost always predicts positive.
-    >>> $false_positives = 1000
-    >>> $false_negatives = 1
-    >>> $true_positives = 10000
-    >>> $true_negatives = 1
-    >>> $predicts = [mx->nd->array(
-        [
-            ([.3, .7])x$false_positives,
-            ([.7, .3])x$true_negatives,
-            ([.7, .3])x$false_negatives,
-            ([.3, .7])x$true_positives
-        ]
-    )];
-    >>> $labels = [mx->nd->array(
-        [
-            (0)x($false_positives + $true_negatives),
-            (1)x($false_negatives + $true_positives)
-        ]
-    )];
-    >>> $f1 = mx->metric->F1();
-    >>> $f1->update($labels, $predicts);
-    >>> $mcc = mx->metric->MCC();
-    >>> $mcc->update($labels, $predicts);
-    >>> print $f1->get();
-    f1 0.95233560306652054
-    >>> print $mcc->get();
-    mcc 0.01917751877733392
-
-=cut
-
-package AI::MXNet::MCC;
-use Mouse;
-extends 'AI::MXNet::F1';
-has '+name'   => (default => 'mcc');
-has '+method' => (default => 'matthewscc');
-
-package AI::MXNet::Perplexity;
-use Mouse;
-use AI::MXNet::Base;
-extends 'AI::MXNet::EvalMetric';
-has '+name'        => (default => 'Perplexity');
-has 'ignore_label' => (is => 'ro', isa => 'Maybe[Int]');
-has 'axis'         => (is => 'ro', isa => 'Int', default => -1);
-method python_constructor_arguments() { ['ignore_label', 'axis'] }
-
-around BUILDARGS => sub {
-    my $orig  = shift;
-    my $class = shift;
-    return $class->$orig(ignore_label => $_[0]) if @_ == 1;
-    return $class->$orig(@_);
-};
-
-=head1 NAME
-
-    AI::MXNet::Perplexity - Calculate perplexity.
-=cut
-
-=head1 DESCRIPTION
-
-    Perplexity is a measure of how well a probability distribution or model predicts a sample.
-    A low perplexity indicates that the model is good at predicting the sample.
-
-    Parameters
-    ----------
-    ignore_label : int or undef
-        Index of an invalid label to ignore when
-        counting. Usually it should be -1. Includes
-        all entries if undef.
-    axis : int (default -1)
-        The axis from prediction that was used to
-        compute softmax. By default uses the last
-        axis.
-
-    $predicts = [mx->nd->array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])];
-    $labels = [mx->nd->array([0, 1, 1])];
-    $perp = mx->metric->Perplexity(ignore_label=>undef);
-    $perp->update($labels, $predicts);
-    print $perp->get();
-    Perplexity 1.77109762851559
-
-=cut
-
-method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds)
-{
-    AI::MXNet::Metric::check_label_shapes($labels, $preds);
-    my ($loss, $num) = (0, 0);
-    for(zip($labels, $preds)) {
-        my ($label, $pred) = @$_;
-        my $label_shape = $label->shape;
-        my $pred_shape  = $pred->shape;
-        assert(
-            (product(@{ $label_shape }) == product(@{ $pred_shape })/$pred_shape->[-1]),
-            "shape mismatch: (@$label_shape) vs. 
(@$pred_shape)" - ); - $label = $label->as_in_context($pred->context)->reshape([$label->size]); - $pred = AI::MXNet::NDArray->pick($pred, $label->astype('int32'), { axis => $self->axis }); - if(defined $self->ignore_label) - { - my $ignore = ($label == $self->ignore_label); - $num -= $ignore->sum->asscalar; - $pred = $pred*(1-$ignore) + $ignore; - } - $loss -= $pred->maximum(1e-10)->log->sum->asscalar; - $num += $pred->size; - } - $self->sum_metric($self->sum_metric + $loss); - $self->num_inst($self->num_inst + $num); -} - -method get() -{ - return ($self->name, exp($self->sum_metric / $self->num_inst)); -} - -#################### -# REGRESSION METRICS -#################### - -=head1 NAME - - AI::MXNet::MAE - Calculate Mean Absolute Error loss -=head1 DESCRIPTION - - >>> $predicts = [mx->nd->array([3, -0.5, 2, 7])->reshape([4,1])] - >>> $labels = [mx->nd->array([2.5, 0.0, 2, 8])->reshape([4,1])] - >>> $mean_absolute_error = mx->metric->MAE() - >>> $mean_absolute_error->update($labels, $predicts) - >>> print $mean_absolute_error->get() - ('mae', 0.5) - -=cut - - -package AI::MXNet::MAE; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has '+name' => (default => 'mae'); - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - AI::MXNet::Metric::check_label_shapes($labels, $preds); - for(zip($labels, $preds)) { - my ($label, $pred) = @$_; - $label = $label->aspdl; - $pred = $pred->aspdl; - if($label->ndims == 1) - { - $label = $label->reshape(1, $label->shape->at(0)); - } - $self->sum_metric($self->sum_metric + ($label - $pred)->abs->avg); - $self->num_inst($self->num_inst + 1); - } -} - -=head1 NAME - - AI::MXNet::MSE - Calculate Mean Squared Error loss -=head1 DESCRIPTION - - >>> $predicts = [mx->nd->array([3, -0.5, 2, 7])->reshape([4,1])] - >>> $labels = [mx->nd->array([2.5, 0.0, 2, 8])->reshape([4,1])] - >>> $mean_squared_error = mx->metric->MSE() - >>> $mean_squared_error->update($labels, $predicts) - >>> print $mean_squared_error->get() - ('mse', 0.375) - -=cut - -package AI::MXNet::MSE; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has '+name' => (default => 'mse'); - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - AI::MXNet::Metric::check_label_shapes($labels, $preds); - for(zip($labels, $preds)) { - my ($label, $pred) = @$_; - $label = $label->aspdl; - $pred = $pred->aspdl; - if($label->ndims == 1) - { - $label = $label->reshape(1, $label->shape->at(0)); - } - $self->sum_metric($self->sum_metric + (($label - $pred)**2)->avg); - $self->num_inst($self->num_inst + 1); - } -} - -=head1 NAME - - AI::MXNet::RMSE - Calculate Root Mean Squred Error loss -=head1 DESCRIPTION - - >>> $predicts = [mx->nd->array([3, -0.5, 2, 7])->reshape([4,1])] - >>> $labels = [mx->nd->array([2.5, 0.0, 2, 8])->reshape([4,1])] - >>> $root_mean_squared_error = mx->metric->RMSE() - >>> $root_mean_squared_error->update($labels, $predicts) - >>> print $root_mean_squared_error->get() - 'rmse', 0.612372457981 - -=cut - -package AI::MXNet::RMSE; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has '+name' => (default => 'rmse'); - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - AI::MXNet::Metric::check_label_shapes($labels, $preds); - for(zip($labels, $preds)) { - my ($label, $pred) = @$_; - $label = $label->aspdl; - $pred = $pred->aspdl; - if($label->ndims == 1) - { - $label = $label->reshape(1, $label->shape->at(0)); - } - 
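-        # the next statement accumulates the root of this batch's mean squared
-        # error; the base metric then reports sum_metric/num_inst, i.e. the
-        # average of these per-batch RMSE values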
$self->sum_metric($self->sum_metric + sqrt((($label - $pred)**2)->avg)); - $self->num_inst($self->num_inst + 1); - } -} - - -=head1 NAME - - AI::MXNet::CrossEntropy - Calculate Cross Entropy loss -=head1 DESCRIPTION - - >>> $predicts = [mx->nd->array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> $labels = [mx->nd->array([0, 1, 1])] - >>> $ce = mx->metric->CrossEntropy() - >>> $ce->update($labels, $predicts) - >>> print $ce->get() - ('cross-entropy', 0.57159948348999023) - -=cut - -# Calculate Cross Entropy loss -package AI::MXNet::CrossEntropy; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has '+name' => (default => 'cross-entropy'); -has 'eps' => (is => 'ro', isa => 'Num', default => 1e-12); -method python_constructor_arguments() { ['eps'] } - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - AI::MXNet::Metric::check_label_shapes($labels, $preds); - for(zip($labels, $preds)) { - my ($label, $pred) = @$_; - $label = $label->aspdl->flat; - $pred = $pred->aspdl; - my $label_shape = $label->shape->at(0); - my $pred_shape = $pred->shape->at(-1); - confess( - "Size of label $label_shape and - .first dimension of pred $pred_shape do not match" - ) unless $label_shape == $pred_shape; - my $prob = $pred->index($label); - $self->sum_metric($self->sum_metric + (-($prob + $self->eps)->log)->sum); - $self->num_inst($self->num_inst + $label_shape); - } -} - -=head1 NAME - - AI::MXNet::NegativeLogLikelihood - Computes the negative log-likelihood loss. -=head1 DESCRIPTION - - >>> $predicts = [mx->nd->array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> $labels = [mx->nd->array([0, 1, 1])] - >>> $nll_loss = mx->metric->NegativeLogLikelihood - >>> $nll_loss->update($labels, $predicts) - >>> print $nll_loss->get() - ('cross-entropy', 0.57159948348999023) - -=cut - -package AI::MXNet::NegativeLogLikelihood; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::CrossEntropy'; -has '+name' => (default => 'nll_loss'); - -package AI::MXNet::PearsonCorrelation; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has '+name' => (default => 'pearson-correlation'); - -=head1 NAME - - AI::MXNet::PearsonCorrelation - Computes Pearson correlation. -=cut - -=head1 DESCRIPTION - - Computes Pearson correlation. - - Parameters - ---------- - name : str - Name of this metric instance for display. - - Examples - -------- - >>> $predicts = [mx->nd->array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> $labels = [mx->nd->array([[1, 0], [0, 1], [0, 1]])] - >>> $pr = mx->metric->PearsonCorrelation() - >>> $pr->update($labels, $predicts) - >>> print pr->get() - ('pearson-correlation', '0.421637061887229') -=cut - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - AI::MXNet::Metric::check_label_shapes($labels, $preds); - for(zip($labels, $preds)) { - my ($label, $pred) = @$_; - AI::MXNet::Metric::check_label_shapes($label, $pred); - $label = $label->aspdl->flat; - $pred = $pred->aspdl->flat; - my ($label_mean, $label_stdv) = ($label->stats)[0, 6]; - my ($pred_mean, $pred_stdv) = ($pred->stats)[0, 6]; - $self->sum_metric( - $self->sum_metric - + - ((($label-$label_mean)*($pred-$pred_mean))->sum/$label->nelem)/(($label_stdv*$pred_stdv)->at(0)) - ); - $self->num_inst($self->num_inst + 1); - } -} - -package AI::MXNet::Loss; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has '+name' => (default => 'loss'); - -=head1 NAME - - AI::MXNet::Loss - Dummy metric for directly printing loss. 
-=cut - -=head1 DESCRIPTION - - Dummy metric for directly printing loss. - - Parameters - ---------- - name : str - Name of this metric instance for display. -=cut - -method update($labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - for my $pred (@{ $preds }) - { - $self->sum_metric($self->sum_metric + $pred->sum->asscalar); - $self->num_inst($self->num_inst + $pred->size); - } -} - -package AI::MXNet::Confidence; -use Mouse; - -=head1 NAME - - AI::MXNet::Confidence - Accuracy by confidence buckets. -=cut - -=head1 DESCRIPTION - - Accuracy by confidence buckets. - - Parameters - ---------- - name : str - Name of this metric instance for display. - num_classes: Int - number of classes - confidence_thresholds: ArrayRef[Num] - confidence buckets - For example - my $composite_metric = AI::MXNet::CompositeEvalMetric->new; - $composite_metric->add(mx->metric->create('acc')); - $composite_metric->add( - AI::MXNet::Confidence->new( - num_classes => 2, - confidence_thresholds => [ 0.5, 0.7, 0.8, 0.9 ], - ) - ); -=cut - -extends 'AI::MXNet::EvalMetric'; -has 'num_classes', is => 'ro', isa => 'Int', required => 1; -has 'confidence_thresholds', is => 'ro', isa => 'ArrayRef[Num]', required => 1; -has '+name' => (default => 'confidence'); -has '+sum_metric', isa => 'PDL'; -has '+num_inst', isa => 'PDL'; -method python_constructor_arguments() { ['num_classes', 'confidence_thresholds'] } - -sub _hot -{ - my($m, $n) = @_; - my $md = $m->dim(-1); - my $hot = PDL->zeros($n, $md); - $hot->index2d($m->flat(), PDL->sequence($md)) .= 1; - return $hot; -} - -sub reset -{ - my($self) = @_; - my $nt = @{$self->confidence_thresholds}; - my $n = $self->num_classes; - $self->sum_metric(PDL->zeroes($nt, $n)); - $self->num_inst(PDL->zeroes($nt, $n)); - return; -} - -sub update -{ - my($self, $labels, $preds) = @_; - my $n = $self->num_classes; - my $ct = PDL->new($self->confidence_thresholds); - my $nt = $ct->nelem; - for(0 .. @$labels - 1) - { - my $label = _hot($labels->[$_]->aspdl, $n); - my $pred = $preds->[$_]->aspdl; - for my $c (0 .. $n - 1) - { - my $ls = $label->slice($c); - my $pm = $pred->slice($c) > $ct; - $self->sum_metric->slice(":,$c") += ($pm & $ls); - $self->num_inst->slice(":,$c") += $pm; - } - } - return; -} - -sub get -{ - my($self) = @_; - my(@names, @values); - my $val = $self->sum_metric / $self->num_inst; - my $ct = $self->confidence_thresholds; - my $n = $self->num_classes; - for my $c (0 .. $n - 1) - { - for my $t (0 .. @$ct - 1) - { - my $sm = $self->sum_metric->at($t, $c); - my $ni = $self->num_inst->at($t, $c); - push @names, "P(v=$c|Conf>$ct->[$t])=($sm/$ni)"; - push @values, $val->at($t, $c); - } - } - return(\@names, \@values); -} - -=head1 NAME - - AI::MXNet::CustomMetric - Custom evaluation metric that takes a sub ref. -=cut - -=head1 DESCRIPTION - - Custom evaluation metric that takes a sub ref. - - Parameters - ---------- - eval_function : subref - Customized evaluation function. - name : str, optional - The name of the metric - allow_extra_outputs : bool - If true, the prediction outputs can have extra outputs. - This is useful in RNN, where the states are also produced - in outputs for forwarding. 
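-
-    A minimal sketch of building one through mx->metric->create (the sub
-    receives the label and prediction as PDLs; returning an array ref
-    supplies both the value and the instance count; $max_err is an
-    illustrative name):
-
-        my $max_err = mx->metric->create(sub {
-            my ($label, $pred) = @_;
-            return [($label - $pred)->abs->max, 1];
-        });
-        $max_err->update($labels, $predicts);
-        print $max_err->get;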
-=cut - - -package AI::MXNet::CustomMetric; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::EvalMetric'; -has 'eval_function' => (is => 'ro', isa => 'CodeRef'); -has 'allow_extra_outputs' => (is => 'ro', isa => 'Int', default => 0); -method python_constructor_arguments() { ['eval_function', 'allow_extra_outputs'] } - -method update(ArrayRef[AI::MXNet::NDArray] $labels, ArrayRef[AI::MXNet::NDArray] $preds) -{ - AI::MXNet::Metric::check_label_shapes($labels, $preds) - unless $self->allow_extra_outputs; - for(zip($labels, $preds)) { - my ($label, $pred) = @$_; - $label = $label->aspdl; - $pred = $pred->aspdl; - my $value = $self->eval_function->($label, $pred); - my $sum_metric = ref $value ? $value->[0] : $value; - my $num_inst = ref $value ? $value->[1] : 1; - $self->sum_metric($self->sum_metric + $sum_metric); - $self->num_inst($self->num_inst + $num_inst); - } -} - -package AI::MXNet::Metric; - -=head2 create - - Create an evaluation metric. - - Parameters - ---------- - metric : str or sub ref - The name of the metric, or a function - providing statistics given pred, label NDArray. -=cut - -my %metrics = qw/ - acc AI::MXNet::Accuracy - accuracy AI::MXNet::Accuracy - ce AI::MXNet::CrossEntropy - crossentropy AI::MXNet::CrossEntropy - nll_loss AI::MXNet::NegativeLogLikelihood - f1 AI::MXNet::F1 - mcc AI::MXNet::MCC - mae AI::MXNet::MAE - mse AI::MXNet::MSE - rmse AI::MXNet::RMSE - top_k_accuracy AI::MXNet::TopKAccuracy - topkaccuracy AI::MXNet::TopKAccuracy - perplexity AI::MXNet::Perplexity - pearsonr AI::MXNet::PearsonCorrelation - pearsoncorrelation AI::MXNet::PearsonCorrelation - loss AI::MXNet::Loss - compositeevalmetric AI::MXNet::CompositeEvalMetric - confidence AI::MXNet::Confidence -/; - -method create(Metric|ArrayRef[Metric] $metric, @kwargs) -{ - Carp::confess("metric must be defined") unless defined $metric; - return $metric if blessed $metric and $metric->isa('AI::MXNet::EvalMetric'); - if(my $ref = ref $metric) - { - if($ref eq 'ARRAY') - { - my $composite_metric = AI::MXNet::CompositeEvalMetric->new(); - for my $child_metric (@{ $metric }) - { - $composite_metric->add(__PACKAGE__->create($child_metric, @kwargs)) - } - return $composite_metric; - } - else - { - return AI::MXNet::CustomMetric->new(eval_function => $metric, @kwargs); - } - } - else - { - if(not exists $metrics{ lc($metric) } and not $metric =~ /^{/) - { - my @metrics = keys %metrics; - Carp::confess("Metric must be either subref or one of [@metrics]"); - } - if($metric =~ /^{/ and not @kwargs) - { - my $config = decode_json($metric); - $metric = delete $config->{metric}; - @kwargs = %{ $config }; - } - return $metrics{ lc($metric) }->new(@kwargs); - } -} - -{ - no strict 'refs'; - no warnings 'redefine'; - for my $metric (values %metrics) - { - my ($name) = $metric =~ /(\w+)$/; - *{__PACKAGE__."::$name"} = sub { shift; $metric->new(@_); }; - } -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Module.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Module.pm deleted file mode 100644 index 097f038ed11a..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Module.pm +++ /dev/null @@ -1,987 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -## TODO -## this class is here because of https://github.com/gfx/p5-Mouse/pull/67 -## once 2.4.7 version of Mouse in Ubuntu for affected Perl version -## these accessors should be merged into main class - -package AI::MXNet::Module::Private; -use Mouse; -has [qw/_param_names _fixed_param_names - _aux_names _data_names _label_names _state_names - _output_names _arg_params _aux_params - _params_dirty _optimizer _kvstore - _update_on_kvstore _updater _work_load_list - _preload_opt_states _exec_group - _data_shapes _label_shapes _context _grad_req/ -] => (is => 'rw', init_arg => undef); - -package AI::MXNet::Module; -use AI::MXNet::NS; -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; -use List::Util qw(max); -use Data::Dumper (); -use Mouse; - -func _create_sparse_kvstore(Maybe[Str|AI::MXNet::KVStore] $kvstore) -{ - # always update on kvstore - my $update_on_kvstore = 1; - my $kv; - if(blessed $kvstore) - { - $kv = $kvstore; - } - else - { - $kv = AI::MXNet::KVStore->create($kvstore); - } - return ($kv, $update_on_kvstore); -} - -func _create_kvstore( - Maybe[Str|AI::MXNet::KVStore] $kvstore, - Int $num_device, - HashRef[AI::MXNet::NDArray] $arg_params -) -{ - my $update_on_kvstore = 1; - my $kv; - if(defined $kvstore) - { - if(blessed $kvstore) - { - $kv = $kvstore; - } - else - { - # create kvstore using the string type - if($num_device == 1 and $kvstore !~ /dist/) - { - # no need to use kv for single device and single machine - } - else - { - $kv = AI::MXNet::KVStore->create($kvstore); - if($kvstore eq 'local') - { - # automatically select a proper local - my $max_size = max(map { product(@{ $_->shape }) } values %{ $arg_params }); - if($max_size > 1024 * 1024 * 16) - { - $update_on_kvstore = 0; - } - } - } - } - } - - $update_on_kvstore = 0 if not $kv; - return ($kv, $update_on_kvstore); -} - -func _initialize_kvstore( - AI::MXNet::KVStore :$kvstore, - HashRef[AI::MXNet::NDArray] :$arg_params, - ArrayRef[Str] :$param_names, - Bool :$update_on_kvstore, - ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]] :$param_arrays -) -{ - enumerate(sub{ - my ($idx, $param_on_devs) = @_; - my $name = $param_names->[$idx]; - $kvstore->init($name, $arg_params->{ $name }); - if($update_on_kvstore) - { - $kvstore->pull($name, out => $param_on_devs, priority => -$idx); - } - }, $param_arrays); -} - -func _update_params_on_kvstore( - ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]] $param_arrays, - ArrayRef[AI::MXNet::NDArray]|ArrayRef[ArrayRef[AI::MXNet::NDArray]] $grad_arrays, - AI::MXNet::KVStore $kvstore, - ArrayRef[Str] $param_names -) -{ - enumerate(sub{ - my ($index, $arg_list, $grad_list) = @_; - if(ref $grad_list eq 'ARRAY' and not defined $grad_list->[0]) - { - return; - } - my $name = $param_names->[$index]; - # push gradient, priority is negative index - $kvstore->push($name, $grad_list, priority => -$index); - # pull back the weights - $kvstore->pull($name, out => $arg_list, priority => -$index); - }, $param_arrays, $grad_arrays); -} - -func _update_params( - ArrayRef[ArrayRef[AI::MXNet::NDArray]] $param_arrays, - 
ArrayRef[ArrayRef[AI::MXNet::NDArray]] $grad_arrays, - AI::MXNet::Updater $updater, - Int $num_device, - Maybe[AI::MXNet::KVStore] $kvstore=, - Maybe[ArrayRef[Str]] $param_names= -) -{ - enumerate(sub{ - my ($index, $arg_list, $grad_list) = @_; - if(not defined $grad_list->[0]) - { - return; - } - if($kvstore) - { - my $name = $param_names->[$index]; - # push gradient, priority is negative index - $kvstore->push($name, $grad_list, priority => -$index); - # pull back the sum gradients, to the same locations. - $kvstore->pull($name, out => $grad_list, priority => -$index); - } - enumerate(sub { - my ($k, $w, $g) = @_; - # faked an index here, to make optimizer create diff - # state for the same index but on diff devs, TODO(mli) - # use a better solution later - $updater->($index*$num_device+$k, $g, $w); - }, $arg_list, $grad_list); - }, $param_arrays, $grad_arrays); -} - -method load_checkpoint(Str $prefix, Int $epoch) -{ - my $symbol = AI::MXNet::Symbol->load("$prefix-symbol.json"); - my %save_dict = %{ AI::MXNet::NDArray->load(sprintf('%s-%04d.params', $prefix, $epoch)) }; - my %arg_params; - my %aux_params; - while(my ($k, $v) = each %save_dict) - { - my ($tp, $name) = split(/:/, $k, 2); - if($tp eq 'arg') - { - $arg_params{$name} = $v; - } - if($tp eq 'aux') - { - $aux_params{$name} = $v; - } - } - return ($symbol, \%arg_params, \%aux_params); -} - -=head1 NAME - - AI::MXNet::Module - FeedForward interface of MXNet. - See AI::MXNet::Module::Base for the details. -=cut - -extends 'AI::MXNet::Module::Base'; - -has '_symbol' => (is => 'ro', init_arg => 'symbol', isa => 'AI::MXNet::Symbol', required => 1); -has '_data_names' => (is => 'ro', init_arg => 'data_names', isa => 'ArrayRef[Str]'); -has '_label_names' => (is => 'ro', init_arg => 'label_names', isa => 'Maybe[ArrayRef[Str]]'); -has 'work_load_list' => (is => 'rw', isa => 'Maybe[ArrayRef[Int]]'); -has 'fixed_param_names' => (is => 'rw', isa => 'Maybe[ArrayRef[Str]]'); -has 'state_names' => (is => 'rw', isa => 'Maybe[ArrayRef[Str]]'); -has 'logger' => (is => 'ro', default => sub { AI::MXNet::Logging->get_logger }); -has '_p' => (is => 'rw', init_arg => undef); -has 'context' => ( - is => 'ro', - isa => 'AI::MXNet::Context|ArrayRef[AI::MXNet::Context]', - default => sub { AI::MXNet::Context->cpu } -); - -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - if(@_%2) - { - my $symbol = shift; - return $class->$orig(symbol => $symbol, @_); - } - return $class->$orig(@_); -}; - -sub BUILD -{ - my $self = shift; - $self->_p(AI::MXNet::Module::Private->new); - my $context = $self->context; - if(blessed $context) - { - $context = [$context]; - } - $self->_p->_context($context); - my $work_load_list = $self->work_load_list; - if(not defined $work_load_list) - { - $work_load_list = [(1)x@{$self->_p->_context}]; - } - assert(@{ $work_load_list } == @{ $self->_p->_context }); - $self->_p->_work_load_list($work_load_list); - my @data_names = @{ $self->_data_names//['data'] }; - my @label_names = @{ $self->_label_names//['softmax_label'] }; - my @state_names = @{ $self->state_names//[] }; - my $arg_names = $self->_symbol->list_arguments; - my @input_names = (@data_names, @label_names, @state_names); - my %input_names = map { $_ => 1 } @input_names; - $self->_p->_param_names([grep { not exists $input_names{$_} } @{ $arg_names }]); - $self->_p->_fixed_param_names($self->fixed_param_names//[]); - $self->_p->_state_names(\@state_names); - $self->_p->_aux_names($self->_symbol->list_auxiliary_states); - $self->_p->_data_names(\@data_names); 
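-    # cache the remaining name lists, then validate every user-supplied
-    # data/label/state/fixed-param name against the symbol's arguments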
- $self->_p->_label_names(\@label_names); - $self->_p->_output_names($self->_symbol->list_outputs); - $self->_p->_params_dirty(0); - $self->_check_input_names($self->_symbol, $self->_p->_data_names, "data", 1); - $self->_check_input_names($self->_symbol, $self->_p->_label_names, "label", 0); - $self->_check_input_names($self->_symbol, $self->_p->_state_names, "state", 1); - $self->_check_input_names($self->_symbol, $self->_p->_fixed_param_names, "fixed_param", 1); -} - -method Module(@args) { return @args ? __PACKAGE__->new(@args) : __PACKAGE__ } -method BucketingModule(@args) { return AI::MXNet::Module::Bucketing->new(@args) } - -=head2 load - - Create a model from previously saved checkpoint. - - Parameters - ---------- - $prefix : Str - path prefix of saved model files. You should have - "prefix-symbol.json", "prefix-xxxx.params", and - optionally "prefix-xxxx.states", where xxxx is the - epoch number. - $epoch : Int - epoch to load. - $load_optimizer_states=0 : Bool - whether to load optimizer states. Checkpoint needs - to have been made with save_optimizer_states=True. - :$data_names : array ref of str - Default is ['data'] for a typical model used in image classification. - :$label_names : array ref of str - Default is ['softmax_label'] for a typical model used in image - classification. - :$logger : Logger - Default is AI::MXNet::Logging. - :$context : Context or list of Context - Default is cpu(0). - :$work_load_list : array ref of number - Default is undef, indicating an uniform workload. - :$fixed_param_names: array ref of str - Default is undef, indicating no network parameters are fixed. -=cut - -method load( - Str $prefix, - Int $epoch, - Bool $load_optimizer_states=0, - %kwargs -) -{ - my ($sym, $args, $auxs) = __PACKAGE__->load_checkpoint($prefix, $epoch); - my $mod = $self->new(symbol => $sym, %kwargs); - $mod->_p->_arg_params($args); - $mod->_p->_aux_params($auxs); - $mod->params_initialized(1); - if($load_optimizer_states) - { - $mod->_p->_preload_opt_states(sprintf('%s-%04d.states', $prefix, $epoch)); - } - return $mod; -} - -=head2 save_checkpoint - - Save current progress to a checkpoint. - Use mx->callback->module_checkpoint as epoch_end_callback to save during training. - - Parameters - ---------- - $prefix : Str - The file prefix to checkpoint to - $epoch : Int - The current epoch number - $save_optimizer_states=0 : Bool - Whether to save optimizer states for later training -=cut - - -method save_checkpoint(Str $prefix, Int $epoch, Bool $save_optimizer_states=0) -{ - $self->_symbol->save("$prefix-symbol.json"); - my $param_name = sprintf('%s-%04d.params', $prefix, $epoch); - $self->save_params($param_name); - AI::MXNet::Logging->info('Saved checkpoint to "%s"', $param_name); - if($save_optimizer_states) - { - my $state_name = sprintf('%s-%04d.states', $prefix, $epoch); - $self->save_optimizer_states($state_name); - AI::MXNet::Logging->info('Saved optimizer state to "%s"', $state_name); - } -} - -=head2 model_save_checkpoint - - Checkpoint the model data into file. - - Parameters - ---------- - $prefix : Str - Prefix of model name. - $epoch : Int - The epoch number of the model. - $symbol : AI::MXNet::Symbol - The input symbol - $arg_params : HashRef[AI::MXNet::NDArray] - Model's parameters, hash ref of name to AI::MXNet::NDArray of net's weights. - $aux_params : HashRef[AI::MXNet::NDArray] - Model's parameters, hash ref of name to AI::MXNet::NDArray of net's auxiliary states. - Notes - ----- - - prefix-symbol.json will be saved for symbol. 
- - prefix-epoch.params will be saved for parameters. -=cut - -method model_save_checkpoint( - Str $prefix, - Int $epoch, - Maybe[AI::MXNet::Symbol] $symbol, - HashRef[AI::MXNet::NDArray] $arg_params, - HashRef[AI::MXNet::NDArray] $aux_params -) -{ - if(defined $symbol) - { - $symbol->save("$prefix-symbol.json"); - } - my $param_name = sprintf('%s-%04d.params', $prefix, $epoch); - $self->save_params($param_name, $arg_params, $aux_params); - AI::MXNet::Logging->info('Saved checkpoint to "%s"', $param_name); -} - -# Internal function to reset binded state. -method _reset_bind() -{ - $self->binded(0); - $self->_p->_exec_group(undef); - $self->_p->_data_shapes(undef); - $self->_p->_label_shapes(undef); -} - -method data_names() -{ - return $self->_p->_data_names; -} - -method label_names() -{ - return $self->_p->_label_names; -} - -method output_names() -{ - return $self->_p->_output_names; -} - -method data_shapes() -{ - assert($self->binded); - return $self->_p->_data_shapes; -} - -method label_shapes() -{ - assert($self->binded); - return $self->_p->_label_shapes; -} - -method output_shapes() -{ - assert($self->binded); - return $self->_p->_exec_group->get_output_shapes; -} - -method get_params() -{ - assert($self->binded and $self->params_initialized); - if($self->_p->_params_dirty) - { - $self->_sync_params_from_devices(); - } - return ($self->_p->_arg_params, $self->_p->_aux_params); -} - -method init_params( - Maybe[AI::MXNet::Initializer] :$initializer=AI::MXNet::Initializer->Uniform(scale => 0.01), - Maybe[HashRef[AI::MXNet::NDArray]] :$arg_params=, - Maybe[HashRef[AI::MXNet::NDArray]] :$aux_params=, - Bool :$allow_missing=0, - Bool :$force_init=0, - Bool :$allow_extra=0 -) -{ - if($self->params_initialized and not $force_init) - { - AI::MXNet::Logging->warning( - "Parameters already initialized and force_init=0. " - ."init_params call ignored." - ); - return; - } - assert($self->binded, 'call bind before initializing the parameters'); - my $_impl = sub { - my ($name, $arr, $cache) = @_; - # Internal helper for parameter initialization - if(defined $cache) - { - if(exists $cache->{$name}) - { - my $cache_arr = $cache->{$name}; - # just in case the cached array is just the target itself - if($cache_arr->handle ne $arr->handle) - { - $cache_arr->copyto($arr); - } - } - else - { - if(not $allow_missing) - { - confess("$name is not presented"); - } - if(defined $initializer) - { - $initializer->($name, $arr); - } - } - } - else - { - $initializer->($name, $arr) if defined $initializer; - } - }; - my $attrs = $self->_symbol->attr_dict; - while(my ($name, $arr) = each %{ $self->_p->_arg_params }) - { - $_impl->( - AI::MXNet::InitDesc->new( - name => $name, - ($attrs->{$name} ? (attrs => $attrs->{$name}) : ()) - ), - $arr, $arg_params - ); - } - while(my ($name, $arr) = each %{ $self->_p->_aux_params }) - { - $_impl->( - AI::MXNet::InitDesc->new( - name => $name, - ($attrs->{$name} ? 
(attrs => $attrs->{$name}) : ()) - ), - $arr, $aux_params - ); - } - $self->params_initialized(1); - $self->_p->_params_dirty(0); - - # copy the initialized parameters to devices - $self->_p->_exec_group->set_params($self->_p->_arg_params, $self->_p->_aux_params, $allow_extra); -} - -method set_params( - HashRef[AI::MXNet::NDArray] $arg_params, - HashRef[AI::MXNet::NDArray] $aux_params, - Bool :$allow_missing=0, - Bool :$force_init=1, - Bool :$allow_extra=0 -) -{ - if(not $allow_missing) - { - $self->init_params( - arg_params => $arg_params, aux_params => $aux_params, - allow_missing => $allow_missing, force_init => $force_init, - allow_extra => $allow_extra - ); - return; - } - - if($self->params_initialized and not $force_init) - { - AI::MXNet::Logging->warning( - "Parameters already initialized and force_init=False. " - ."set_params call ignored." - ); - return; - } - $self->_p->_exec_group->set_params($arg_params, $aux_params, $allow_extra); - $self->_p->_params_dirty(1); - $self->params_initialized(1); -} - -=head2 bind - - Bind the symbols to construct executors. This is necessary before one - can perform computation with the module. - - Parameters - ---------- - :$data_shapes : ArrayRef[AI::MXNet::DataDesc|NameShape] - Typically is $data_iter->provide_data. - :$label_shapes : Maybe[ArrayRef[AI::MXNet::DataDesc|NameShape]] - Typically is $data_iter->provide_label. - :$for_training : bool - Default is 1. Whether the executors should be bind for training. - :$inputs_need_grad : bool - Default is 0. Whether the gradients to the input data need to be computed. - Typically this is not needed. But this might be needed when implementing composition - of modules. - :$force_rebind : bool - Default is 0. This function does nothing if the executors are already - binded. But with this 1, the executors will be forced to rebind. - :$shared_module : Module - Default is undef. This is used in bucketing. When not undef, the shared module - essentially corresponds to a different bucket -- a module with different symbol - but with the same sets of parameters (e.g. unrolled RNNs with different lengths). -=cut - -method bind( - ArrayRef[AI::MXNet::DataDesc|NameShape] :$data_shapes, - Maybe[ArrayRef[AI::MXNet::DataDesc|NameShape]] :$label_shapes=, - Bool :$for_training=1, - Bool :$inputs_need_grad=0, - Bool :$force_rebind=0, - Maybe[AI::MXNet::Module] :$shared_module=, - GradReq|HashRef[GradReq]|ArrayRef[GradReq] :$grad_req='write', - Maybe[ArrayRef[Str]] :$state_names=$self->_p->_state_names -) -{ - # force rebinding is typically used when one want to switch from - # training to prediction phase. 
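-    # a rebind drops the cached executor group and data/label shapes via
-    # _reset_bind(); parameters that were already initialized are copied
-    # into the freshly created executors further below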
- if($force_rebind) - { - $self->_reset_bind(); - } - if($self->binded) - { - $self->logger->warning('Already binded, ignoring bind()'); - return; - } - $self->for_training($for_training); - $self->inputs_need_grad($inputs_need_grad); - $self->binded(1); - $self->_p->_grad_req($grad_req); - - if(not $for_training) - { - assert(not $inputs_need_grad); - } - ($data_shapes, $label_shapes) = $self->_parse_data_desc( - $self->data_names, $self->label_names, $data_shapes, $label_shapes - ); - $self->_p->_data_shapes($data_shapes); - $self->_p->_label_shapes($label_shapes); - my $shared_group; - if($shared_module) - { - assert($shared_module->binded and $shared_module->params_initialized); - $shared_group = $shared_module->_p->_exec_group; - } - - $self->_p->_exec_group( - AI::MXNet::DataParallelExecutorGroup->new( - symbol => $self->_symbol, - contexts => $self->_p->_context, - workload => $self->_p->_work_load_list, - data_shapes => $self->_p->_data_shapes, - label_shapes => $self->_p->_label_shapes, - param_names => $self->_p->_param_names, - state_names => $state_names, - for_training => $for_training, - inputs_need_grad => $inputs_need_grad, - shared_group => $shared_group, - logger => $self->logger, - fixed_param_names => $self->_p->_fixed_param_names, - grad_req => $grad_req - ) - ); - if($shared_module) - { - $self->params_initialized(1); - $self->_p->_arg_params($shared_module->_p->_arg_params); - $self->_p->_aux_params($shared_module->_p->_aux_params); - } - elsif($self->params_initialized) - { - # if the parameters are already initialized, we are re-binding - # so automatically copy the already initialized params - $self->_p->_exec_group->set_params($self->_p->_arg_params, $self->_p->_aux_params); - } - else - { - assert(not defined $self->_p->_arg_params and not $self->_p->_aux_params); - my @param_arrays = ( - map { AI::MXNet::NDArray->zeros($_->[0]->shape, dtype => $_->[0]->dtype, stype => $_->[0]->stype) } - @{ $self->_p->_exec_group->_p->param_arrays } - ); - my %arg_params; - @arg_params{ @{ $self->_p->_param_names } } = @param_arrays; - $self->_p->_arg_params(\%arg_params); - my @aux_arrays = ( - map { AI::MXNet::NDArray->zeros($_->[0]->shape, dtype => $_->[0]->dtype) } - @{ $self->_p->_exec_group->_p->aux_arrays } - ); - my %aux_params; - @aux_params{ @{ $self->_p->_aux_names } } = @aux_arrays; - $self->_p->_aux_params(\%aux_params); - } - if($shared_module and $shared_module->optimizer_initialized) - { - $self->borrow_optimizer($shared_module) - } -} - -=head2 reshape - - Reshape the module for new input shapes. - Parameters - ---------- - :$data_shapes : ArrayRef[AI::MXNet::DataDesc] - Typically is $data_iter->provide_data. - :$label_shapes= : Maybe[ArrayRef[AI::MXNet::DataDesc]] - Typically is $data_iter->provide_label. 
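-
-    For example (the batch size and input shape here are illustrative):
-
-        $mod->reshape(data_shapes => [['data', [128, 3, 224, 224]]]);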
-=cut - -method reshape( - ArrayRef[AI::MXNet::DataDesc|NameShape] :$data_shapes, - Maybe[ArrayRef[AI::MXNet::DataDesc|NameShape]] :$label_shapes= -) -{ - assert($self->binded); - ($data_shapes, $label_shapes) = $self->_parse_data_desc( - $self->data_names, $self->label_names, $data_shapes, $label_shapes - ); - $self->_p->_data_shapes($data_shapes); - $self->_p->_label_shapes($label_shapes); - $self->_p->_exec_group->reshape($self->_p->_data_shapes, $self->_p->_label_shapes); -} - -method init_optimizer( - Str|AI::MXNet::KVStore :$kvstore='local', - Optimizer :$optimizer='sgd', - HashRef :$optimizer_params={ learning_rate => 0.01 }, - Bool :$force_init=0 -) -{ - assert($self->binded and $self->params_initialized); - if($self->optimizer_initialized and not $force_init) - { - $self->logger->warning('optimizer already initialized, ignoring...'); - return; - } - if($self->_p->_params_dirty) - { - $self->_sync_params_from_devices; - } - - my ($kvstore, $update_on_kvstore) = _create_kvstore( - $kvstore, - scalar(@{$self->_p->_context}), - $self->_p->_arg_params - ); - my $batch_size = $self->_p->_exec_group->_p->batch_size; - if($kvstore and $kvstore->type =~ /dist/ and $kvstore->type =~ /_sync/) - { - $batch_size *= $kvstore->num_workers; - } - my $rescale_grad = 1/$batch_size; - - if(not blessed $optimizer) - { - my %idx2name; - if($update_on_kvstore) - { - @idx2name{ 0..@{$self->_p->_exec_group->param_names}-1 } = @{$self->_p->_exec_group->param_names}; - } - else - { - for my $k (0..@{$self->_p->_context}-1) - { - @idx2name{ map { $_ + $k } 0..@{$self->_p->_exec_group->param_names}-1 } = @{$self->_p->_exec_group->param_names}; - } - } - if(not exists $optimizer_params->{rescale_grad}) - { - $optimizer_params->{rescale_grad} = $rescale_grad; - } - $optimizer = AI::MXNet::Optimizer->create( - $optimizer, - sym => $self->symbol, - param_idx2name => \%idx2name, - %{ $optimizer_params } - ); - if($optimizer->rescale_grad != $rescale_grad) - { - AI::MXNet::Logging->warning( - "Optimizer created manually outside Module but rescale_grad " - ."is not normalized to 1.0/batch_size/num_workers (%s vs. %s). " - ."Is this intended?", - $optimizer->rescale_grad, $rescale_grad - ); - } - } - - $self->_p->_optimizer($optimizer); - $self->_p->_kvstore($kvstore); - $self->_p->_update_on_kvstore($update_on_kvstore); - $self->_p->_updater(undef); - - if($kvstore) - { - # copy initialized local parameters to kvstore - _initialize_kvstore( - kvstore => $kvstore, - param_arrays => $self->_p->_exec_group->_p->param_arrays, - arg_params => $self->_p->_arg_params, - param_names => $self->_p->_param_names, - update_on_kvstore => $update_on_kvstore - ); - } - if($update_on_kvstore) - { - $kvstore->set_optimizer($self->_p->_optimizer); - } - else - { - $self->_p->_updater(AI::MXNet::Optimizer->get_updater($optimizer)); - } - $self->optimizer_initialized(1); - - if($self->_p->_preload_opt_states) - { - $self->load_optimizer_states($self->_p->_preload_opt_states); - $self->_p->_preload_opt_states(undef); - } -} - -=head2 borrow_optimizer - - Borrow optimizer from a shared module. Used in bucketing, where exactly the same - optimizer (esp. kvstore) is used. 
- - Parameters - ---------- - shared_module : AI::MXNet::Module -=cut - -method borrow_optimizer(AI::MXNet::Module $shared_module) -{ - assert($shared_module->optimizer_initialized); - $self->_p->_optimizer($shared_module->_p->_optimizer); - $self->_p->_kvstore($shared_module->_p->_kvstore); - $self->_p->_update_on_kvstore($shared_module->_p->_update_on_kvstore); - $self->_p->_updater($shared_module->_p->_updater); - $self->optimizer_initialized(1); -} - -method forward( - AI::MXNet::DataBatch $data_batch, - Maybe[Bool] :$is_train= -) -{ - assert($self->binded and $self->params_initialized); - - my @curr_data_shapes = map { $_->shape } @{ $self->data_shapes }; - my @new_data_shapes = map { $_->shape } @{ $data_batch->data }; - if(Data::Dumper->Dump(\@curr_data_shapes) ne Data::Dumper->Dump(\@new_data_shapes)) - { - my $new_dshape; - if($data_batch->can('provide_data') and $data_batch->provide_data) - { - $new_dshape = $data_batch->provide_data; - } - else - { - $new_dshape = []; - for(zip($self->data_shapes, \@new_data_shapes)) { - my ($i, $shape) = @$_; - push @{ $new_dshape }, AI::MXNet::DataDesc->new( - $i->name, $shape, $i->dtype, $i->layout - ); - } - } - my $new_lshape; - if($data_batch->can('provide_label') and $data_batch->provide_label) - { - $new_lshape = $data_batch->provide_label; - } - elsif($data_batch->can('label') and $data_batch->label) - { - $new_lshape = []; - for(zip($self->label_shapes, $data_batch->label)) { - my ($i, $j) = @$_; - push @{ $new_lshape }, AI::MXNet::DataDesc->new( - $i->name, $j->shape, $i->dtype, $i->layout - ); - } - } - $self->reshape(data_shapes => $new_dshape, label_shapes => $new_lshape); - } - $self->_p->_exec_group->forward($data_batch, $is_train); -} - -method backward(Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]] $out_grads=) -{ - assert($self->binded and $self->params_initialized); - $self->_p->_exec_group->backward($out_grads); -} - -method update() -{ - assert($self->binded and $self->params_initialized and $self->optimizer_initialized); - $self->_p->_params_dirty(1); - if($self->_p->_update_on_kvstore) - { - _update_params_on_kvstore( - $self->_p->_exec_group->_p->param_arrays, - $self->_p->_exec_group->_p->grad_arrays, - $self->_p->_kvstore, - $self->_p->_exec_group->param_names - ); - } - else - { - _update_params( - $self->_p->_exec_group->_p->param_arrays, - $self->_p->_exec_group->_p->grad_arrays, - $self->_p->_updater, - scalar(@{ $self->_p->_context}), - $self->_p->_kvstore, - $self->_p->_exec_group->param_names - ); - } -} - -method get_outputs(Bool $merge_multi_context=1) -{ - assert($self->binded and $self->params_initialized); - return $self->_p->_exec_group->get_outputs($merge_multi_context); -} - -method get_input_grads(Bool $merge_multi_context=1) -{ - assert($self->binded and $self->params_initialized and $self->inputs_need_grad); - return $self->_p->_exec_group->get_input_grads($merge_multi_context); -} - -method get_states(Bool $merge_multi_context=1) -{ - assert($self->binded and $self->params_initialized); - return $self->_p->_exec_group->get_states($merge_multi_context); -} - -method set_states(:$states=, :$value=) -{ - assert($self->binded and $self->params_initialized); - return $self->_p->_exec_group->set_states($states, $value); -} - -method update_metric( - AI::MXNet::EvalMetric $eval_metric, - ArrayRef[AI::MXNet::NDArray] $labels -) -{ - $self->_p->_exec_group->update_metric($eval_metric, $labels); -} - -=head2 _sync_params_from_devices - - Synchronize parameters from devices to CPU. 
This function should be called after
-    calling 'update' that updates the parameters on the devices, before one can read the
-    latest parameters from $self->_arg_params and $self->_aux_params.
-=cut
-
-method _sync_params_from_devices()
-{
-    $self->_p->_exec_group->get_params($self->_p->_arg_params, $self->_p->_aux_params);
-    $self->_p->_params_dirty(0);
-}
-
-method save_optimizer_states(Str $fname)
-{
-    assert($self->optimizer_initialized);
-    if($self->_p->_update_on_kvstore)
-    {
-        $self->_p->_kvstore->save_optimizer_states($fname);
-    }
-    else
-    {
-        open(F, ">:raw", "$fname") or confess("can't open $fname for writing: $!");
-        print F $self->_p->_updater->get_states();
-        close(F);
-    }
-}
-
-method load_optimizer_states(Str $fname)
-{
-    assert($self->optimizer_initialized);
-    if($self->_p->_update_on_kvstore)
-    {
-        $self->_p->_kvstore->load_optimizer_states($fname);
-    }
-    else
-    {
-        open(F, "<:raw", "$fname") or confess("can't open $fname for reading: $!");
-        my $data;
-        { local($/) = undef; $data = <F>; }
-        close(F);
-        $self->_p->_updater->set_states($data);
-    }
-}
-
-method install_monitor(AI::MXNet::Monitor $mon)
-{
-    assert($self->binded);
-    $self->_p->_exec_group->install_monitor($mon);
-}
-
-method _updater()
-{
-    $self->_p->_updater;
-}
-
-method _kvstore()
-{
-    $self->_p->_kvstore;
-}
-
-method _arg_params()
-{
-    $self->_p->_arg_params;
-}
-
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Module/Base.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Module/Base.pm
deleted file mode 100644
index 6b572f4cceb5..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Module/Base.pm
+++ /dev/null
@@ -1,987 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
- -use strict; -use warnings; -package AI::MXNet::BatchEndParam; -use Mouse; -use AI::MXNet::Function::Parameters; -has [qw/epoch nbatch/] => (is => 'rw', isa => 'Int'); -has 'eval_metric' => (is => 'rw', isa => 'AI::MXNet::EvalMetric'); - -package AI::MXNet::Module::Base; -use Mouse; -use AI::MXNet::Base; -use Time::HiRes qw(time); -use Storable qw(dclone); - -=head1 NAME - - AI::MXNet::Module::Base - Base class for AI::MXNet::Module and AI::MXNet::Module::Bucketing -=cut - -func _as_list($obj) -{ - return [$obj] if ((ref($obj)//'') ne 'ARRAY'); - return $obj; -} - -# Check that all input names are in symbol's argument -method _check_input_names( - AI::MXNet::Symbol $symbol, - ArrayRef[Str] $names, - Str $typename, - Bool $throw -) -{ - my @candidates; - my %args = map { - push @candidates, $_ if not /_(?:weight|bias|gamma|beta)$/; - $_ => 1 - } @{ $symbol->list_arguments }; - for my $name (@$names) - { - my $msg; - if(not exists $args{$name} and $name ne 'softmax_label') - { - $msg = sprintf("\033[91mYou created Module with Module(..., %s_names=%s) but " - ."input with name '%s' is not found in symbol.list_arguments(). " - ."Did you mean one of:\n\t%s\033[0m", - $typename, "@$names", $name, join("\n\t", @candidates) - ); - if($throw) - { - confess($msg); - } - else - { - AI::MXNet::Logging->warning($msg); - } - } - } -} - -# Check that input names matches input data descriptors -method _check_names_match( - ArrayRef[Str] $data_names, - ArrayRef[NameShapeOrDataDesc] $data_shapes, - Str $name, - Bool $throw -) -{ - return if (not @$data_shapes and @$data_names == 1 and $data_names->[0] eq 'softmax_label'); - my @actual = sort map { @{$_}[0] } @{ $data_shapes }; - my @data_names = sort @$data_names; - if("@data_names" ne "@actual") - { - my $msg = sprintf( - "Data provided by %s_shapes don't match names specified by %s_names (%s vs. %s)", - $name, $name, "@actual", "@data_names" - ); - if($throw) - { - confess($msg); - } - else - { - AI::MXNet::Logging->warning($msg); - } - } -} - -method _parse_data_desc( - ArrayRef[Str] $data_names, - Maybe[ArrayRef[Str]] $label_names, - ArrayRef[NameShapeOrDataDesc] $data_shapes, - Maybe[ArrayRef[NameShapeOrDataDesc]] $label_shapes -) -{ - $data_shapes = [map { blessed $_ ? $_ : AI::MXNet::DataDesc->new(@$_) } @$data_shapes]; - $self->_check_names_match($data_names, $data_shapes, 'data', 1); - if($label_shapes) - { - $label_shapes = [map { blessed $_ ? $_ : AI::MXNet::DataDesc->new(@$_) } @$label_shapes]; - $self->_check_names_match($label_names, $label_shapes, 'label', 0); - } - else - { - $self->_check_names_match($label_names, [], 'label', 0); - } - return ($data_shapes, $label_shapes); -} - -=head1 DESCRIPTION - - The base class of a modules. A module represents a computation component. The design - purpose of a module is that it abstract a computation "machine", that one can run forward, - backward, update parameters, etc. We aim to make the APIs easy to use, especially in the - case when we need to use imperative API to work with multiple modules (e.g. stochastic - depth network). - - A module has several states: - - - Initial state. Memory is not allocated yet, not ready for computation yet. - - Binded. Shapes for inputs, outputs, and parameters are all known, memory allocated, - ready for computation. - - Parameter initialized. For modules with parameters, doing computation before initializing - the parameters might result in undefined outputs. - - Optimizer installed. An optimizer can be installed to a module. 
After this, the parameters - of the module can be updated according to the optimizer after gradients are computed - (forward-backward). - - In order for a module to interact with others, a module should be able to report the - following information in its raw stage (before binded) - - - data_names: array ref of string indicating the names of required data. - - output_names: array ref of string indicating the names of required outputs. - - And also the following richer information after binded: - - - state information - - binded: bool, indicating whether the memory buffers needed for computation - has been allocated. - - for_training: whether the module is binded for training (if binded). - - params_initialized: bool, indicating whether the parameters of this modules - has been initialized. - - optimizer_initialized: bool, indicating whether an optimizer is defined - and initialized. - - inputs_need_grad: bool, indicating whether gradients with respect to the - input data is needed. Might be useful when implementing composition of modules. - - - input/output information - - data_shapes: am array ref of [name, shape]. In theory, since the memory is allocated, - we could directly provide the data arrays. But in the case of data parallelization, - the data arrays might not be of the same shape as viewed from the external world. - - label_shapes: an array ref of [name, shape]. This might be [] if the module does - not need labels (e.g. it does not contains a loss function at the top), or a module - is not binded for training. - - output_shapes: an array ref of [name, shape] for outputs of the module. - - - parameters (for modules with parameters) - - get_params(): return an array ($arg_params, $aux_params). Each of those - is a hash ref of name to NDArray mapping. Those NDArrays always on - CPU. The actual parameters used for computing might be on other devices (GPUs), - this function will retrieve (a copy of) the latest parameters. Therefore, modifying - - get_params($arg_params, $aux_params): assign parameters to the devices - doing the computation. - - init_params(...): a more flexible interface to assign or initialize the parameters. - - - setup - - bind(): prepare environment for computation. - - init_optimizer(): install optimizer for parameter updating. - - - computation - - forward(data_batch): forward operation. - - backward(out_grads=): backward operation. - - update(): update parameters according to installed optimizer. - - get_outputs(): get outputs of the previous forward operation. - - get_input_grads(): get the gradients with respect to the inputs computed - in the previous backward operation. - - update_metric(metric, labels): update performance metric for the previous forward - computed results. - - - other properties (mostly for backward compatability) - - symbol: the underlying symbolic graph for this module (if any) - This property is not necessarily constant. For example, for AI::MXNet::Module::Bucketing, - this property is simply the *current* symbol being used. For other modules, - this value might not be well defined. 
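-
-    Taken together, a typical training loop over these intermediate-level calls
-    looks like the following sketch ($mod, $train_iter and $metric are assumed
-    to exist; the optimizer settings are illustrative):
-
-        $mod->bind(
-            data_shapes  => $train_iter->provide_data,
-            label_shapes => $train_iter->provide_label
-        );
-        $mod->init_params(initializer => AI::MXNet::Initializer->Uniform(scale => 0.01));
-        $mod->init_optimizer(optimizer => 'sgd', optimizer_params => { learning_rate => 0.01 });
-        for my $epoch (0..4)
-        {
-            $train_iter->reset;
-            while(my $batch = <$train_iter>)
-            {
-                $mod->forward($batch, is_train => 1);
-                $mod->backward();
-                $mod->update();
-                $mod->update_metric($metric, $batch->label);
-            }
-            $mod->save_checkpoint('model', $epoch);
-        }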
- - When those intermediate-level API are implemented properly, the following - high-level API will be automatically available for a module: - - - fit: train the module parameters on a data set - - predict: run prediction on a data set and collect outputs - - score: run prediction on a data set and evaluate performance -=cut - -has 'logger' => (is => 'rw', default => sub { AI::MXNet::Logging->get_logger }); -has '_symbol' => (is => 'rw', init_arg => 'symbol', isa => 'AI::MXNet::Symbol'); -has [ - qw/binded for_training inputs_need_grad - params_initialized optimizer_initialized/ -] => (is => 'rw', isa => 'Bool', init_arg => undef, default => 0); - -################################################################################ -# High Level API -################################################################################ - -=head2 forward_backward - - A convenient function that calls both forward and backward. -=cut - -method forward_backward(AI::MXNet::DataBatch $data_batch) -{ - $self->forward($data_batch, is_train => 1); - $self->backward(); -} - -=head2 score - - Run prediction on eval_data and evaluate the performance according to - eval_metric. - - Parameters - ---------- - $eval_data : AI::MXNet::DataIter - $eval_metric : AI::MXNet::EvalMetric - :$num_batch= : Maybe[Int] - Number of batches to run. Default is undef, indicating run until the AI::MXNet::DataIter - finishes. - :$batch_end_callback= : Maybe[Callback] - Could also be a array ref of functions. - :$reset=1 : Bool - Default 1, indicating whether we should reset $eval_data before starting - evaluating. - $epoch=0 : Int - Default is 0. For compatibility, this will be passed to callbacks (if any). During - training, this will correspond to the training epoch number. -=cut - -method score( - AI::MXNet::DataIter $eval_data, - EvalMetric|ArrayRef[EvalMetric] $eval_metric, - Maybe[Int] :$num_batch=, - Maybe[Callback]|ArrayRef[Callback] :$batch_end_callback=, - Maybe[Callback]|ArrayRef[Callback] :$score_end_callback=, - Bool :$reset=1, - Int :$epoch=0 -) -{ - assert($self->binded and $self->params_initialized); - $eval_data->reset if $reset; - if(not blessed $eval_metric or not $eval_metric->isa('AI::MXNet::EvalMetric')) - { - $eval_metric = AI::MXNet::Metric->create($eval_metric); - } - - $eval_metric->reset(); - my $actual_num_batch = 0; - my $nbatch = 0; - while(my $eval_batch = <$eval_data>) - { - last if (defined $num_batch and $nbatch == $num_batch); - $self->forward($eval_batch, is_train => 0); - $self->update_metric($eval_metric, $eval_batch->label); - - if (defined $batch_end_callback) - { - my $batch_end_params = AI::MXNet::BatchEndParam->new( - epoch => $epoch, - nbatch => $nbatch, - eval_metric => $eval_metric - ); - for my $callback (@{ _as_list($batch_end_callback) }) - { - $callback->($batch_end_params); - } - } - $actual_num_batch++; - $nbatch++ - } - if($score_end_callback) - { - my $params = AI::MXNet::BatchEndParam->new( - epoch => $epoch, - nbatch => $actual_num_batch, - eval_metric => $eval_metric, - ); - for my $callback (@{ _as_list($score_end_callback) }) - { - $callback->($params); - } - } - return $eval_metric->get_name_value; -} - -=head2 iter_predict - - Iterate over predictions. - - Parameters - ---------- - $eval_data : AI::MXNet::DataIter - :$num_batch= : Maybe[Int] - Default is undef, indicating running all the batches in the data iterator. - :$reset=1 : bool - Default is 1, indicating whether we should reset the data iter before start - doing prediction. 
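-
-    Example (a sketch; assumes $mod is a binded and initialized module and
-    $val_iter is an AI::MXNet::DataIter):
-
-        for my $pred ($mod->iter_predict($val_iter))
-        {
-            my ($outputs, $i_batch, $batch) = @$pred;
-            ## $outputs is an array ref of AI::MXNet::NDArray objects with
-            ## the padded examples of the last mini-batch already sliced off
-        }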
-=cut - -method iter_predict(AI::MXNet::DataIter $eval_data, Maybe[Int] :$num_batch=, Bool :$reset=1) -{ - assert($self->binded and $self->params_initialized); - if($reset) - { - $eval_data->reset; - } - my $nbatch = 0; - my @out; - while(my $eval_batch = <$eval_data>) - { - last if defined $num_batch and $nbatch == $num_batch; - $self->forward($eval_batch, is_train => 0); - my $pad = $eval_batch->pad; - my $outputs = [ - map { $_->slice([0, $_->shape->[0] - ($pad//0) - 1]) } @{ $self->get_outputs() } - ]; - push @out, [$outputs, $nbatch, $eval_batch]; - $nbatch++; - } - return @out; -} - -=head2 predict - - Run prediction and collect the outputs. - - Parameters - ---------- - $eval_data : AI::MXNet::DataIter|AcceptableInput (PDL|NDArray) - :$num_batch= : Maybe[Int] - Default is undef, indicating running all the batches in the data iterator. - :$merge_batches=1 : Bool - Default is 1. - :$reset=1 : Bool - Default is 1, indicating whether we should reset the data iter before start - doing prediction. - :$always_output_list=0 : Bool - Default is 0, see the doc for return values. - - Returns - ------- - If the input is AI::MXNet::NDArray|PDL then the return value is AI::MXNet::NDArray. - - When $merge_batches is 1 (by default), the return value will be an array ref - [$out1, $out2, $out3] where each element is concatenation of the outputs for - all the mini-batches. If $always_output_list` also is 0 (by default), - then in the case of a single output, $out1 is returned in stead of [$out1]. - - When $merge_batches is 0, the return value will be a nested array ref like - [[$out1_batch1, $out2_batch1], [$out1_batch2], ...]. This mode is useful because - in some cases (e.g. bucketing), the module does not necessarily produce the same - number of outputs. - - The objects in the results are AI::MXNet::NDArray`s. If you need to work with pdl array, - just call ->aspdl() on each AI::MXNet::NDArray. -=cut - -method predict( - AI::MXNet::DataIter|AcceptableInput $eval_data, - Maybe[Int] :$num_batch=, Bool :$merge_batches=1, Bool :$reset=1, Bool :$always_output_list=0 -) -{ - assert($self->binded and $self->params_initialized); - if(not blessed $eval_data or not $eval_data->isa('AI::MXNet::DataIter')) - { - if(not blessed $eval_data or not $eval_data->isa('AI::MXNet::NDArray')) - { - $eval_data = AI::MXNet::NDArray->array($eval_data); - } - $self->forward(AI::MXNet::DataBatch->new(data => [$eval_data])); - return $self->get_outputs->[0]; - } - $eval_data->reset() if $reset; - my @output_list; - my $nbatch = 0; - while(my $eval_batch = <$eval_data>) - { - last if defined $num_batch and $nbatch == $num_batch; - $self->forward($eval_batch, is_train => 0); - my $pad = $eval_batch->pad; - my $outputs = [map { $_->slice([0, $_->shape->[0]-($pad//0)-1])->copy } @{ $self->get_outputs }]; - push @output_list, $outputs; - } - return () unless @output_list; - if($merge_batches) - { - my $num_outputs = @{ $output_list[0] }; - for my $out (@output_list) - { - unless(@{ $out } == $num_outputs) - { - confess('Cannot merge batches, as num of outputs is not the same ' - .'in mini-batches. Maybe bucketing is used?'); - } - } - my @output_list2; - for my $i (0..$num_outputs-1) - { - push @output_list2, - AI::MXNet::NDArray->concatenate([map { $_->[$i] } @output_list]); - } - if($num_outputs == 1 and not $always_output_list) - { - return $output_list2[0]; - } - return @output_list2; - } - return @output_list; -} - -=head2 fit - - Train the module parameters. 
-
-    Parameters
-    ----------
-    $train_data : AI::MXNet::DataIter
-    :$eval_data= : Maybe[AI::MXNet::DataIter]
-        If not undef, it will be used as a validation set to evaluate the performance
-        after each epoch.
-    :$eval_metric='acc' : str or an AI::MXNet::EvalMetric subclass object.
-        Default is 'acc' (accuracy). The performance measure used to display during training.
-        Other possible predefined metrics are:
-        'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'.
-    :$epoch_end_callback= : Maybe[Callback]|ArrayRef[Callback], function or array ref of functions.
-        Each callback will be called with the current $epoch, $symbol, $arg_params
-        and $aux_params.
-    :$batch_end_callback= : Maybe[Callback]|ArrayRef[Callback], function or array ref of functions.
-        Each callback will be called with an AI::MXNet::BatchEndParam.
-    :$kvstore='local' : str or AI::MXNet::KVStore
-        Default is 'local'.
-    :$optimizer='sgd' : str or AI::MXNet::Optimizer
-        Default is 'sgd'.
-    :$optimizer_params : hash ref
-        Default is { learning_rate => 0.01 }.
-        The parameters for the optimizer constructor.
-    :$eval_end_callback= : Maybe[Callback]|ArrayRef[Callback], function or array ref of functions.
-        These will be called at the end of each full evaluation, with the metrics over
-        the entire evaluation set.
-    :$eval_batch_end_callback= : Maybe[Callback]|ArrayRef[Callback], function or array ref of functions.
-        These will be called at the end of each mini-batch during evaluation.
-    :$initializer= : Initializer
-        Will be called to initialize the module parameters if they are not already initialized.
-    :$arg_params= : hash ref
-        Default is undef. If not undef, it must hold existing parameters from a trained
-        model or loaded from a checkpoint (a previously saved model). In this case,
-        the values here will be used to initialize the module parameters, unless they
-        are already initialized by the user via a call to init_params or fit.
-        $arg_params has higher priority than $initializer.
-    :$aux_params= : hash ref
-        Default is undef. Similar to $arg_params, except for auxiliary states.
-    :$allow_missing=0 : Bool
-        Default is 0. Indicates whether we allow missing parameters when $arg_params
-        and $aux_params are defined. If this is 1, the missing parameters
-        will be initialized via the $initializer.
-    :$force_rebind=0 : Bool
-        Default is 0. Whether to force rebinding the executors if already binded.
-    :$force_init=0 : Bool
-        Default is 0. Indicates whether to force initialization even if the
-        parameters are already initialized.
-    :$begin_epoch=0 : Int
-        Default is 0. Indicates the starting epoch. Usually, if resuming from a
-        checkpoint saved at epoch N of a previous training phase, this value should
-        be specified as N+1.
-    :$num_epoch : Int
-        Number of epochs for the training.
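-
-    Example (a minimal sketch; $train_iter and $val_iter are hypothetical
-    AI::MXNet::DataIter objects):
-
-        $mod->fit(
-            $train_iter,
-            eval_data        => $val_iter,
-            eval_metric      => 'acc',
-            optimizer        => 'sgd',
-            optimizer_params => { learning_rate => 0.01 },
-            num_epoch        => 10
-        );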
-=cut - - -method fit( - AI::MXNet::DataIter $train_data, - Maybe[AI::MXNet::DataIter] :$eval_data=, - EvalMetric|ArrayRef[EvalMetric] :$eval_metric='acc', - Maybe[Callback]|ArrayRef[Callback] :$epoch_end_callback=, - Maybe[Callback]|ArrayRef[Callback] :$batch_end_callback=, - KVStore :$kvstore='local', - Optimizer :$optimizer='sgd', - HashRef :$optimizer_params={ learning_rate => 0.01 }, - Maybe[Callback]|ArrayRef[Callback] :$eval_end_callback=, - Maybe[Callback]|ArrayRef[Callback] :$eval_batch_end_callback=, - AI::MXNet::Initializer :$initializer=AI::MXNet::Initializer->Uniform(scale => 0.01), - Maybe[HashRef[AI::MXNet::NDArray]] :$arg_params=, - Maybe[HashRef[AI::MXNet::NDArray]] :$aux_params=, - Bool :$allow_missing=0, - Bool :$force_rebind=0, - Bool :$force_init=0, - Int :$begin_epoch=0, - Int :$num_epoch, - Maybe[EvalMetric|ArrayRef[EvalMetric]] :$validation_metric=, - Maybe[AI::MXNet::Monitor] :$monitor= -) -{ - $self->bind( - data_shapes => $train_data->provide_data, - label_shapes => $train_data->provide_label, - for_training => 1, - force_rebind => $force_rebind - ); - if($monitor) - { - $self->install_monitor($monitor); - } - $self->init_params( - initializer => $initializer, - arg_params => $arg_params, - aux_params => $aux_params, - allow_missing => $allow_missing, - force_init => $force_init - ); - $self->init_optimizer( - kvstore => $kvstore, - optimizer => $optimizer, - optimizer_params => $optimizer_params - ); - - if(not defined $validation_metric) - { - $validation_metric = $eval_metric; - } - $eval_metric = AI::MXNet::Metric->create($eval_metric) - unless blessed $eval_metric; - my $epoch_eval_metric = dclone($eval_metric); - - ################################################################################ - # training loop - ################################################################################ - for my $epoch ($begin_epoch..$num_epoch-1) - { - my $tic = time; - $eval_metric->reset; - $epoch_eval_metric->reset; - my $nbatch = 0; - my $end_of_batch = 0; - my $next_data_batch = <$train_data>; - while(not $end_of_batch) - { - my $data_batch = $next_data_batch; - $monitor->tic if $monitor; - $self->forward_backward($data_batch); - $self->update; - $next_data_batch = <$train_data>; - if(defined $next_data_batch) - { - $self->prepare($next_data_batch); - } - else - { - $end_of_batch = 1; - } - $self->update_metric($epoch_eval_metric, $data_batch->label); - $monitor->toc_print if $monitor; - if(defined $batch_end_callback) - { - $self->update_metric($eval_metric, $data_batch->label); - my $batch_end_params = AI::MXNet::BatchEndParam->new( - epoch => $epoch, - nbatch => $nbatch, - eval_metric => $eval_metric - ); - for my $callback (@{ _as_list($batch_end_callback) }) - { - $callback->($batch_end_params); - } - } - $nbatch++; - } - # one epoch of training is finished - my $name_value = $epoch_eval_metric->get_name_value; - while(my ($name, $val) = each %{ $name_value }) - { - $self->logger->info('Epoch[%d] Train-%s=%f', $epoch, $name, $val); - } - my $toc = time; - $self->logger->info('Epoch[%d] Time cost=%.3f', $epoch, ($toc-$tic)); - - # sync aux params across devices - my ($arg_params, $aux_params) = $self->get_params; - $self->set_params($arg_params, $aux_params); - - if($epoch_end_callback) - { - for my $callback (@{ _as_list($epoch_end_callback) }) - { - $callback->($epoch, $self->get_symbol, $arg_params, $aux_params); - } - } - #---------------------------------------- - # evaluation on validation set - if(defined $eval_data) - { - my $res = $self->score( - 
$eval_data, - $validation_metric, - score_end_callback => $eval_end_callback, - batch_end_callback => $eval_batch_end_callback, - epoch => $epoch - ); - #TODO: pull this into default - while(my ($name, $val) = each %{ $res }) - { - $self->logger->info('Epoch[%d] Validation-%s=%f', $epoch, $name, $val); - } - } - # end of 1 epoch, reset the data-iter for another epoch - $train_data->reset; - } -} - -################################################################################ -# Symbol information -################################################################################ - -=head2 get_symbol - - The symbol used by this module. -=cut -method get_symbol() { $self->symbol } - -=head2 data_names - - An array ref of names for data required by this module. -=cut -method data_names() { confess("NotImplemented") } - -=head2 output_names - - An array ref of names for the outputs of this module. -=cut -method output_names() { confess("NotImplemented") } - -################################################################################ -# Input/Output information -################################################################################ - -=head2 data_shapes - - An array ref of AI::MXNet::DataDesc objects specifying the data inputs to this module. -=cut -method data_shapes() { confess("NotImplemented") } - -=head2 label_shapes - - A array ref of AI::MXNet::DataDesc objects specifying the label inputs to this module. - If this module does not accept labels -- either it is a module without a loss - function, or it is not binded for training, then this should return an empty - array ref. -=cut -method label_shapes() { confess("NotImplemented") } - -=head2 output_shapes - - An array ref of (name, shape) array refs specifying the outputs of this module. -=cut -method output_shapes() { confess("NotImplemented") } - -################################################################################ -# Parameters of a module -################################################################################ - -=head2 get_params - - The parameters, these are potentially a copies of the actual parameters used - to do computation on the device. - - Returns - ------- - ($arg_params, $aux_params), a pair of hash refs of name to value mapping. -=cut - -method get_params() { confess("NotImplemented") } - -=head2 init_params - - Initialize the parameters and auxiliary states. - - Parameters - ---------- - :$initializer : Maybe[AI::MXNet::Initializer] - Called to initialize parameters if needed. - :$arg_params= : Maybe[HashRef[AI::MXNet::NDArray]] - If not undef, should be a hash ref of existing arg_params. - :$aux_params : Maybe[HashRef[AI::MXNet::NDArray]] - If not undef, should be a hash ref of existing aux_params. - :$allow_missing=0 : Bool - If true, params could contain missing values, and the initializer will be - called to fill those missing params. - :$force_init=0 : Bool - If true, will force re-initialize even if already initialized. - :$allow_extra=0 : Boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. 
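-
-    Example (a sketch against a concrete module implementation; the
-    mx->init->Xavier initializer choice and the $pretrained_args hash ref
-    are assumptions):
-
-        $mod->init_params(
-            initializer   => mx->init->Xavier(magnitude => 2),
-            arg_params    => $pretrained_args,
-            allow_missing => 1  # params absent from $pretrained_args fall back to the initializer
-        );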
-=cut - -method init_params( - Maybe[AI::MXNet::Initializer] :$initializer=AI::MXNet::Initializer->Uniform(0.01), - Maybe[HashRef[AI::MXNet::NDArray]] :$arg_params=, - Maybe[HashRef[AI::MXNet::NDArray]] :$aux_params=, - Bool :$allow_missing=0, - Bool :$force_init=0, - Bool :$allow_extra=0 -) -{ - confess("NotImplemented"); -} - -=head2 set_params - - Assign parameter and aux state values. - - Parameters - ---------- - $arg_params= : Maybe[HashRef[AI::MXNet::NDArray]] - Hash ref of name to value (NDArray) mapping. - $aux_params= : Maybe[HashRef[AI::MXNet::NDArray]] - Hash Ref of name to value (`NDArray`) mapping. - :$allow_missing=0 : Bool - If true, params could contain missing values, and the initializer will be - called to fill those missing params. - :$force_init=0 : Bool - If true, will force re-initialize even if already initialized. - :$allow_extra=0 : Bool - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. -=cut - -method set_params( - Maybe[HashRef[AI::MXNet::NDArray]] $arg_params=, - Maybe[HashRef[AI::MXNet::NDArray]] $aux_params=, - Bool :$allow_missing=0, - Bool :$force_init=0, - Bool :$allow_extra=0 -) -{ - $self->init_params( - initializer => undef, - arg_params => $arg_params, - aux_params => $aux_params, - allow_missing => $allow_missing, - force_init => $force_init, - allow_extra => $allow_extra - ); -} - -=head2 save_params - - Save model parameters to file. - - Parameters - ---------- - $fname : str - Path to output param file. - $arg_params= : Maybe[HashRef[AI::MXNet::NDArray]] - $aux_params= : Maybe[HashRef[AI::MXNet::NDArray]] -=cut - -method save_params( - Str $fname, - Maybe[HashRef[AI::MXNet::NDArray]] $arg_params=, - Maybe[HashRef[AI::MXNet::NDArray]] $aux_params= -) -{ - ($arg_params, $aux_params) = $self->get_params - unless (defined $arg_params and defined $aux_params); - my %save_dict; - while(my ($k, $v) = each %{ $arg_params }) - { - $save_dict{"arg:$k"} = $v->as_in_context(AI::MXNet::Context->cpu); - } - while(my ($k, $v) = each %{ $aux_params }) - { - $save_dict{"aux:$k"} = $v->as_in_context(AI::MXNet::Context->cpu); - } - AI::MXNet::NDArray->save($fname, \%save_dict); -} - -=head2 load_params - - Load model parameters from file. - - Parameters - ---------- - $fname : str - Path to input param file. -=cut - -method load_params(Str $fname) -{ - my %save_dict = %{ AI::MXNet::NDArray->load($fname) }; - my %arg_params; - my %aux_params; - while(my ($k, $v) = each %save_dict) - { - my ($arg_type, $name) = split(/:/, $k, 2); - if($arg_type eq 'arg') - { - $arg_params{ $name } = $v; - } - elsif($arg_type eq 'aux') - { - $aux_params{ $name } = $v; - } - else - { - confess("Invalid param file $fname"); - } - } - $self->set_params(\%arg_params, \%aux_params); -} - -=head2 get_states - - The states from all devices - - Parameters - ---------- - $merge_multi_context=1 : Bool - Default is true (1). In the case when data-parallelism is used, the states - will be collected from multiple devices. A true value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - If $merge_multi_context is 1, it is like [$out1, $out2]. Otherwise, it - is like [[$out1_dev1, $out1_dev2], [$out2_dev1, $out2_dev2]]. All the output - elements are AI::MXNet::NDArray. 
-=cut - -method get_states(Bool $merge_multi_context=1) -{ - assert($self->binded and $self->params_initialized); - assert(not $merge_multi_context); - return []; -} - -=head2 set_states - - Set value for states. You can specify either $states or $value, not both. - - Parameters - ---------- - $states= : Maybe[ArrayRef[ArrayRef[AI::MXNet::NDArray]]] - source states arrays formatted like [[$state1_dev1, $state1_dev2], - [$state2_dev1, $state2_dev2]]. - $value= : Maybe[Num] - a single scalar value for all state arrays. -=cut - -method set_states(Maybe[ArrayRef[ArrayRef[AI::MXNet::NDArray]]] $states=, Maybe[Num] $value=) -{ - assert($self->binded and $self->params_initialized); - assert(not $states and not $value); -} - - -=head2 install_monitor - - Install monitor on all executors - - Parameters - ---------- - $mon : AI::MXNet::Monitor -=cut - -method install_monitor(AI::MXNet::Monitor $mon) { confess("NotImplemented") } - -=head2 prepare - - Prepare the module for processing a data batch. - - Usually involves switching a bucket and reshaping. - - Parameters - ---------- - $data_batch : AI::MXNet::DataBatch -=cut - -method prepare(AI::MXNet::DataBatch $data_batch){} - -################################################################################ -# Computations -################################################################################ - -=head2 forward - - Forward computation. It supports data batches with different shapes, such as - different batch sizes or different image sizes. - If reshaping of data batch relates to modification of symbol or module, such as - changing image layout ordering or switching from training to predicting, module - rebinding is required. - - Parameters - ---------- - $data_batch : DataBatch - Could be anything with similar API implemented. - :$is_train= : Bool - Default is undef, which means is_train takes the value of $self->for_training. -=cut - -method forward(AI::MXNet::DataBatch $data_batch, Bool :$is_train=) { confess("NotImplemented") } - -=head2 backward - - Backward computation. - - Parameters - ---------- - $out_grads : Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]], optional - Gradient on the outputs to be propagated back. - This parameter is only needed when bind is called - on outputs that are not a loss function. -=cut - -method backward(Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]] $out_grads=) -{ - confess("NotImplemented") -} - -=head2 get_outputs - - The outputs of the previous forward computation. - - Parameters - ---------- - $merge_multi_context=1 : Bool -=cut - -method get_outputs(Bool $merge_multi_context=1) { confess("NotImplemented") } - -=head2 get_input_grads - - The gradients to the inputs, computed in the previous backward computation. - - Parameters - ---------- - $merge_multi_context=1 : Bool -=cut - -method get_input_grads(Bool $merge_multi_context=1) { confess("NotImplemented") } - -=head2 update - - Update parameters according to the installed optimizer and the gradients computed - in the previous forward-backward batch. -=cut - -method update() { confess("NotImplemented") } - -=head2 update_metric - - Evaluate and accumulate evaluation metric on outputs of the last forward computation. - - Parameters - ---------- - $eval_metric : EvalMetric - $labels : ArrayRef[AI::MXNet::NDArray] - Typically $data_batch->label. 
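-
-    Example (a sketch of a manual evaluation step; $metric is assumed to be
-    created with AI::MXNet::Metric->create('acc')):
-
-        $mod->forward($batch, is_train => 0);
-        $mod->update_metric($metric, $batch->label);
-        my %name_value = %{ $metric->get_name_value };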
-=cut - -method update_metric(EvalMetric $eval_metric, ArrayRef[AI::MXNet::NDArray] $labels) -{ - confess("NotImplemented") -} - -################################################################################ -# module setup -################################################################################ - -=head2 bind - - Binds the symbols in order to construct the executors. This is necessary - before the computations can be performed. - - Parameters - ---------- - $data_shapes : ArrayRef[AI::MXNet::DataDesc] - Typically is $data_iter->provide_data. - :$label_shapes= : Maybe[ArrayRef[AI::MXNet::DataDesc]] - Typically is $data_iter->provide_label. - :$for_training=1 : Bool - Default is 1. Whether the executors should be bind for training. - :$inputs_need_grad=0 : Bool - Default is 0. Whether the gradients to the input data need to be computed. - Typically this is not needed. But this might be needed when implementing composition - of modules. - :$force_rebind=0 : Bool - Default is 0. This function does nothing if the executors are already - binded. But with this as 1, the executors will be forced to rebind. - :$shared_module= : A subclass of AI::MXNet::Module::Base - Default is undef. This is used in bucketing. When not undef, the shared module - essentially corresponds to a different bucket -- a module with different symbol - but with the same sets of parameters (e.g. unrolled RNNs with different lengths). - :$grad_req='write' : Str|ArrayRef[Str]|HashRef[Str] - Requirement for gradient accumulation. Can be 'write', 'add', or 'null' - (defaults to 'write'). - Can be specified globally (str) or for each argument (array ref, hash ref). -=cut - -method bind( - ArrayRef[AI::MXNet::DataDesc] $data_shapes, - Maybe[ArrayRef[AI::MXNet::DataDesc]] :$label_shapes=, - Bool :$for_training=1, - Bool :$inputs_need_grad=0, - Bool :$force_rebind=0, - Maybe[AI::MXNet::BaseModule] :$shared_module=, - Str|ArrayRef[Str]|HashRef[Str] :$grad_req='write' -) -{ - confess("NotImplemented") -} - -=head2 init_optimizer - - Install and initialize optimizers. - - Parameters - ---------- - :$kvstore='local' : str or KVStore - :$optimizer='sgd' : str or Optimizer - :$optimizer_params={ learning_rate => 0.01 } : hash ref - :$force_init=0 : Bool -=cut - -method init_optimizer( - Str :$kvstore='local', - Optimizer :$optimizer='sgd', - HashRef :$optimizer_params={ learning_rate => 0.01 }, - Bool :$force_init=0 -) -{ - confess("NotImplemented") -} - -################################################################################ -# misc -################################################################################ - -=head2 symbol - - The symbol associated with this module. - - Except for AI::MXNet::Module, for other types of modules (e.g. AI::MXNet::Module::Bucketing), this - property might not be a constant throughout its life time. Some modules might - not even be associated with any symbols. -=cut - -method symbol() -{ - return $self->_symbol; -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Module/Bucketing.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Module/Bucketing.pm deleted file mode 100644 index a8c482e3f178..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Module/Bucketing.pm +++ /dev/null @@ -1,497 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Module::Bucketing; -use Mouse; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Base; - -=encoding UTF-8 - -=head1 NAME - -AI::MXNet::Module::Bucketing - -=head1 SYNOPSIS - -=head1 DESCRIPTION - - Implements the AI::MXNet::Module::Base API, and allows multiple - symbols to be used depending on the `bucket_key` provided by each different - mini-batch of data -=cut - - -=head2 new - - Parameters - ---------- - $sym_gen : subref or any perl object that overloads &{} op - A sub when called with a bucket key, returns a list with triple - of ($symbol, $data_names, $label_names). - $default_bucket_key : str or anything else - The key for the default bucket. - $logger : Logger - $context : AI::MXNet::Context or array ref of AI::MXNet::Context objects - Default is cpu(0) - $work_load_list : array ref of Num - Default is undef, indicating uniform workload. - $fixed_param_names: arrayref of str - Default is undef, indicating no network parameters are fixed. - $state_names : arrayref of str - states are similar to data and label, but not provided by data iterator. - Instead they are initialized to 0 and can be set by set_states() -=cut - -extends 'AI::MXNet::Module::Base'; -has '_sym_gen' => (is => 'ro', init_arg => 'sym_gen', required => 1); -has '_default_bucket_key' => (is => 'rw', init_arg => 'default_bucket_key', required => 1); -has '_context' => ( - is => 'ro', isa => 'AI::MXNet::Context|ArrayRef[AI::MXNet::Context]', - lazy => 1, default => sub { AI::MXNet::Context->cpu }, - init_arg => 'context' -); -has '_work_load_list' => (is => 'rw', init_arg => 'work_load_list', isa => 'ArrayRef[Num]'); -has '_curr_module' => (is => 'rw', init_arg => undef); -has '_curr_bucket_key' => (is => 'rw', init_arg => undef); -has '_buckets' => (is => 'rw', init_arg => undef, default => sub { +{} }); -has '_fixed_param_names' => (is => 'rw', isa => 'ArrayRef[Str]', init_arg => 'fixed_param_names'); -has '_state_names' => (is => 'rw', isa => 'ArrayRef[Str]', init_arg => 'state_names'); -has '_params_dirty' => (is => 'rw', init_arg => undef); - -sub BUILD -{ - my ($self, $original_params) = @_; - $self->_fixed_param_names([]) unless defined $original_params->{fixed_param_names}; - $self->_state_names([]) unless defined $original_params->{state_names}; - $self->_params_dirty(0); - my ($symbol, $data_names, $label_names) = $self->_sym_gen->($self->_default_bucket_key); - $self->_check_input_names($symbol, $data_names//[], "data", 1); - $self->_check_input_names($symbol, $label_names//[], "label", 0); - $self->_check_input_names($symbol, $self->_state_names, "state", 1); - $self->_check_input_names($symbol, $self->_fixed_param_names, "fixed_param", 1); -} - -method _reset_bind() -{ - $self->binded(0); - $self->_buckets({}); - $self->_curr_module(undef); - $self->_curr_bucket_key(undef); -} - -method data_names() -{ - if($self->binded) - { - return $self->_curr_module->data_names; - } - else - { - return 
($self->_sym_gen->($self->_default_bucket_key))[1];
-    }
-}
-
-method output_names()
-{
-    if($self->binded)
-    {
-        return $self->_curr_module->output_names;
-    }
-    else
-    {
-        my ($symbol) = $self->_sym_gen->($self->_default_bucket_key);
-        return $symbol->list_outputs;
-    }
-}
-
-method data_shapes()
-{
-    assert($self->binded);
-    return $self->_curr_module->data_shapes;
-}
-
-method label_shapes()
-{
-    assert($self->binded);
-    return $self->_curr_module->label_shapes;
-}
-
-method output_shapes()
-{
-    assert($self->binded);
-    return $self->_curr_module->output_shapes;
-}
-
-method get_params()
-{
-    assert($self->binded and $self->params_initialized);
-    $self->_curr_module->_p->_params_dirty($self->_params_dirty);
-    my ($arg_params, $aux_params) = $self->_curr_module->get_params;
-    $self->_params_dirty(0);
-    return ($arg_params, $aux_params);
-}
-
-method set_params(
-    HashRef[AI::MXNet::NDArray] $arg_params,
-    HashRef[AI::MXNet::NDArray] $aux_params,
-    Bool $allow_missing=0,
-    Bool $force_init=1,
-    Bool $allow_extra=0
-)
-{
-    if(not $allow_missing)
-    {
-        $self->init_params(
-            arg_params => $arg_params, aux_params => $aux_params,
-            allow_missing => $allow_missing, force_init => $force_init,
-            allow_extra => $allow_extra
-        );
-        return;
-    }
-    if($self->params_initialized and not $force_init)
-    {
-        AI::MXNet::Logging->warning(
-            "Parameters already initialized and force_init=0. "
-            ."set_params call ignored."
-        );
-        return;
-    }
-    $self->_curr_module->set_params(
-        $arg_params, $aux_params,
-        allow_missing => $allow_missing,
-        force_init => $force_init,
-        allow_extra => $allow_extra
-    );
-    # the parameters were written directly to the current module,
-    # so the cached copy is dirty now.
-    $self->_params_dirty(1);
-    $self->params_initialized(1);
-}
-
-method init_params(
-    AI::MXNet::Initializer :$initializer=AI::MXNet::Initializer->Uniform(scale => 0.01),
-    Maybe[HashRef[AI::MXNet::NDArray]] :$arg_params=,
-    Maybe[HashRef[AI::MXNet::NDArray]] :$aux_params=,
-    Bool :$allow_missing=0,
-    Bool :$force_init=0,
-    Bool :$allow_extra=0
-)
-{
-    return if($self->params_initialized and not $force_init);
-    assert($self->binded, 'call bind before initializing the parameters');
-    $self->_curr_module->init_params(
-        initializer => $initializer,
-        arg_params => $arg_params,
-        aux_params => $aux_params,
-        allow_missing => $allow_missing,
-        force_init => $force_init,
-        allow_extra => $allow_extra
-    );
-    $self->_params_dirty(0);
-    $self->params_initialized(1);
-}
-
-method get_states(Bool $merge_multi_context=1)
-{
-    assert($self->binded and $self->params_initialized);
-    $self->_curr_module->get_states($merge_multi_context);
-}
-
-method set_states(:$states=, :$value=)
-{
-    assert($self->binded and $self->params_initialized);
-    $self->_curr_module->set_states(states => $states, value => $value);
-}
-
-=head2 bind
-
-    Binding for an AI::MXNet::Module::Bucketing means setting up the buckets and binding
-    the executor for the default bucket key. Executors corresponding to other keys are
-    binded afterwards with switch_bucket.
-
-    Parameters
-    ----------
-    :$data_shapes : ArrayRef[AI::MXNet::DataDesc|NameShape]
-        This should correspond to the symbol for the default bucket.
-    :$label_shapes= : Maybe[ArrayRef[AI::MXNet::DataDesc|NameShape]]
-        This should correspond to the symbol for the default bucket.
-    :$for_training : Bool
-        Default is 1.
-    :$inputs_need_grad : Bool
-        Default is 0.
-    :$force_rebind : Bool
-        Default is 0.
-    :$shared_module : AI::MXNet::Module::Bucketing
-        Default is undef. This value is currently not used.
- :$grad_req : str, array ref of str, hash ref of str to str - Requirement for gradient accumulation. Can be 'write', 'add', or 'null' - (defaults to 'write'). - Can be specified globally (str) or for each argument (array ref, hash ref). - :$bucket_key : str - bucket key for binding. by default is to use the ->default_bucket_key -=cut - -method bind( - ArrayRef[AI::MXNet::DataDesc|NameShape] :$data_shapes, - Maybe[ArrayRef[AI::MXNet::DataDesc|NameShape]] :$label_shapes=, - Bool :$for_training=1, - Bool :$inputs_need_grad=0, - Bool :$force_rebind=0, - Maybe[AI::MXNet::BaseModule] :$shared_module=, - Str|ArrayRef[Str]|HashRef[Str] :$grad_req='write', - Maybe[Str] :$bucket_key= -) -{ - # in case we already initialized params, keep it - my ($arg_params, $aux_params); - if($self->params_initialized) - { - ($arg_params, $aux_params) = $self->get_params; - } - - # force rebinding is typically used when one want to switch from - # training to prediction phase. - $self->_reset_bind if $force_rebind; - - if($self->binded) - { - $self->logger->warning('Already binded, ignoring bind()'); - return; - } - - assert((not defined $shared_module), 'shared_module for BucketingModule is not supported'); - - $self->for_training($for_training); - $self->inputs_need_grad($inputs_need_grad); - $self->binded(1); - - my ($symbol, $data_names, $label_names) = $self->_sym_gen->($bucket_key//$self->_default_bucket_key); - my $module = AI::MXNet::Module->new( - symbol => $symbol, - data_names => $data_names, - label_names => $label_names, - logger => $self->logger, - context => $self->_context, - work_load_list => $self->_work_load_list, - state_names => $self->_state_names, - fixed_param_names => $self->_fixed_param_names - ); - $module->bind( - data_shapes => $data_shapes, - label_shapes => $label_shapes, - for_training => $for_training, - inputs_need_grad => $inputs_need_grad, - force_rebind => 0, - shared_module => undef, - grad_req => $grad_req - ); - $self->_curr_module($module); - $self->_curr_bucket_key($self->_default_bucket_key); - $self->_buckets->{ $self->_default_bucket_key } = $module; - - # copy back saved params, if already initialized - if($self->params_initialized) - { - $self->set_params($arg_params, $aux_params); - } -} - -=head2 switch_bucket - - Switch to a different bucket. This will change $self->_curr_module. - - Parameters - ---------- - :$bucket_key : str (or any perl object that overloads "" op) - The key of the target bucket. - :$data_shapes : Maybe[ArrayRef[AI::MXNet::DataDesc|NameShape]] - Typically $data_batch->provide_data. - :$label_shapes : Maybe[ArrayRef[AI::MXNet::DataDesc|NameShape]] - Typically $data_batch->provide_label. 
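-
-    Example (a sketch; the $sym_gen sub and the sequence-length bucket keys
-    are hypothetical, as in a typical bucketing RNN setup):
-
-        my $mod = AI::MXNet::Module::Bucketing->new(
-            sym_gen            => $sym_gen, # returns ($symbol, $data_names, $label_names)
-            default_bucket_key => 60
-        );
-        ## after bind() and init_params(), forward() switches buckets
-        ## implicitly; switch_bucket() can also be called directly:
-        $mod->switch_bucket(bucket_key => 30, data_shapes => $batch->provide_data);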
-=cut - -method switch_bucket( - Maybe[ArrayRef[AI::MXNet::DataDesc|NameShape]] :$data_shapes=, - Maybe[ArrayRef[AI::MXNet::DataDesc|NameShape]] :$label_shapes=, - :$bucket_key -) -{ - assert($self->binded, 'call bind before switching bucket'); - if(not exists $self->_buckets->{ $bucket_key }) - { - my ($symbol, $data_names, $label_names) = $self->_sym_gen->($bucket_key); - my $module = AI::MXNet::Module->new( - symbol => $symbol, - data_names => $data_names, - label_names => $label_names, - logger => $self->logger, - context => $self->_context, - work_load_list => $self->_work_load_list - ); - $module->bind( - data_shapes => $data_shapes, - label_shapes => $label_shapes, - for_training => $self->_curr_module->for_training, - inputs_need_grad => $self->_curr_module->inputs_need_grad, - force_rebind => 0, - shared_module => $self->_buckets->{ $self->_default_bucket_key }, - ); - $self->_buckets->{ $bucket_key } = $module; - } - $self->_curr_module($self->_buckets->{ $bucket_key }); - $self->_curr_bucket_key($bucket_key); -} - -method init_optimizer( - Str :$kvstore='local', - Optimizer :$optimizer='sgd', - HashRef :$optimizer_params={ learning_rate => 0.01 }, - Bool :$force_init=0 -) -{ - assert($self->binded and $self->params_initialized); - if($self->optimizer_initialized and not $force_init) - { - $self->logger->warning('optimizer already initialized, ignoring.'); - return; - } - - $self->_curr_module->init_optimizer( - kvstore => $kvstore, - optimizer => $optimizer, - optimizer_params => $optimizer_params, - force_init => $force_init - ); - for my $mod (values %{ $self->_buckets }) - { - if($mod ne $self->_curr_module) - { - $mod->borrow_optimizer($self->_curr_module); - } - } - $self->optimizer_initialized(1); -} - -method prepare(AI::MXNet::DataBatch $data_batch) -{ - assert($self->binded and $self->params_initialized); - ## perform bind if have not done so yet - my $original_bucket_key = $self->_curr_bucket_key; - $self->switch_bucket( - bucket_key => $data_batch->bucket_key, - data_shapes => $data_batch->provide_data, - label_shapes => $data_batch->provide_label - ); - # switch back - $self->switch_bucket(bucket_key => $original_bucket_key); -} - -method forward( - AI::MXNet::DataBatch $data_batch, - Bool :$is_train= -) -{ - assert($self->binded and $self->params_initialized); - $self->switch_bucket( - bucket_key => $data_batch->bucket_key, - data_shapes => $data_batch->provide_data, - label_shapes => $data_batch->provide_label - ); - $self->_curr_module->forward($data_batch, is_train => $is_train); -} - -method backward(Maybe[ArrayRef[AI::MXNet::NDArray]|AI::MXNet::NDArray] $out_grads=) -{ - assert($self->binded and $self->params_initialized); - $self->_curr_module->backward($out_grads); -} - -method update() -{ - assert($self->binded and $self->params_initialized and $self->optimizer_initialized); - $self->_params_dirty(1); - $self->_curr_module->update; -} - -method get_outputs(Bool $merge_multi_context=1) -{ - assert($self->binded and $self->params_initialized); - return $self->_curr_module->get_outputs($merge_multi_context); -} - -method get_input_grads(Bool $merge_multi_context=1) -{ - assert($self->binded and $self->params_initialized and $self->inputs_need_grad); - return $self->_curr_module->get_input_grads($merge_multi_context); -} - -method update_metric( - AI::MXNet::EvalMetric $eval_metric, - ArrayRef[AI::MXNet::NDArray] $labels -) -{ - assert($self->binded and $self->params_initialized); - $self->_curr_module->update_metric($eval_metric, $labels); -} - -method symbol() 
-{ - assert($self->binded); - return $self->_curr_module->symbol; -} - -method get_symbol() -{ - assert($self->binded); - return $self->_buckets->{ $self->_default_bucket_key }->symbol; -} - -method install_monitor(AI::MXNet::Monitor $mon) -{ - assert($self->binded); - for my $mod (values %{ $self->_buckets }) - { - $mod->install_monitor($mon); - } -} - -=head2 save_checkpoint - - Save current progress to a checkpoint. - Use mx->callback->module_checkpoint as epoch_end_callback to save during training. - - Parameters - ---------- - prefix : str - The file prefix to checkpoint to - epoch : int - The current epoch number - save_optimizer_states : bool - Whether to save optimizer states for later training -=cut - - -method save_checkpoint(Str $prefix, Int $epoch, Bool $save_optimizer_states=0) -{ - my %buckets = %{ $self->_buckets }; - while(my ($key, $module) = each %buckets) - { - $module->save_checkpoint("${prefix}_$key", $epoch, $save_optimizer_states); - } -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Monitor.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Monitor.pm deleted file mode 100644 index 9e4a96849a00..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Monitor.pm +++ /dev/null @@ -1,210 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Monitor; -use Mouse; -use AI::MXNet::NS; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Base; - -=head1 NAME - - AI::MXNet::Monitor - Monitor outputs, weights, and gradients for debugging. - -=head1 DESCRIPTION - - Monitor outputs, weights, and gradients for debugging. - - Parameters - ---------- - interval : Int - Number of batches between printing. - stat_func : CodeRef - a function that computes statistics of tensors. - Takes a NDArray and returns a NDArray. defaults to mean - absolute value |x|/size(x). - pattern : Str - A regular expression specifying which tensors to monitor. - Only tensors with names that match name_pattern will be included. - For example, '.*weight|.*output' will print all weights and outputs; - '.*backward.*' will print all gradients. -=cut - -has 'interval' => (is => 'ro', isa => 'Int', required => 1); -has 'stat_func' => ( - is => 'ro', - isa => 'CodeRef', - default => sub { - return sub { - # returns |x|/size(x), async execution. 
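-            ## more precisely: the default below computes $x->norm/sqrt($x->size),
-            ## i.e. the root-mean-square magnitude of x (an L2-based statistic,
-            ## not sum(|x|)/size(x)); it stays on the device as an NDArray op,
-            ## hence the asynchronous execution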
- my ($x) = @_; - return $x->norm/sqrt($x->size); - } - }, - lazy => 1 -); -has 'pattern' => (is => 'ro', isa => 'Str', default => '.*'); -has '_sort' => (is => 'ro', isa => 'Bool', init_arg => 'sort', default => 0); -has [qw/queue exes/] => (is => 'rw', init_arg => undef, default => sub { [] }); -has [qw/step activated/] => (is => 'rw', init_arg => undef, default => 0); -has 're_pattern' => ( - is => 'ro', - init_arg => undef, - default => sub { - my $pattern = shift->pattern; - my $re = eval { qr/$pattern/ }; - confess("pattern $pattern failed to compile as a regexp $@") - if $@; - return $re; - }, - lazy => 1 -); -has 'stat_helper' => ( - is => 'ro', - init_arg => undef, - default => sub { - my $self = shift; - return sub { - my ($name, $handle) = @_; - return if(not $self->activated or not $name =~ $self->re_pattern); - my $array = AI::MXNet::NDArray->_ndarray_cls($handle, 0); - push @{ $self->queue }, [$self->step, $name, $self->stat_func->($array)]; - } - }, - lazy => 1 -); - -=head2 install - - install callback to executor. - Supports installing to multiple exes. - - Parameters - ---------- - $exe : AI::MXNet::Executor - the Executor (returned by $symbol->bind) to install to. -=cut - -method install(AI::MXNet::Executor $exe) -{ - $exe->set_monitor_callback($self->stat_helper); - push @{ $self->exes }, $exe; -} - -=head2 tic - - start collecting stats for current batch. - Call before forward -=cut - -method tic() -{ - if ($self->step % $self->interval == 0) - { - for my $exe (@{ $self->exes }) - { - $_->wait_to_read for @{ $exe->arg_arrays }; - $_->wait_to_read for @{ $exe->aux_arrays }; - } - $self->queue([]); - $self->activated(1); - } - $self->step($self->step + 1); -} - -=head2 toc - - End collecting for current batch and return results. - Call after computation of current batch. - - Returns - ------- - res : array ref of array refs with debug info -=cut - -method toc() -{ - return [] unless $self->activated; - for my $exe (@{ $self->exes }) - { - $_->wait_to_read for @{ $exe->arg_arrays }; - $_->wait_to_read for @{ $exe->aux_arrays }; - } - for my $exe (@{ $self->exes }) - { - for(zip($exe->_symbol->list_arguments, $exe->arg_arrays)) { - my ($name, $array) = @$_; - push @{ $self->queue }, [$self->step, $name, $self->stat_func->($array)]; - } - for(zip($exe->_symbol->list_auxiliary_states, $exe->aux_arrays)) { - my ($name, $array) = @$_; - push @{ $self->queue }, [$self->step, $name, $self->stat_func->($array)]; - } - } - $self->activated(0); - my @res; - if($self->_sort) - { - @{ $self->queue } = sort { $a->[1] cmp $b->[1] } @{ $self->queue }; - } - for my $q (@{ $self->queue }) - { - my ($n, $k, $v_list) = @{ $q }; - if(ref $v_list ne 'ARRAY') - { - $v_list = [$v_list]; - } - my $s = ''; - for my $v (@{ $v_list }) - { - confess("the argument must be NDArray") - unless blessed($v) and $v->isa('AI::MXNet::NDArray'); - if($v->size == 1) - { - $s .= $v->asscalar . "\t"; - } - else - { - $s .= $v->aspdl . "\t"; - } - } - push @res, [$n, $k, $s]; - } - $self->queue([]); - return \@res; -} - -=head2 toc_print - - End collecting and print results -=cut - -method toc_print() -{ - my $res = $self->toc; - for my $r (@{ $res }) - { - AI::MXNet::Logging->info('Batch: %7d %30s %s', @{ $r }); - } -} - -method Monitor(@args) -{ - __PACKAGE__->new(@args % 2 ? 
('interval', @args) : @args);
-}
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
deleted file mode 100644
index b9268bf0fabc..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
+++ /dev/null
@@ -1,1679 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::NDArray;
-
-=head1 NAME
-
-    AI::MXNet::NDArray - Multidimensional tensor object of MXNet.
-=cut
-
-=head1 DESCRIPTION
-
-    AI::MXNet::NDArray - Imperative tensor operations on CPU/GPU.
-    In AI::MXNet, NDArray is the core data structure for all mathematical computations.
-    An NDArray represents a multidimensional, fixed-size homogeneous array.
-    If you're familiar with PDL, you might notice some similarities.
-    However, NDArray is row-major, unlike PDL, which is column-major.
-    Like PDL, MXNet's NDArray enables imperative computation.
-
-    Some advantages of NDArray compared to PDL:
-    MXNet's NDArray supports fast execution on a wide range of hardware configurations, including CPU, GPU, and multi-GPU machines.
-    MXNet also scales to distributed systems in the cloud.
-    MXNet's NDArray executes code lazily, allowing it to automatically parallelize multiple operations across the available hardware.
-
-    An NDArray is a multidimensional array of numbers of the same type.
-    We could represent the coordinates of a point in 3D space, e.g. [2, 1, 6], as a 1D array with shape [3].
-    Similarly, we could represent a 2D array.
-    Below, we present an array with length 2 along the first axis and length 3 along the second axis.
-
-        [[0, 1, 2]
-         [3, 4, 5]]
-
-    Note that the use of 'dimension' is overloaded here. When we say a 2D array, we mean an array with 2 axes, not an array with two components.
-
-    Each NDArray supports some important attributes that you'll often want to query:
-
-    $ndarray->shape: The dimensions of the array.
-    It is an array ref of integers indicating the length of the array along each axis.
-    For a matrix with $n rows and $m columns, its shape will be [$n, $m].
-    $ndarray->dtype: A string describing the type of its elements.
-    Dtype (defined in AI::MXNet::Types) is one of (float32 float64 float16 uint8 int8 int32 int64).
-    $ndarray->size: The total number of components in the array, equal to the product of the elements of its shape.
-    $ndarray->context: The device on which this array is stored, represented by an object of the AI::MXNet::Context class, e.g. cpu() or gpu(1).
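-
-    A short example (a sketch; CPU context and the default float32 dtype are assumed):
-
-        use AI::MXNet qw(mx);
-        my $a = mx->nd->array([[0, 1, 2], [3, 4, 5]]);  # 2x3 NDArray
-        print "@{ $a->shape }";         # 2 3
-        print $a->dtype;                # float32
-        print $a->context;              # cpu(0)
-        my $pdl = ($a * 2 + 1)->aspdl;  # copy the result into a PDL array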
- -=cut - -use strict; -use warnings; -use AI::MXNet::NS; -use AI::MXNet::Base; -use AI::MXNet::NDArray::Slice; -use AI::MXNet::Context; -use AI::MXNet::RunTime; -use Mouse; -use AI::MXNet::Function::Parameters; -use overload ( - '""' => \&stringify, - '+' => \&add, - '+=' => \&iadd, - '-' => \&subtract, - '-=' => \&isubtract, - '*' => \&multiply, - '*=' => \&imultiply, - '/' => \÷, - '/=' => \&idivide, - '%' => \&modulo, - '%=' => \&imodulo, - '**' => \&power, - '==' => \&equal, - '!=' => \¬_equal, - '>' => \&greater, - '>=' => \&greater_equal, - '<' => \&lesser, - '<=' => \&lesser_equal, - '.=' => \&set, - '@{}'=> \&split_array, - '=' => sub { $_[0] }, - - 'sqrt' => sub { $_[0]->sqrt() }, - 'abs' => sub { $_[0]->abs() }, - 'sin' => sub { $_[0]->sin() }, - 'cos' => sub { $_[0]->cos() }, - 'atan2' => \&atan2, - 'log' => sub { $_[0]->log() }, - 'exp' => sub { $_[0]->exp() }, -); - -extends 'AI::MXNet::NDArray::Base'; -has 'writable' => (is => 'rw', isa => 'Int', default => 1, lazy => 1); -has 'handle' => (is => 'rw', isa => 'NDArrayHandle', required => 1); - -sub DEMOLISH -{ - check_call(AI::MXNetCAPI::NDArrayFree(shift->handle)); -} - -method STORABLE_freeze($cloning) -{ - my $buf = check_call(AI::MXNetCAPI::NDArraySaveRawBytes($self->handle)); - return ($buf,\ $self->writable); -} - -method STORABLE_thaw($cloning, $buf, $writable) -{ - my $handle = check_call( - AI::MXNetCAPI::NDArrayLoadFromRawBytes( - $buf, length($buf) - ) - ); - $self->handle($handle); - $self->writable($$writable); -} - -method split_array(@args) -{ - my $shape = $self->shape; - return [] if $shape->[0] == 0; - my $list = $self->split(num_outputs=>$shape->[0], - squeeze_axis=>int(@$shape > 1), axis=>0); - $shape->[0] == 1 ? [ $list ] : $list; -} - -method at(Index @indices) -{ - confess("No idxs supplied") unless @indices; - my $shape = $self->shape; - my $dsize = @$shape; - my $isize = @indices; - confess("Dimensions size $dsize < indexes size $isize") - if $dsize < $isize; - confess("Dimensions size $dsize = indexes size $isize, - ndarray only supports either ->at on dimension 0 - or full crop") - if $isize > 1 and $dsize != $isize; - my $i = 0; - for(zip(\@indices, $shape)) { - my ($idx, $dim_size) = @$_; - confess("Dimension $i mismatch Idx: $idx >= Dim Size: $dim_size") - if $idx >= $dim_size or ($idx + $dim_size) < 0; - ++$i; - } - $i = 0; - for my $v (@indices) - { - $v += $shape->[$i] if $v < 0; - ++$i; - } - return $self->_at($indices[0]) if @indices == 1; - return $self->slice(@indices); -} - -method len() { $self->shape->[0] } - -method slice(Slice|AdvancedSlice|InternalSlice @slices) -{ - confess("No slices supplied") unless @slices; - if(grep { not ref and /^(?:begin|end|slice)$/ } @slices) - { - return $self->SUPER::slice(@slices); - } - if(ref $slices[0] eq 'ARRAY' and ref $slices[0]->[0]) - { - my @indices; - my $key = $slices[0]; - my $dtype = 'int32'; - for my $idx_i (@{ $key }) - { - if(not (blessed $idx_i and $idx_i->isa(__PACKAGE__))) - { - $idx_i = __PACKAGE__->array($idx_i, ctx=>$self->context, dtype=>$dtype); - } - else - { - $dtype = $idx_i->dtype; - } - push @indices, $idx_i; - } - my $indices = __PACKAGE__->stack(@indices); - return __PACKAGE__->gather_nd($self, $indices); - } - my $shape = $self->shape; - my $dsize = @$shape; - my $isize = @slices; - confess("Dimensions size $dsize < slices size $isize") - if $dsize < $isize; - confess("Dimensions size $dsize != slices size $isize, - ndarray only supports either ->slice on dimension 0 - or full crop") - if $isize > 1 and $dsize != $isize; 
- my $i = -1; - @slices = map { - ++$i; - ref $_ ? (@$_ == 1 ? [$_->[0], $_->[0]] : $_) : ($_ eq 'X' ? [0, $shape->[$i] - 1] : [$_, $_]); - } @slices; - for(zip(\@slices, $shape)) { - my ($slice, $dim_size) = @$_; - my ($begin, $end, $stride) = @$slice; - confess("NDArray does not support slice strides != 1") - if ($stride//0) > 1; - confess("Dimension $i mismatch slice begin : $begin >= Dim Size: $dim_size") - if $begin >= $dim_size or ($begin + $dim_size) < 0; - confess("Dimension $i mismatch slice end : $end >= Dim Size: $dim_size") - if $end >= $dim_size or ($end + $dim_size) < 0; - } - $i = 0; - my ($begin, $end) = ([], []); - for my $s (@slices) - { - $s->[0] += $shape->[$i] if $s->[0] < 0; - $s->[1] += $shape->[$i] if $s->[1] < 0; - confess("Dimension $i slice mismatch (begin $s->[0] > end $s->[1])") - if($s->[0] > $s->[1]); - push @$begin, $s->[0]; - push @$end, $s->[1] + 1; - $i++; - } - return $self->_slice($begin->[0], $end->[0]) if @slices == 1; - return AI::MXNet::NDArray::Slice->new(parent => $self, begin => $begin, end => $end); -} - -method set(AcceptableInput $value, $reverse=) -{ - confess("set value must be defined") unless defined $value; - confess("Array is not writable") if not $self->writable; - ## plain number - if(not ref $value) - { - $self->_set_value($value, out => $self); - } - # ndarray - elsif(blessed($value) and $value->isa(__PACKAGE__)) - { - $value->copyto($self); - } - # slice of another ndarray - elsif(blessed($value) and $value->isa('AI::MXNet::NDArray::Slice')) - { - $value->sever->copyto($self); - } - # perl array, PDL, PDL::Matrix - else - { - $self->_sync_copyfrom($value); - } - return $self; -} - -method asscalar() -{ - confess("ndarray size must be 1") unless $self->size == 1; - return $self->aspdl->at(0); - ## code below works happily on CPU/segfaults on GPU - #$self->wait_to_read; - #my $perl_pack_type = DTYPE_MX_TO_PERL->{$self->dtype}; - #my $length = {qw/f 4 d 8 S 2 C 1 l 4/}->{$perl_pack_type}; - #return - #(map { - # $perl_pack_type eq 'S' ? AI::MXNetCAPI::_half_to_float($_) : $_ - # } unpack("$perl_pack_type", check_call(AI::MXNetCAPI::NDArrayGetData($self->handle, $length))) - #)[0]; -} - -method _sync_copyfrom(ArrayRef|PDL|PDL::Matrix $source_array) -{ - my $dtype = $self->dtype; - my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{ $dtype }); - if(not blessed($source_array)) - { - $source_array = eval { - pdl($pdl_type, $source_array); - }; - confess($@) if $@; - } - if($pdl_type->numval != $source_array->type->numval) - { - my $convert_func = $pdl_type->convertfunc; - $source_array = $source_array->$convert_func; - } - $source_array = pdl($pdl_type, [@{ $source_array->unpdl } ? $source_array->unpdl->[0] : 0 ]) - unless @{ $source_array->shape->unpdl }; - my $pdl_shape = $source_array->shape->unpdl; - my $pdl_shape_str = join(',', ref($source_array) eq 'PDL' ? reverse @{ $pdl_shape } : @{ $pdl_shape }); - my $ndary_shape_str = join(',', @{ $self->shape }); - if($pdl_shape_str ne $ndary_shape_str) - { - confess("Shape inconsistant: expected $ndary_shape_str vs got $pdl_shape_str") - } - my $perl_pack_type = DTYPE_MX_TO_PERL->{$dtype}; - my $ptr = $source_array->get_dataref; - ## special handling for float16 - if($perl_pack_type eq 'S') - { - $ptr = \( pack("S*", map { AI::MXNetCAPI::_float_to_half($_) } unpack ("f*", $$ptr)) ); - } - check_call(AI::MXNetCAPI::NDArraySyncCopyFromCPU($self->handle, $$ptr, $self->size)); - return $self; -} - -=head2 aspdl - - Returns a copied PDL array of current array. 
- - Returns - ------- - array : PDL - A copy of the array content. -=cut - -method aspdl() -{ - my $dtype = $self->dtype; - my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{ $dtype }); - my $pdl = PDL->new_from_specification($pdl_type, reverse @{ $self->shape }); - my $perl_pack_type = DTYPE_MX_TO_PERL->{$dtype}; - my $ptr = $pdl->get_dataref; - check_call(AI::MXNetCAPI::NDArraySyncCopyToCPU($self->handle, $$ptr, $self->size)); - ## special handling for float16 - if($perl_pack_type eq 'S') - { - $$ptr = pack("f*", map { AI::MXNetCAPI::_half_to_float($_) } unpack("S*", $$ptr)); - } - $pdl->upd_data; - return $pdl; -} - - -=head2 asmpdl - - Returns copied PDL::Matrix objectt of current array. - - Requires caller to "use PDL::Matrix" in user space. - - Returns - ------- - array : PDL::Matrix - A copy of array content. -=cut - -method asmpdl() -{ - my $dtype = $self->dtype; - my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{ $dtype }); - my $pdl = PDL::Matrix->new_from_specification($pdl_type, @{ $self->shape }); - my $perl_pack_type = DTYPE_MX_TO_PERL->{$dtype}; - my $ptr = $pdl->get_dataref; - check_call(AI::MXNetCAPI::NDArraySyncCopyToCPU($self->handle, $$ptr, $self->size)); - ## special handling for float16 - if($perl_pack_type eq 'S') - { - $$ptr = pack("f*", map { AI::MXNetCAPI::_half_to_float($_) } unpack("S*", $$ptr)); - } - $pdl->upd_data; - return $pdl; -} - - -=head2 _slice - - Returns sliced NDArray that shares memory with the current one. - - Parameters - ---------- - start : int - Starting index of slice. - stop : int - Finishing index of slice. -=cut - -method _slice ( - Index $start, - Index $stop -) -{ - confess("start $start > stop $stop") if $start > $stop; - my $sub = AI::MXNet::RunTime->Features()->is_enabled('INT64_TENSOR_SIZE') - ? \&AI::MXNetCAPI::NDArraySlice64 - : \&AI::MXNetCAPI::NDArraySlice; - my $handle = check_call( - $sub->( - $self->handle, - $start, - $stop - ) - ); - return __PACKAGE__->_ndarray_cls($handle, $self->writable); -} - -=head2 _at - - Returns a sub NDArray that shares memory with current one. - - Parameters - ---------- - idx : int - index of the sub array. -=cut - - -method _at(Index $idx) -{ - my $sub = AI::MXNet::RunTime->Features()->is_enabled('INT64_TENSOR_SIZE') - ? \&AI::MXNetCAPI::NDArrayAt64 - : \&AI::MXNetCAPI::NDArrayAt; - my $handle = check_call( - $sub->( - $self->handle, $idx >=0 ? $idx : $self->shape->[0] + $idx - ) - ); - return __PACKAGE__->_ndarray_cls($handle, $self->writable); -} - -=head2 reshape - - Returns a **view** of this array with a new shape without altering any data. - One shape dimension can be -1. In this case, the value is inferred - from the length of the array and remaining dimensions. - - Parameters - ---------- - $new_shape : Shape - new shape of NDArray - :$reverse : bool, default 0 - If true then the special values are inferred from right to left. -=cut - -method reshape(ArrayRef[Int] $new_shape, Bool :$reverse=0) -{ - my $handle = check_call( - AI::MXNetCAPI::NDArrayReshape64( - $self->handle, - scalar(@$new_shape), - $new_shape, - $reverse - ) - ); - return __PACKAGE__->_ndarray_cls($handle, $self->writable); -} - -=head2 ndim - - Returns the number of dimensions of this array. -=cut - -method ndim() -{ - scalar(@{ $self->shape }); -} - -=head2 moveaxis - - Moves the 'source' axis into the 'destination' position - while leaving the other axes in their original order - - Parameters - ---------- - source : int - Original position of the axes to move. 
- destination : int - Destination position for each of the original axes. - - Returns - ------- - result :NDArray - Array with moved axes. - - Examples - -------- - > $X = mx->nd->array([[1, 2, 3], - [4, 5, 6]]); - > print Dumper($X->moveaxis(0, 1)->shape) - > [3, 2] -=cut - -method moveaxis(Int $source, Int $dest) -{ - my @axes = 0..$self->ndim-1; - $source += @axes if $source < 0; - $dest += @axes if $dest < 0; - assert($source < @axes); - assert($dest < @axes); - my ($to_move) = splice(@axes, $source, 1); - splice(@axes, $dest, 0, $to_move); - return __PACKAGE__->transpose($self, \@axes); -} - -=head2 broadcast_to - - Broadcasting the current NDArray into the given shape. - - Parameters - --------- - Shape $shape : the shape to broadcast -=cut - -method broadcast_to(Shape $shape) -{ - my $cur_shape = $self->shape; - my $err_str = "operands could not be broadcast together with remapped shapes" - ."[original->remapped]: [@$cur_shape] and requested shape [@$shape]"; - if(@$shape < @$cur_shape) - { - confess($err_str); - } - @$cur_shape = ((1)x(@$shape - @$cur_shape), @$cur_shape); - my $cur_shape_arr = pdl($cur_shape); - my $broadcasting_axes = ($cur_shape_arr != pdl($shape))->which->unpdl; - if (grep { $cur_shape->[$_] != 1 } @$broadcasting_axes) - { - confess($err_str); - } - if(join(',',@$cur_shape) ne join(',',@{ $self->shape })) - { - return __PACKAGE__->SUPER::broadcast_to($self->reshape($cur_shape),{ shape => $shape }); - } - else - { - return __PACKAGE__->SUPER::broadcast_to($self, { shape => $shape }); - } -} - -=head2 wait_to_read - - Block until all pending write operations on the NDArray are finished. - - This function will return when all the pending writes to the current - NDArray are finished. There can be pending reads going on when the - function returns. -=cut - -method wait_to_read() -{ - check_call(AI::MXNetCAPI::NDArrayWaitToRead($self->handle)); -} - -=head2 shape - - Get the shape of current NDArray. - - Returns - ------- - an array ref representing the shape of current ndarray -=cut - -method shape() -{ - if(AI::MXNet::RunTime->Features()->is_enabled('INT64_TENSOR_SIZE')) - { - return [map { $_ + 0 } @{ scalar(check_call(AI::MXNetCAPI::NDArrayGetShapeEx64($self->handle))) }]; - } - else - { - return scalar(check_call(AI::MXNetCAPI::NDArrayGetShapeEx($self->handle))); - } -} - -=head2 size - - Number of elements in the array. -=cut - -method size(Shape|Undef $shape=) -{ - my $size = 1; - map { $size *= $_ } @{ $shape//$self->shape }; - return $size; -} - - -=head2 context - - The context of the NDArray. - - Returns - ------- - $context : AI::MXNet::Context -=cut - -method context() -{ - my ($dev_type_id, $dev_id) = check_call( - AI::MXNetCAPI::NDArrayGetContext($self->handle) - ); - return AI::MXNet::Context->new( - device_type => AI::MXNet::Context::devtype2str->{ $dev_type_id }, - device_id => $dev_id - ); -} - -=head2 dtype - - The data type of current NDArray. - - Returns - ------- - a data type string ('float32', 'float64', 'float16', 'uint8', 'int32') - representing the data type of the ndarray. - 'float32' is the default dtype for the ndarray class. -=cut - -method dtype() -{ - my $dtype = check_call( - AI::MXNetCAPI::NDArrayGetDType( - $self->handle - ) - ); - return DTYPE_MX_TO_STR->{ $dtype }; -} - -=head2 copyto - - Copy the content of current array to another entity. - - When another entity is the NDArray, the content is copied over. - When another entity is AI::MXNet::Context, a new NDArray in the context - will be created. 
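    For example, a minimal sketch of both variants (mx->cpu(0) stands in for any
    target context):

        my $a = mx->nd->ones([2, 2]);
        my $b = mx->nd->zeros([2, 2]);
        $a->copyto($b);                   # copy into an existing NDArray
        my $c = $a->copyto(mx->cpu(0));   # allocate a new NDArray on that context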
- - Parameters - ---------- - other : NDArray or Context - Target NDArray or context we want to copy data to. - - Returns - ------- - dst : NDArray -=cut - -method copyto(AI::MXNet::Context|AI::MXNet::NDArray $other) -{ - if(blessed($other) and $other->isa('AI::MXNet::Context')) - { - my $hret = __PACKAGE__->empty( - $self->shape, - ctx => $other, - dtype => $self->dtype - ); - return __PACKAGE__->_copyto($self, { out => $hret }); - } - else - { - if ($other->handle eq $self->handle) - { - Carp::cluck('copy an array to itself, is it intended?'); - } - return __PACKAGE__->_copyto($self, { out => $other }); - } -} - -=head2 copy - - Makes a copy of the current ndarray in the same context - - Returns - ------ - $copy : NDArray -=cut - -method copy() -{ - return $self->copyto($self->context); -} - -## alias for PDL::NiceSlice -*sever = \© - -=head2 T - - Get transpose of the NDArray. - Works only on 2-D matrices. -=cut - -method T() -{ - if (@{$self->shape} > 2) - { - confess('Only 2D matrix is allowed to be transposed'); - } - return __PACKAGE__->transpose($self); -} - -=head2 astype - - Returns copied ndarray of current array with the specified type. - - Parameters - ---------- - $dtype : Dtype - - Returns - ------- - $array : ndarray - A copy of the array content. -=cut - -method astype(Dtype $dtype) -{ - my $res = __PACKAGE__->empty($self->shape, ctx => $self->context, dtype => $dtype); - $self->copyto($res); - return $res; -} - -=head2 as_in_context - - Returns an NDArray in the target context. - If the array is already in that context, self is returned. Otherwise, a copy is - made. - - Parameters - ---------- - context : AI::MXNet::Context - The target context we want the return value to live in. - - Returns - ------- - A copy or self as an NDArray in the target context. -=cut - -method as_in_context(AI::MXNet::Context $context) -{ - return $self if $self->context == $context; - return $self->copyto($context); -} - -=head2 onehot_encode - - One hot encoding indices into matrix out. - - Parameters - ---------- - indices: NDArray - An NDArray containing indices of the categorical features. - - out: NDArray - The result of the encoding. - - Returns - ------- - $out: NDArray -=cut - -method onehot_encode(AI::MXNet::NDArray $indices, AI::MXNet::NDArray $out) -{ - return __PACKAGE__->_onehot_encode($indices, $out, { out => $out }); -} - -sub _ufunc_helper -{ - my ($lhs, $rhs, $fn_array, $lfn_scalar, $rfn_scalar, $reverse) = @_; - ($rhs, $lhs) = ($lhs, $rhs) if $reverse and $rfn_scalar; - if(not ref $lhs) - { - if(not $rfn_scalar) - { - return __PACKAGE__->can($lfn_scalar)->(__PACKAGE__, $rhs, $lhs); - } - else - { - return __PACKAGE__->can($rfn_scalar)->(__PACKAGE__, $rhs, $lhs); - } - } - elsif(not ref $rhs) - { - return __PACKAGE__->can($lfn_scalar)->(__PACKAGE__, $lhs, $rhs); - } - else - { - return __PACKAGE__->can($fn_array)->(__PACKAGE__, $lhs, $rhs); - } -} - -method stringify($other=, $reverse=) -{ - sprintf("<%s %s @%s>", ref($self), join('x', @{ $self->shape }), $self->context); -} - -method atan2($other=, $reverse=) -{ - my $val = $reverse ? $other / $self : $self / $other; - return __PACKAGE__->arctan($val); -} - -method iadd(AI::MXNet::NDArray|Num $other, $reverse=) -{ - confess('trying to add to a readonly NDArray') unless $self->writable; - return ref $other - ? 
__PACKAGE__->broadcast_add($self, $other, { out => $self })
-        : __PACKAGE__->_plus_scalar($self, $other, { out => $self })
-}
-
-method add(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_add _plus_scalar/
-    );
-}
-
-
-method subtract(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_sub _minus_scalar _rminus_scalar/,
-        $reverse
-    );
-}
-
-method isubtract(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    confess('trying to subtract from a readonly NDArray') unless $self->writable;
-    return ref $other
-        ? __PACKAGE__->broadcast_sub($self, $other, { out => $self })
-        : __PACKAGE__->_minus_scalar($self, $other, { out => $self })
-}
-
-method multiply(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_mul _mul_scalar/
-    );
-}
-
-method imultiply(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    confess('trying to multiply a readonly NDArray in place') unless $self->writable;
-    return ref $other
-        ? __PACKAGE__->broadcast_mul($self, $other, { out => $self })
-        : __PACKAGE__->_mul_scalar($self, $other, { out => $self })
-}
-
-method divide(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_div _div_scalar _rdiv_scalar/,
-        $reverse
-    );
-}
-
-method idivide(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    confess('trying to divide a readonly NDArray in place') unless $self->writable;
-    return ref $other
-        ? __PACKAGE__->broadcast_div($self, $other, { out => $self })
-        : __PACKAGE__->_div_scalar($self, $other, { out => $self })
-}
-
-method power(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_power _power_scalar _rpower_scalar/,
-        $reverse
-    );
-}
-
-method maximum(AI::MXNet::NDArray|Num $other)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_maximum _maximum_scalar/
-    );
-}
-
-method minimum(AI::MXNet::NDArray|Num $other)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_minimum _minimum_scalar/
-    );
-}
-
-method equal(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_equal _equal_scalar/
-    );
-}
-
-method not_equal(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_not_equal _not_equal_scalar/
-    );
-}
-
-method greater(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_greater _greater_scalar _lesser_scalar/,
-        $reverse
-    );
-}
-
-method greater_equal(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_greater_equal _greater_equal_scalar _lesser_equal_scalar/,
-        $reverse
-    );
-}
-
-method lesser(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_lesser _lesser_scalar _greater_scalar/,
-        $reverse
-    );
-}
-
-method lesser_equal(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_lesser_equal _lesser_equal_scalar _greater_equal_scalar/,
-        $reverse
-    );
-}
-
-method true_divide(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return $self->divide($other, $reverse);
-}
-
-method modulo(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/broadcast_mod _mod_scalar _rmod_scalar/,
-        $reverse
-    );
-}
-
-method imodulo(AI::MXNet::NDArray|Num $other, $reverse=)
-{
-    confess('trying to take the modulo of a readonly
NDArray') unless $self->writable; - return ref $other - ? __PACKAGE__->broadcast_mod($self, $other, { out => $self }) - : __PACKAGE__->_mod_scalar($self, $other, { out => $self }) -} - -=head2 empty - - Creates an empty uninitialized NDArray, with the specified shape. - - Parameters - ---------- - $shape : Shape - shape of the NDArray. - - :$ctx : AI::MXNet::Context, optional - The context of the NDArray, defaults to current default context. - - :$dtype : Dtype, optional - The dtype of the NDArray, defaults to 'float32'. - - :$stype: Stype, optional - The stype of the NDArray, defaults to 'default' - - Returns - ------- - out: Array - The created NDArray. -=cut - -method empty(Shape $shape, AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, Dtype :$dtype='float32', Stype :$stype='default') -{ - if($stype ne 'default') - { - return AI::MXNet::NDArray::Sparse->empty($stype, $shape, ctx => $ctx, dtype => $dtype); - } - return __PACKAGE__->new( - handle => _new_alloc_handle( - $shape, - $ctx, - 0, - DTYPE_STR_TO_MX->{$dtype} - ) - ); -} - -=head2 zeros - - Creates a new NDArray filled with 0, with specified shape. - - Parameters - ---------- - $shape : Shape - shape of the NDArray. - - :$ctx : AI::MXNet::Context, optional - The context of the NDArray, defaults to current default context. - - :$dtype : Dtype, optional - The dtype of the NDArray, defaults to 'float32'. - - :$stype: Stype, optional - The stype of the NDArray, defaults to 'default' - Returns - ------- - out: Array - The created NDArray. -=cut - -method zeros( - Shape $shape, - AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, - Dtype :$dtype='float32', - Maybe[AI::MXNet::NDArray] :$out=, - Maybe[Str] :$name=, - Maybe[Str] :$__layout__=, - Stype :$stype='default' -) -{ - if($stype ne 'default') - { - return AI::MXNet::NDArray::Sparse->zeros($stype, $shape, ctx => $ctx, dtype => $dtype, out => $out); - } - return __PACKAGE__->_zeros({ shape => $shape, ctx => "$ctx", dtype => $dtype, ($out ? (out => $out) : ()) }); -} - -=head2 ones - - Creates a new NDArray filled with 1, with specified shape. - - Parameters - ---------- - $shape : Shape - shape of the NDArray. - - :$ctx : AI::MXNet::Context, optional - The context of the NDArray, defaults to current default context. - - :$dtype : Dtype, optional - The dtype of the NDArray, defaults to 'float32'. - - Returns - ------- - out: Array - The created NDArray. -=cut - -method ones( - Shape $shape, - AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, - Dtype :$dtype='float32', - Maybe[AI::MXNet::NDArray] :$out=, - Maybe[Str] :$name=, - Maybe[Str] :$__layout__=, -) -{ - return __PACKAGE__->_ones({ shape => $shape, ctx => "$ctx", dtype => $dtype, ($out ? (out => $out) : ()) }); -} - -=head2 full - - Creates a new NDArray filled with given value, with specified shape. - - Parameters - ---------- - $shape : Shape - shape of the NDArray. - - val : float or int - The value to be filled with. - - :$ctx : AI::MXNet::Context, optional - The context of the NDArray, defaults to current default context. - - :$dtype : Dtype, optional - The dtype of the NDArray, defaults to 'float32'. - - Returns - ------- - out: Array - The created NDArray. -=cut - -method full( - Shape $shape, Num $val, - AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, - Dtype :$dtype='float32', Maybe[AI::MXNet::NDArray] :$out=, - Maybe[Str] :$name=, - Maybe[Str] :$__layout__= -) -{ - return __PACKAGE__->_set_value({ src => $val, out => $out ? 
$out : __PACKAGE__->empty($shape, ctx => $ctx, dtype => $dtype) });
-}
-
-=head2 array
-
-    Creates a new NDArray that is a copy of the source_array.
-
-    Parameters
-    ----------
-    $source_array : AI::MXNet::NDArray, PDL, PDL::Matrix, or array ref in PDL::pdl format
-        Source data to create the NDArray from.
-
-    :$ctx : AI::MXNet::Context, optional
-        The context of the NDArray, defaults to the current default context.
-
-    :$dtype : Dtype, optional
-        The dtype of the NDArray, defaults to 'float32'.
-
-    Returns
-    -------
-    out: Array
-        The created NDArray.
-=cut
-
-method array(PDL|PDL::Matrix|PDL::CCS::Nd|ArrayRef|AI::MXNet::NDArray $source_array, AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, Dtype :$dtype='float32')
-{
-    if(blessed $source_array and $source_array->isa('AI::MXNet::NDArray'))
-    {
-        return AI::MXNet::NDArray::Sparse->array($source_array, ctx => $ctx, dtype => $dtype) unless $source_array->stype eq 'default';
-        my $arr = __PACKAGE__->empty($source_array->shape, ctx => $ctx, dtype => $dtype);
-        $arr .= $source_array;
-        return $arr;
-    }
-    elsif(blessed $source_array and $source_array->isa('PDL::CCS::Nd'))
-    {
-        return AI::MXNet::NDArray::Sparse->array($source_array, ctx => $ctx, dtype => $dtype);
-    }
-    my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{ $dtype });
-    if(not blessed($source_array))
-    {
-        $source_array = eval {
-            pdl($pdl_type, $source_array);
-        };
-        confess($@) if $@;
-    }
-    $source_array = pdl($pdl_type, [@{ $source_array->unpdl } ? $source_array->unpdl->[0] : 0 ]) unless @{ $source_array->shape->unpdl };
-    my $shape = $source_array->shape->unpdl;
-    my $arr = __PACKAGE__->empty([ref($source_array) eq 'PDL' ? reverse @{ $shape } : @{ $shape }], ctx => $ctx, dtype => $dtype );
-    $arr .= $source_array;
-    return $arr;
-}
-
-
-=head2 concatenate
-
-    Concatenates an array ref of NDArrays along a given axis (the first dimension by default).
-
-    Parameters
-    ----------
-    $arrays : array ref of NDArrays
-        Arrays to be concatenated. They must have identical shapes except
-        for the concatenation axis. They also must have the same data type.
-    :$axis=0 : int
-        The axis along which to concatenate.
-    :$always_copy=1 : bool
-        Default is 1. When not 1, if the arrays only contain one
-        NDArray, that element will be returned directly, avoiding a copy.
-
-    Returns
-    -------
-    An NDArray in the same context as $arrays->[0]->context.
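    For example, a minimal sketch (assuming the customary `mx` alias for AI::MXNet):

        my $a = mx->nd->array([[1, 2]]);
        my $b = mx->nd->array([[3, 4], [5, 6]]);
        my $c = mx->nd->concatenate([$a, $b]);   # shape [3, 2]
        print $c->aspdl;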
-=cut
-
-method concatenate(ArrayRef[AI::MXNet::NDArray] $arrays, Index :$axis=0, :$always_copy=1)
-{
-    confess("no arrays provided") unless @$arrays > 0;
-    if(not $always_copy and @$arrays == 1)
-    {
-        return $arrays->[0];
-    }
-    my $shape_axis = $arrays->[0]->shape->[$axis];
-    my $shape_rest1 = [@{ $arrays->[0]->shape }[0..($axis-1)]];
-    my $shape_rest2 = [@{ $arrays->[0]->shape }[($axis+1)..(@{ $arrays->[0]->shape }-1)]];
-    my $dtype = $arrays->[0]->dtype;
-    my $i = 1;
-    for my $arr (@{ $arrays }[1..(@{ $arrays }-1)])
-    {
-        $shape_axis += $arr->shape->[$axis];
-        my $arr_shape_rest1 = [@{ $arr->shape }[0..($axis-1)]];
-        my $arr_shape_rest2 = [@{ $arr->shape }[($axis+1)..(@{ $arr->shape }-1)]];
-        confess("shapes of the first array $arrays->[0] and array $i ($arr) do not match")
-            unless join(',',@$arr_shape_rest1) eq join(',',@$shape_rest1);
-        confess("shapes of the first array $arrays->[0] and array $i ($arr) do not match")
-            unless join(',',@$arr_shape_rest2) eq join(',',@$shape_rest2);
-        confess("dtypes of the first array $arrays->[0] and array $i ($arr) do not match")
-            unless $arr->dtype eq $dtype;
-        $i++;
-    }
-    my $ret_shape = [@$shape_rest1, $shape_axis, @$shape_rest2];
-    my $ret = __PACKAGE__->empty($ret_shape, ctx => $arrays->[0]->context, dtype => $dtype);
-    my $idx = 0;
-    my $begin = [(0)x@$ret_shape];
-    my $end = [@$ret_shape];
-    for my $arr (@$arrays)
-    {
-        if ($axis == 0)
-        {
-            $ret->slice([$idx,($idx+$arr->shape->[0]-1)]) .= $arr;
-        }
-        else
-        {
-            $begin->[$axis] = $idx;
-            $end->[$axis] = $idx+$arr->shape->[$axis];
-            __PACKAGE__->_crop_assign(
-                $ret, $arr,
-                {
-                    out => $ret,
-                    begin => $begin,
-                    end => $end
-                }
-            );
-        }
-        $idx += $arr->shape->[$axis];
-    }
-    return $ret
-}
-
-=head2 arange
-
-    The equivalent of numpy.arange for MXNet NDArrays.
-    See also https://docs.scipy.org/doc/numpy/reference/generated/numpy.arange.html.
-
-    Parameters
-    ----------
-    :$start=0 : number, optional
-        Start of the interval. The interval includes this value. The default start value is 0.
-    :$stop= : number, optional
-        End of the interval. The interval does not include this value.
-    :$step=1 : number, optional
-        Spacing between the values.
-    :$repeat=1 : number, optional
-        The number of times each element is repeated.
-        E.g. with repeat=3, the element a is repeated three times --> a, a, a.
-    :$infer_range=0 : Bool
-        When set to 1, infer the stop position from start, step, repeat, and the
-        output tensor size.
-    :$ctx : Context, optional
-        The context of the NDArray, defaults to the current default context.
-    :$dtype : data type, optional
-        The value type of the NDArray, defaults to float32.
-
-    Returns
-    -------
-    $out : NDArray
-        The created NDArray
-=cut
-
-method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$repeat=1, Bool :$infer_range=0,
-              AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, Dtype :$dtype='float32')
-{
-    return __PACKAGE__->_arange({
-        start => $start,
-        (defined $stop ? (stop => $stop) : ()),
-        step => $step,
-        repeat => $repeat,
-        dtype => $dtype,
-        infer_range => $infer_range,
-        ctx => "$ctx"
-    });
-}
-
-=head2 load
-
-    Loads ndarrays from a binary file.
-
-    You can also use Storable to do the job if you only work with Perl.
-    The advantage of load/save is that the file is language agnostic.
-    This means a file saved using save can be loaded by the other language bindings of mxnet.
-    You also get the benefit of being able to load/save directly from cloud storage (S3, HDFS).
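    A short sketch of the save/load round trip (the file name here is hypothetical):

        my %params = (weight => mx->nd->ones([2, 2]));
        mx->nd->save('params.nd', \%params);
        my $loaded = mx->nd->load('params.nd');
        print $loaded->{weight}->aspdl;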
-
-    Parameters
-    ----------
-    fname : str
-        The name of the file. Can be an S3 or HDFS address (remember to build with S3 support).
-        Example of fname:
-
-        - `s3://my-bucket/path/my-s3-ndarray`
-        - `hdfs://my-bucket/path/my-hdfs-ndarray`
-        - `/path-to/my-local-ndarray`
-
-    Returns
-    -------
-    $out : array ref of NDArrays or hash ref with NDArrays
-=cut
-
-method load(Str $filename)
-{
-    my ($handles, $names) = check_call(AI::MXNetCAPI::NDArrayLoad($filename));
-    if (not @$names)
-    {
-        return [map { __PACKAGE__->_ndarray_cls($_) } @$handles];
-    }
-    else
-    {
-        my $n = @$names;
-        my $h = @$handles;
-        confess("Handles [$h] and names [$n] count mismatch") unless $h == $n;
-        my %ret;
-        @ret{ @$names } = map { __PACKAGE__->_ndarray_cls($_) } @$handles;
-        return \%ret;
-    }
-}
-
-=head2 load_frombuffer
-
-    Loads an array ref or hash ref of NDArrays from a buffer.
-
-    See more details in 'save'.
-
-    Parameters
-    ----------
-    buf : str
-        Binary string containing the contents of a file.
-
-    Returns
-    -------
-    array ref of AI::MXNet::NDArray, AI::MXNet::NDArray::RowSparse or AI::MXNet::NDArray::CSR, or
-    hash ref of AI::MXNet::NDArray, AI::MXNet::NDArray::RowSparse or AI::MXNet::NDArray::CSR
-        Loaded data.
-=cut
-
-method load_frombuffer(Str $buf)
-{
-    my ($handles, $names) = check_call(AI::MXNetCAPI::NDArrayLoadFromBuffer($buf, length($buf)));
-    if (not @$names)
-    {
-        return [map { __PACKAGE__->_ndarray_cls($_) } @$handles];
-    }
-    else
-    {
-        my $n = @$names;
-        my $h = @$handles;
-        confess("Handles [$h] and names [$n] count mismatch") unless $h == $n;
-        my %ret;
-        @ret{ @$names } = map { __PACKAGE__->_ndarray_cls($_) } @$handles;
-        return \%ret;
-    }
-}
-
-=head2 save
-
-    Saves an array ref of NDArrays or a hash ref of str->NDArray to a binary file.
-
-    You can also use Storable to do the job if you only work with Perl.
-    The advantage of load/save is that the file is language agnostic.
-    This means a file saved using save can be loaded by the other language bindings of mxnet.
-    You also get the benefit of being able to load/save directly from cloud storage (S3, HDFS).
-
-    Parameters
-    ----------
-    fname : str
-        The name of the file. Can be an S3 or HDFS address (remember to build with S3 support).
-        Example of fname:
-
-        - `s3://my-bucket/path/my-s3-ndarray`
-        - `hdfs://my-bucket/path/my-hdfs-ndarray`
-        - `/path-to/my-local-ndarray`
-
-    $data : array ref of NDArrays or hash ref of NDArrays
-        The data to be saved.
-=cut
-
-method save(Str $filename, ArrayRef[AI::MXNet::NDArray]|HashRef[AI::MXNet::NDArray] $data)
-{
-    my $handles = [];
-    my $names = [];
-    if(ref $data eq 'HASH')
-    {
-        for my $name (keys %$data)
-        {
-            push @$names, $name;
-            push @$handles, $data->{ $name }->handle;
-        }
-    }
-    else
-    {
-        @$handles = map { $_->handle } @$data;
-    }
-    check_call(
-        AI::MXNetCAPI::NDArraySave(
-            $filename,
-            scalar(@$handles),
-            $handles,
-            $names
-        )
-    );
-}
-
-=head2 imdecode
-
-    Decodes an image from a string. Requires OpenCV to work.
-
-    Parameters
-    ----------
-    $str_img : str
-        binary image data
-    :$clip_rect : iterable of 4 int
-        clip the decoded image to a rectangle (x0, y0, x1, y1)
-    :$out= : Maybe[NDArray]
-        output buffer. Can be 3 dimensional (c, h, w) or 4 dimensional (n, c, h, w)
-    :$index : int
-        output the decoded image to the i-th slice of a 4 dimensional buffer
-    :$channels=3 : int
-        number of channels to output. Decodes to grey scale when channels = 1.
-    :$mean= : Maybe[NDArray]
-        subtract this mean from the decoded image before outputting.
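    A minimal sketch (assuming an OpenCV-enabled build; the image file name is
    hypothetical):

        open(my $fh, '<:raw', 'cat.jpg') or die $!;
        my $buf = do { local $/; <$fh> };
        close($fh);
        my $img = mx->nd->imdecode($buf);   # NDArray of shape (c, h, w)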
-=cut - -method imdecode($str_img, ArrayRef[Int] :$clip_rect=[0, 0, 0, 0], - Maybe[AI::MXNet::NDArray] :$out=, Int :$index=0, Int :$channels=3, Maybe[AI::MXNet::NDArray] :$mean=) -{ - return __PACKAGE__->_imdecode( - $mean//__PACKAGE__->_new_empty_handle(), - $index, - @$clip_rect, - $channels, - length($str_img), - { str_img => $str_img, ($out ? (out => $out) : ()) } - ); -} - -=head2 _new_empty_handle - - Returns a new empty handle. - - Empty handle can be used to hold result - - Returns - ------- - a new empty ndarray handle -=cut - -sub _new_empty_handle -{ - my $hdl = check_call(AI::MXNetCAPI::NDArrayCreateNone()); - return $hdl; -} - -=head2 _new_alloc_handle - - Returns a new handle with specified shape and context. - - Empty handle is only used to hold results - - Returns - ------- - a new empty ndarray handle -=cut - -func _new_alloc_handle($shape, $ctx, $delay_alloc, $dtype) -{ - my $sub = AI::MXNet::RunTime->Features()->is_enabled('INT64_TENSOR_SIZE') - ? \&AI::MXNetCAPI::NDArrayCreateEx64 - : \&AI::MXNetCAPI::NDArrayCreateEx; - my $hdl = check_call( - $sub->( - $shape, - scalar(@$shape), - $ctx->device_type_id, - $ctx->device_id, - $delay_alloc, - $dtype - ) - ); - return $hdl; -} - -method _new_from_shared_mem($shared_pid, $shared_id, $shape, $dtype) -{ - my $hdl = check_call( - AI::MXNetCAPI::NDArrayCreateFromSharedMemEx( - $shared_pid, - $shared_id, - $shape, - scalar(@$shape), - DTYPE_STR_TO_MX->{$dtype} - ) - ); - return $hdl; -} - -=head2 tostype - - Return a copy of the array with chosen storage type. - - Returns - ------- - AI::MXNet::NDArray or AI::MXNet::NDArray::CSR or AI::MXNet::NDArray::RowSparse - A copy of the array with the chosen storage stype -=cut - -method tostype(Stype $stype) -{ - return $self->cast_storage(stype => $stype); -} - - -=head2 waitall - - Wait for all async operations to finish in MXNet. - This function is used for benchmarks only. -=cut - -method waitall() -{ - check_call(AI::MXNetCAPI::NDArrayWaitAll()); -} - -=head2 _fresh_grad - - Parameters: - ---------- - Maybe[Bool] $state= - - Whether this array's corresponding gradient array - (registered via `autograd->mark_variables`) has been - updated by `autograd->backward` since last reset. - - `_fresh_grad` need to be manually set to False - after consuming gradient (usually after updating this - array). -=cut - -method _fresh_grad(Maybe[Bool] $state=) -{ - if(defined $state) - { - check_call(AI::MXNetCAPI::NDArraySetGradState($self->handle, $state)); - return $state; - } - else - { - return scalar(check_call(AI::MXNetCAPI::NDArrayGetGradState($self->handle))); - } -} - -=head2 detach - - Returns a new NDArray, detached from the current graph. -=cut - -method detach() -{ - my $handle = check_call(AI::MXNetCAPI::NDArrayDetach($self->handle)); - return __PACKAGE__->_ndarray_cls($handle); -} - -=head2 attach_grad - - Attach a gradient buffer to this NDArray, so that `backward` - can compute gradient with respect to it. - - Parameters - ---------- - GradReq :$grad_req='write' : {'write', 'add', 'null'} - How gradient will be accumulated. - - 'write': gradient will be overwritten on every backward. - - 'add': gradient will be added to existing value on every backward. - - 'null': do not compute gradient for this NDArray. - Maybe[Str] :$stype= : str, optional - The storage type of the gradient array. Defaults to the same stype of this NDArray. 
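    A minimal sketch of the autograd round trip (assuming the record interface
    mirrors the Python API):

        my $x = mx->nd->array([1, 2, 3]);
        $x->attach_grad;
        my $y;
        mx->autograd->record(sub { $y = $x * 2 });
        $y->backward;
        print $x->grad->aspdl;   # [2 2 2]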
-=cut - -method attach_grad(GradReq :$grad_req='write', Maybe[Str] :$stype=) -{ - my $grad; - if(defined $stype) - { - $grad = __PACKAGE__->zeros($self->shape, stype => $stype); - } - else - { - $grad = $self->zeros_like; - } - $grad_req = GRAD_REQ_MAP->{$grad_req}; - check_call( - AI::MXNetCAPI::AutogradMarkVariables( - 1, - [$self->handle], - [$grad_req], - [$grad->handle] - ) - ); -} - -=head2 grad - - Returns gradient buffer attached to this NDArray. -=cut - -method grad() -{ - my $handle = check_call(AI::MXNetCAPI::NDArrayGetGrad($self->handle)); - return undef unless defined $handle; - return __PACKAGE__->_ndarray_cls($handle); -} - -=head2 backward - - Compute the gradients of this NDArray w.r.t variables. - - Parameters - ---------- - :$out_grad= : NDArray, optional - Gradient with respect to head. - :$retain_graph=0 : bool, optional - Whether to retain the computaion graph for another backward - pass on the same graph. By default the computaion history - is cleared. - :$train_mode=1 : bool, optional - Whether to compute gradient for training or inference. -=cut - -method backward(Maybe[AI::MXNet::NDArray] :$out_grad=, Bool :$retain_graph=0, Bool :$train_mode=1) -{ - check_call( - AI::MXNetCAPI::AutogradBackwardEx( - 1, - [$self->handle], - [defined $out_grad ? $out_grad->handle : undef], - 0, - [], - $retain_graph, - 0, - $train_mode - ) - ) -} - -method CachedOp(@args) { AI::MXNet::CachedOp->new(@args) } - -method histogram(@args) { __PACKAGE__->_histogram(@args%2 ? ('data', @args) : @args) } - -my $lvalue_methods = join "\n", map {"use attributes 'AI::MXNet::NDArray', \\&AI::MXNet::NDArray::$_, 'lvalue';"} -qw/at slice aspdl asmpdl reshape copy sever T astype as_in_context copyto empty zero ones full - array/; -eval << "EOV" if ($^V and $^V >= 5.006007); -{ - no warnings qw(misc); - $lvalue_methods -} -EOV - -sub contrib { 'AI::MXNet::Contrib::NDArray' } -sub random { 'AI::MXNet::Random' } -sub sparse { 'AI::MXNet::NDArray::Sparse' } -sub linalg { 'AI::MXNet::LinAlg::NDArray' } -sub image { 'AI::MXNet::Image::NDArray' } - -__PACKAGE__->meta->make_immutable; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Base.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Base.pm deleted file mode 100644 index 6b4b3e1ca516..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Base.pm +++ /dev/null @@ -1,237 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::NDArray::Base; -use strict; -use warnings; -use AI::MXNet::Base; -use AI::MXNet::NDArray::Doc; -use Mouse; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::NDArray::Base -=cut - -=head1 DESCRIPTION - - This module provides a convenient interface to a C++ functions - that work with NDArray. 
- Essentially it loads them up during the lib startup into the Perl space. -=cut - -my %function_meta; -method function_meta($code) -{ - return $function_meta{$code}; -} - -method function_meta_hash() -{ - return \%function_meta; -} - -func _make_ndarray_function($handle, $func_name) -{ - my ($real_name, $desc, $arg_names, - $arg_types, $arg_descs, $key_var_num_args, - $ret_type) = @{ check_call(AI::MXNetCAPI::SymbolGetAtomicSymbolInfo($handle)) }; - $ret_type //= ''; - my $doc_str = build_doc($func_name, - $desc, - $arg_names, - $arg_types, - $arg_descs, - $key_var_num_args, - $ret_type - ); - my %ndarguments; - my @arguments; - my %arguments = (out => 1, name => 1, ctx => 1, shape => 1); - my $j = 0; - for my $i (0..(@$arg_names-1)) - { - if(not $arg_types->[$i] =~ /^(?:NDArray|Symbol|ndarray\-or\-symbol)/) - { - push @arguments, $arg_names->[$i]; - $arguments{ $arg_names->[$i] } = 1; - } - else - { - $ndarguments{ $arg_names->[$i] } = $j++; - } - } - my $generic_ndarray_function = sub - { - my $class = shift; - my (@args, %kwargs, %ndkwargs, @tmp); - if(@_ and ref $_[-1] eq 'HASH') - { - %kwargs = %{ pop(@_) }; - } - else - { - while(@_ >= 2 and not ref $_[-2]) - { - if(exists $arguments{ $_[-2] }) - { - my $v = pop(@_); - my $k = pop(@_); - $kwargs{ $k } = $v; - } - elsif(exists $ndarguments{ $_[-2] }) - { - my $v = pop(@_); - my $k = pop(@_); - $ndkwargs{ $k } = $v; - } - else - { - unshift(@tmp, pop(@_)); - unshift(@tmp, pop(@_)); - } - } - } - @args = (@_, @tmp); - if(%ndkwargs) - { - for my $k (keys %ndkwargs) - { - $args[$ndarguments{$k}] = $ndkwargs{$k}; - } - } - my @ndargs; - my @pos_args; - for my $i (@args) - { - if(blessed($i) and $i->isa(__PACKAGE__)) - { - push @ndargs, $i->handle; - } - else - { - push @pos_args, $i; - } - if(@pos_args > @arguments) - { - confess("Too many positional arguments"); - } - } - @kwargs{ @arguments[0..$#pos_args] } = @pos_args; - my $original_output; - my $output_vars; - delete $kwargs{name}; - if(grep { $_ eq 'out' } keys %kwargs) - { - $output_vars = delete $kwargs{out}; - $original_output = $output_vars; - unless(ref($output_vars) and ref($output_vars) eq 'ARRAY') - { - $output_vars = [$output_vars]; - } - } - else - { - $output_vars = []; - } - if(blessed($class) and $class->isa(__PACKAGE__) and not @{ $output_vars }) - { - @ndargs = ($class->handle) if not @ndargs; - $class = ref $class; - } - for my $key (keys %kwargs) - { - $kwargs{ $key } = "(" .join(", ", map { defined($_) ? 
$_ : 'None' } @{ $kwargs{ $key } }) .")" - if ref $kwargs{ $key } eq 'ARRAY'; - } - my ($out, $stypes) = check_call(AI::MXNetCAPI::ImperativeInvokeEx( - $handle, - scalar(@ndargs), - \@ndargs, - [map { $_->handle } @$output_vars], - scalar(keys %kwargs), - \%kwargs) - ); - return $original_output if $original_output; - if(@$out == 1) - { - return __PACKAGE__->_ndarray_cls($out->[0], 1, $stypes->[0]); - } - else - { - my $i = 0; - return [map { __PACKAGE__->_ndarray_cls($_, 1, $stypes->[$i++]) } @$out]; - } - }; - $function_meta{ $generic_ndarray_function }{__name__} = $func_name; - $function_meta{ $generic_ndarray_function }{__doc__} = $doc_str; - return $generic_ndarray_function; -} - -method _ndarray_cls($handle, $writable=1, $stype=STORAGE_TYPE_UNDEFINED) -{ - if($stype eq STORAGE_TYPE_UNDEFINED) - { - $stype = __PACKAGE__->_storage_type($handle); - } - if($stype eq STORAGE_TYPE_DEFAULT) - { - return AI::MXNet::NDArray->new(handle => $handle, writable => $writable); - } - elsif($stype eq STORAGE_TYPE_CSR) - { - return AI::MXNet::NDArray::CSR->new(handle => $handle, writable => $writable); - } - elsif($stype eq STORAGE_TYPE_ROW_SPARSE) - { - return AI::MXNet::NDArray::RowSparse->new(handle => $handle, writable => $writable); - } - else - { - confess("unknown storage type: $stype"); - } -} - -method _storage_type($handle) -{ - scalar(check_call(AI::MXNetCAPI::NDArrayGetStorageType($handle))); -} - -method stype() -{ - return STORAGE_TYPE_ID_TO_STR->{ __PACKAGE__->_storage_type($self->handle) }; -} - -method _init_ndarray_module() -{ - my $op_names = check_call(AI::MXNetCAPI::ListAllOpNames()); - for my $name (@$op_names) - { - my $handle = check_call(AI::NNVMCAPI::GetOpHandle($name)); - my $function = _make_ndarray_function($handle, $name); - { - no strict 'refs'; - *{__PACKAGE__."::$name"} = $function; - } - } -} - - -__PACKAGE__->_init_ndarray_module; - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Doc.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Doc.pm deleted file mode 100644 index fc44812f2cff..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Doc.pm +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::NDArray::Doc; -use strict; -use warnings; -use AI::MXNet::Base; -use Exporter; -use base qw(Exporter); -@AI::MXNet::NDArray::Doc::EXPORT = qw(build_doc); - -=head2 - - Build docstring for imperative functions. -=cut - -sub build_doc -{ - my ($func_name, - $desc, - $arg_names, - $arg_types, - $arg_desc, - $key_var_num_args, - $ret_type) = @_; - my $param_str = build_param_doc($arg_names, $arg_types, $arg_desc); - if($key_var_num_args) - { - $desc .= "\nThis function support variable length of positional input." - } - my $doc_str = sprintf("%s\n\n" . - "%s\n" . 
- "out : NDArray, optional\n" . - " The output NDArray to hold the result.\n\n". - "Returns\n" . - "-------\n" . - "out : NDArray or list of NDArray\n" . - " The output of this function.", $desc, $param_str); - return $doc_str -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Slice.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Slice.pm deleted file mode 100644 index 1a3ea7e0a460..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Slice.pm +++ /dev/null @@ -1,120 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::NDArray::Slice; -use strict; -use warnings; -use Mouse; -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::NDArray::Slice - A convenience class for slicing of the AI::MXNet::NDArray objects. -=cut - -has parent => (is => 'ro', isa => 'AI::MXNet::NDArray', required => 1); -has begin => (is => 'ro', isa => 'Shape', required => 1); -has end => (is => 'ro', isa => 'Shape', required => 1); -use overload - '.=' => \&set, - '=' => sub { $_[0] }, - '""' => sub { my $self = $_[0]->sever; "$self" }, - '**' => sub { my $self = $_[0]->sever; $self ** $_[1] }, - '==' => sub { my $self = $_[0]->sever; $self == $_[1] }, - '!=' => sub { my $self = $_[0]->sever; $self != $_[1] }, - '+' => sub { my $self = $_[0]->sever; $self + $_[1] }, - '*' => sub { my $self = $_[0]->sever; $self * $_[1] }, - '-' => sub { my $self = $_[0]->sever; $_[2] ? $_[1] - $self : $self - $_[1] }, - '/' => sub { my $self = $_[0]->sever; $_[2] ? $_[1] / $self : $self / $_[1] }, - '+=' => sub { my ($self, $other) = @_; my $in = $self->sever; $self .= ($in+$_[1]) }, - '-=' => sub { my ($self, $other) = @_; my $in = $self->sever; $self .= ($in-$_[1]) }, - '*=' => sub { my ($self, $other) = @_; my $in = $self->sever; $self .= ($in*$_[1]) }, - '/=' => sub { my ($self, $other) = @_; my $in = $self->sever; $self .= ($in/$_[1]) }, - '**='=> sub { my ($self, $other) = @_; my $in = $self->sever; $self .= ($in**$_[1]) }, - '>' => sub { my $self = $_[0]->sever; return $_[2] ? $_[1] > $self : $self > $_[1] }, - '>=' => sub { my $self = $_[0]->sever; return $_[2] ? $_[1] >= $self : $self >= $_[1] }, - '<' => sub { my $self = $_[0]->sever; return $_[2] ? $_[1] < $self : $self < $_[1] }, - '<=' => sub { my $self = $_[0]->sever; return $_[2] ? 
$_[1] <= $self : $self <= $_[1] }; - -method set(AcceptableInput $value, $reverse=) -{ - confess("set value must be defined") unless defined $value; - confess("${\ $self->parent } is not writable") unless $self->parent->writable; - my $shape = [ map { - my($begin, $end) = @$_; - ($end-$begin); - } zip($self->begin, $self->end) ]; - if(ref $value) - { - if(blessed($value) and $value->isa('AI::MXNet::NDArray')) - { - $value = $value->as_in_context($self->parent->context); - } - elsif(blessed($value) and $value->isa('AI::MXNet::NDArray::Slice')) - { - $value = $value->sever->as_in_context($self->parent->context); - } - else - { - $value = AI::MXNet::NDArray->array($value, ctx => $self->parent->context); - } - confess("value $value does not match slice dim sizes [@$shape]") - if @{$value->shape} != @$shape; - for(zip($shape, $value->shape)) { - my ($dsize, $vdsize) = @$_; - confess("Slice [@$shape] != $value given as value") - if $dsize != $vdsize; - } - AI::MXNet::NDArray->_crop_assign( - $self->parent, - $value, - { out => $self->parent, begin => $self->begin, end => $self->end } - ); - } - else - { - AI::MXNet::NDArray->_crop_assign_scalar( - $self->parent, - { "scalar" => $value, out => $self->parent, begin => $self->begin, end => $self->end } - ); - } - return $self->parent; -} - -method sever() -{ - return AI::MXNet::NDArray->crop( - $self->parent, - { begin => $self->begin, end => $self->end } - ); -} - -{ - no warnings 'misc'; - use attributes 'AI::MXNet::NDArray::Slice', \&AI::MXNet::NDArray::Slice::sever, 'lvalue'; -} - -sub notsupported { confess("NDArray only support continuous slicing on axis 0"); } -sub AUTOLOAD { - my $sub = $AI::MXNet::NDArray::Slice::AUTOLOAD; - $sub =~ s/.*:://; - my $self = shift; - return $self->sever->$sub(@_); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Sparse.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Sparse.pm deleted file mode 100644 index 53e6b6362e89..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray/Sparse.pm +++ /dev/null @@ -1,1433 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::NDArray::Sparse; -use strict; -use warnings; -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; -use AI::MXNet::RunTime; -use Mouse; -extends 'AI::MXNet::NDArray'; - -=head1 NAME - - AI::MXNet::NDArray::Sparse - Sparse NDArray API of MXNet -=cut - -=head1 DESCRIPTION - - The base class of an NDArray stored in a sparse storage format. - See AI::MXNet::NDArray::CSR and AI::MXNet::NDArray::RowSparse for more details. 
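    For instance, a minimal sketch of creating the two sparse storage types:

        my $csr = mx->nd->sparse->zeros('csr', [3, 4]);
        my $rsp = mx->nd->sparse->zeros('row_sparse', [3, 4]);
        print $csr->stype;   # 'csr'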
-=cut - -method _new_alloc_handle( - Stype $stype, - Shape $shape, - AI::MXNet::Context $ctx, - Bool $delay_alloc, - Dtype $dtype, - AuxTypes $aux_types, - Maybe[ArrayRef[Shape]] $aux_shapes= -) -{ - confess("only int64 is supported for aux types") - if (grep { $_ ne 'int64' } @$aux_types); - my $aux_type_ids = [map { DTYPE_STR_TO_MX->{$_} } @$aux_types]; - $aux_shapes //= [map { [0] } @$aux_types]; - my $aux_shape_lens = [map { scalar(@$_) } @$aux_shapes]; - @$aux_shapes = map { @$_ } @$aux_shapes; - my $num_aux = @{ $aux_types }; - my $sub = AI::MXNet::RunTime->Features()->is_enabled('INT64_TENSOR_SIZE') - ? \&AI::MXNetCAPI::NDArrayCreateSparseEx64 - : \&AI::MXNetCAPI::NDArrayCreateSparseEx; - my $handle = check_call( - $sub->( - STORAGE_TYPE_STR_TO_ID->{$stype}, - $shape, - scalar(@$shape), - $ctx->device_type_id, - $ctx->device_id, - $delay_alloc, - DTYPE_STR_TO_MX->{$dtype}, - scalar(@$aux_types), - $aux_type_ids, - $aux_shape_lens, - $aux_shapes - ) - ); -} - -method _class_name() -{ - my $class = ref $self || $self; - $class; -} - -sub not_implemented { confess "Not implemented" } -use overload '""' => sub { - my $self = shift; - my $shape_info = join('x', @{ $self->shape }); - sprintf("\n<%s, %s @%s>", $self->_class_name, $shape_info, $self->context); - }, - '+' => \&add, - '-' => \&subtract, - '*' => \&multiply, - '/' => \÷, - '+=' => \¬_implemented, - '-=' => \¬_implemented, - '*=' => \¬_implemented, - '/=' => \¬_implemented; - -method add(AI::MXNet::NDArray|Num $other, $reverse=) -{ - if(blessed $other and join(',', @{ $self->shape }) eq join(',', @{ $other->shape })) - { - return AI::MXNet::NDArray::_ufunc_helper( - $self, - $other, - qw/elemwise_add _plus_scalar/ - ); - } - else - { - return AI::MXNet::NDArray::_ufunc_helper( - $self, - $other, - qw/broadcast_add _plus_scalar/ - ); - } -} - - -method subtract(AI::MXNet::NDArray|Num $other, $reverse=) -{ - if(blessed $other and join(',', @{ $self->shape }) eq join(',', @{ $other->shape })) - { - return AI::MXNet::NDArray::_ufunc_helper( - $self, - $other, - qw/elemwise_sub _minus_scalar _rminus_scalar/, - $reverse - ); - } - else - { - return AI::MXNet::NDArray::_ufunc_helper( - $self, - $other, - qw/broadcast_sub _minus_scalar _rminus_scalar/, - $reverse - ); - } -} - -method multiply(AI::MXNet::NDArray|Num $other, $reverse=) -{ - if(blessed $other and join(',', @{ $self->shape }) eq join(',', @{ $other->shape })) - { - return AI::MXNet::NDArray::_ufunc_helper( - $self, - $other, - qw/elemwise_mul _mul_scalar/, - ); - } - else - { - return AI::MXNet::NDArray::_ufunc_helper( - $self, - $other, - qw/broadcast_mul _mul_scalar/, - ); - } -} - -method divide(AI::MXNet::NDArray|Num $other, $reverse=) -{ - if(blessed $other and join(',', @{ $self->shape }) eq join(',', @{ $other->shape })) - { - return AI::MXNet::NDArray::_ufunc_helper( - $self, - $other, - qw/elemwise_div _div_scalar _rdiv_scalar/, - $reverse - ); - } - else - { - return AI::MXNet::NDArray::_ufunc_helper( - $self, - $other, - qw/broadcast_div _div_scalar _rdiv_scalar/, - $reverse - ); - } -} - -{ - no warnings 'redefine'; - *_sync_copyfrom = *_at = *_slice = *reshape = *size = \¬_implemented; -} - -method _aux_type(Int $i) -{ - return DTYPE_MX_TO_STR->{ - check_call( - AI::MXNetCAPI::NDArrayGetAuxType( - $self->handle, $i - ) - ) - } -} - -method _num_aux() -{ - return scalar(@{ STORAGE_AUX_TYPES->{ $self->stype } }); -} - -method _aux_types() -{ - [map { $self->_aux_type($_) } 0..$self->_num_aux-1]; -} - -=head2 aspdl - - Return a dense PDL object with value copied 
from this array -=cut - -method aspdl() -{ - return $self->tostype('default')->aspdl; -} - -=head2 astype - - Returns a copy of the array after casting to a specified type. - Parameters - ---------- - dtype : Dtype - The type of the returned array. - Examples - -------- - >>> $x = mx->nd->sparse->zeros('row_sparse', [2,3], dtype=>'float32') - >>> $y = $x->astype('int32') - >>> $y->dtype - -=cut - -method astype(Dtype $dtype) -{ - my $res = $self->zeros( - $self->stype, $self->shape, ctx => $self->context, - dtype => $dtype - ); - $self->copyto($res); - return $res; -} - -=head2 copyto - - Copies the value of this array to another array. - - Parameters - ---------- - other : NDArray or NDArray::CSR or NDArray::RowSparse or Context - The destination array or context. - - Returns - ------- - NDArray or CSRNDArray::CSR or NDArray::RowSparse - The copied array. -=cut - -method copyto(AI::MXNet::NDArray|AI::MXNet::Context $other) -{ - if($other->isa('AI::MXNet::NDArray')) - { - if($self->handle eq $other->handle) - { - Carp::cluck('You are attempting to copy an array to itself'); - return; - } - else - { - return __PACKAGE__->_copyto($self, out => $other); - } - } - elsif($other->isa('AI::MXNet::Context')) - { - my $hret = __PACKAGE__->_ndarray_cls( - __PACKAGE__->_new_alloc_handle( - $self->stype, $self->shape, $other, 1, $self->dtype, $self->_aux_types - ) - ); - return __PACKAGE__->_copyto($self, out=>$hret) - } -} - -=head2 check_format - - Check whether the NDArray format is valid. - - Parameters - ---------- - full_check : bool, optional - If `True`, rigorous check, O(N) operations. Otherwise - basic check, O(1) operations (default True). -=cut - -method check_format(Bool $full_check=1) -{ - scalar(check_call(AI::MXNetCAPI::NDArraySyncCheckFormat($self->handle, $full_check))); -} - -=head2 _data - - A deep copy NDArray of the data array associated with the BaseSparseNDArray. - - This function blocks. Do not use it in performance critical code. -=cut - -method _data() -{ - $self->wait_to_read; - my $handle = check_call(AI::MXNetCAPI::NDArrayGetDataNDArray($self->handle)); - return AI::MXNet::NDArray->new(handle => $handle); -} - -=head2 _aux_data - - Get a deep copy NDArray of the i-th aux data array associated with the - AI::MXNet::NDArray::Sparse - - This function blocks. Do not use it in performance critical code. -=cut - -method _aux_data(Int $i) -{ - $self->wait_to_read; - my $handle = check_call(AI::MXNetCAPI::NDArrayGetAuxNDArray($self->handle, $i)); - return AI::MXNet::NDArray->new(handle => $handle); -} - -package AI::MXNet::NDArray::CSR; -use AI::MXNet::Base; -use Mouse; -extends 'AI::MXNet::NDArray::Sparse'; - -=head1 NAME - - AI::MXNet::NDArray::CSR - A sparse representation of 2D NDArray in the Compressed Sparse Row format. -=cut - -=head1 DESCRIPTION - - A AI::MXNet::NDArray::CSR represents an AI::MXNet::NDArray as three separate arrays: `data`, - `indptr` and `indices`. It uses the CSR representation where the column indices for - row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their corresponding values are stored - in ``data[indptr[i]:indptr[i+1]]``. - - The column indices for a given row are expected to be sorted in ascending order. - Duplicate column entries for the same row are not allowed. 
- - Example - ------- - >>> $a = mx->nd->array([[0, 1, 0], [2, 0, 0], [0, 0, 0], [0, 0, 3]]); - >>> $a = $a->tostype('csr'); - >>> $a->data->aspdl; - [ 1 2 3] - >>> $a->indices->aspdl - [1 0 2] - >>> $a->indptr->aspdl - [0 1 2 2 3] - - See Also - -------- - csr_matrix: Several ways to construct a CSRNDArray -=cut - -use overload '+=' => sub { ($_[0] + $_[1])->copyto($_[0]) }, - '-=' => sub { ($_[0] - $_[1])->copyto($_[0]) }, - '*=' => sub { ($_[0] * $_[1])->copyto($_[0]) }, - '/=' => sub { ($_[0] / $_[1])->copyto($_[0]) }; - -=head2 slice - - Returns a newly created array based on the indexing key. - - Parameters - ---------- - key : int or array ref - Indexing key. - - Examples - -------- - >>> $indptr = [0, 2, 3, 6]; - >>> $indices = [0, 2, 2, 0, 1, 2]; - >>> $data = [1, 2, 3, 4, 5, 6]; - >>> $a = mx->nd->sparse->csr_matrix([$data, $indices, $indptr], shape=>[3, 3]) - >>> $a->aspdl - [[ 1 0 2] - [ 0 0 3] - [ 4 5 6]] - >>> $a->slice([1,2])->aspdl - [[ 0 0 3]] - >>> $a->slice(1)->aspdl - [[ 0 0 3]] - >>> $a->[-1]->aspdl - [[ 4 5 6]] -=cut - -method slice(Slice|InternalSlice @slices) -{ - if(grep { /^begin|end|slice$/ } @slices) - { - return $self->SUPER::slice(@slices); - } - my $slice = $slices[0]; - my ($begin, $end); - if(not ref $slice) - { - if($slice < 0) - { - $begin = $self->shape->[0] + $slice; - } - else - { - $begin = $slice; - } - $end = $begin; - } - else - { - ($begin, $end) = @{ $slice }; - $end //= $self->shape->[0] - 1; - if($begin < 0) - { - $begin += $self->shape->[0]; - } - if($end < 0) - { - $end += $self->shape->[0]; - } - } - return $self->SUPER::slice(begin => $begin, end => $end + 1); -} - - -=head2 set - - Set self to value. Also usable as overloaded .= - - Parameters - ---------- - value : AI::MXNet::NDArray or AI::MXNet::NDArray::CSR - or PDL/PDL::CCS::Nd/perl array ref in PDL constructor format - The value to set. - - Examples - -------- - >>> $src = mx->nd->sparse->zeros('csr', [3,3]) - >>> $src->aspdl - [[ 0 0 0] - [ 0 0 0] - [ 0 0 0]] - >>> # AI::MXNet::NDArray::CSR with same storage type - >>> $x = mx->nd->ones('row_sparse', [3,3])->tostype('csr') - >>> $x .= $src - >>> $x->aspdl - [[ 1 1 1] - [ 1 1 1] - [ 1 1 1]] - >>> # assign NDArray to AI::MXNet::NDArray::CSR - >>> $x .= mx->nd->ones([3,3]) * 2 - >>> $x->aspdl - [[ 2 2 2] - [ 2 2 2] - [ 2 2 2]] -=cut - -method set(AcceptableInput $other, $reverse=) -{ - confess('Failed to assign to a readonly CSR') unless $self->writable; - if($other->isa('AI::MXNet::NDArray')) - { - if($self->handle ne $other->handle) - { - $other->copyto($self); - } - } - else - { - my $tmp = __PACKAGE__->array($other, ((not ref $other) ? ( pdl => $self->aspdl) : ())); - $tmp->copyto($self); - } -} - -use overload '.=' => \&set; - -=head2 indices - - A deep copy NDArray of the indices array of the AI::MXNet::NDArray::CSR. - This generates a deep copy of the column indices of the current `csr` matrix. - - Returns - ------- - NDArray - This AI::MXNet::NDArray::CSR indices array. -=cut - -method indices() -{ - return $self->_aux_data(1); -} - -=head2 indptr - - A deep copy NDArray of the inptr array of the AI::MXNet::NDArray::CSR. - This generates a deep copy of the indptr of the current `csr` matrix. - - Returns - ------- - NDArray - This AI::MXNet::NDArray::CSR indptr array. -=cut - -method indptr() -{ - return $self->_aux_data(0); -} - -=head2 data - - A deep copy NDArray of the data array of the AI::MXNet::NDArray::CSR. - This generates a deep copy of the data of the current `csr` matrix. 
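    For example, a minimal sketch tying together the data/indices/indptr accessors:

        my $a = mx->nd->array([[1, 0], [0, 2]])->tostype('csr');
        print $a->data->aspdl;      # [1 2]
        print $a->indices->aspdl;   # [0 1]
        print $a->indptr->aspdl;    # [0 1 2]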
-
-    Returns
-    -------
-    NDArray
-        This AI::MXNet::NDArray::CSR data array.
-=cut
-
-method data()
-{
-    return $self->_data;
-}
-
-=head2 tostype
-
-    Returns a copy of the array with the chosen storage type.
-
-    Returns
-    -------
-    NDArray or AI::MXNet::NDArray::CSR
-        A copy of the array with the chosen storage stype
-=cut
-
-method tostype(Stype $stype)
-{
-    if($stype eq 'row_sparse')
-    {
-        confess("cast_storage from csr to row_sparse is not supported");
-    }
-    return $self->cast_storage(stype => $stype);
-}
-
-=head2 copyto
-
-    Copies the value of this array to another array.
-
-    If $other is an AI::MXNet::NDArray or AI::MXNet::NDArray::CSR object, then $other->shape and
-    $self->shape should be the same. This function copies the value from
-    $self to $other.
-
-    If $other is a context, a new AI::MXNet::NDArray::CSR will first be created on
-    the target context, and the value of $self is copied.
-
-    Parameters
-    ----------
-    $other : AI::MXNet::NDArray or AI::MXNet::NDArray::CSR or AI::MXNet::Context
-        The destination array or context.
-
-    Returns
-    -------
-    AI::MXNet::NDArray or AI::MXNet::NDArray::CSR
-=cut
-
-method copyto(AI::MXNet::Context|AI::MXNet::NDArray $other)
-{
-    if($other->isa('AI::MXNet::Context'))
-    {
-        return $self->SUPER::copyto($other);
-    }
-    else
-    {
-        my $stype = $other->stype;
-        if($stype eq 'default' or $stype eq 'csr')
-        {
-            return $self->SUPER::copyto($other);
-        }
-        else
-        {
-            confess("copyto does not support destination NDArray stype $stype");
-        }
-    }
-}
-
-=head2 aspdlccs
-
-    Returns a PDL::CCS::Nd object with the value copied from this array
-=cut
-
-method aspdlccs()
-{
-    return ascsr($self->data->aspdl, $self->indptr->aspdl, $self->indices->aspdl, $self->shape);
-}
-
-package AI::MXNet::NDArray::RowSparse;
-use Mouse;
-extends 'AI::MXNet::NDArray::Sparse';
-
-=head1 NAME
-
-    AI::MXNet::NDArray::RowSparse - A sparse representation of a set of NDArray row slices at given indices.
-=cut
-
-=head1 DESCRIPTION
-
-    An AI::MXNet::NDArray::RowSparse represents a multidimensional NDArray using two separate arrays: `data` and
-    `indices`. The number of dimensions has to be at least 2.
-
-    - data: an NDArray of any dtype with shape [D0, D1, ..., Dn].
-    - indices: a 1-D int64 NDArray with shape [D0] with values sorted in ascending order.
-
-    The `indices` array stores the indices of the row slices with non-zeros,
-    while the values are stored in `data`. The corresponding NDArray ``dense``
-    represented by AI::MXNet::NDArray::RowSparse ``rsp`` has
-
-    ``dense[rsp.indices[i], :, :, :, ...] = rsp.data[i, :, :, :, ...]``
-
-    >>> $dense->aspdl
-    [[ 1 2 3 ]
-     [ 0 0 0 ]
-     [ 4 0 5 ]
-     [ 0 0 0 ]
-     [ 0 0 0 ]]
-    >>> $rsp = $dense->tostype('row_sparse');
-    >>> $rsp->indices->aspdl
-    [ 0 2 ]
-    >>> $rsp->data->aspdl
-    [[ 1 2 3 ]
-     [ 4 0 5 ]]
-
-    An AI::MXNet::NDArray::RowSparse is typically used to represent non-zero row slices of a large NDArray
-    of shape [LARGE0, D1, .. , Dn] where LARGE0 >> D0 and most row slices are zeros.
-
-    AI::MXNet::NDArray::RowSparse is used principally in the definition of gradients for operations
-    that have sparse gradients (e.g. sparse dot and sparse embedding).
-
-    See Also
-    --------
-    row_sparse_array: Several ways to construct an AI::MXNet::NDArray::RowSparse
-=cut
-
-use overload '+=' => sub { ($_[0] + $_[1])->copyto($_[0]) },
-             '-=' => sub { ($_[0] - $_[1])->copyto($_[0]) },
-             '*=' => sub { ($_[0] * $_[1])->copyto($_[0]) },
-             '/=' => sub { ($_[0] / $_[1])->copyto($_[0]) };
-
-method slice(@args) { confess("not implemented") }
-
-=head2 set
-
-    Sets self to the value. Also usable as the overloaded .= operator.
-
-    Parameters
-    ----------
-    value : AI::MXNet::NDArray or AI::MXNet::NDArray::CSR
-            or PDL/PDL::CCS::Nd/perl array ref in PDL constructor format
-        The value to set.
-
-    Examples
-    --------
-    >>> $src = mx->nd->sparse->zeros('row_sparse', [3,3])
-    >>> $src->aspdl
-    [[ 0 0 0]
-     [ 0 0 0]
-     [ 0 0 0]]
-    >>> # AI::MXNet::NDArray::RowSparse with the same storage type
-    >>> $x = mx->nd->ones([3,3])->tostype('row_sparse')
-    >>> $src .= $x
-    >>> $src->aspdl
-    [[ 1 1 1]
-     [ 1 1 1]
-     [ 1 1 1]]
-    >>> # assign an NDArray to AI::MXNet::NDArray::RowSparse
-    >>> $x .= mx->nd->ones([3,3]) * 2
-    >>> $x->aspdl
-    [[ 2 2 2]
-     [ 2 2 2]
-     [ 2 2 2]]
-=cut
-
-method set(AcceptableInput $other, $reverse=)
-{
-    confess('Failed to assign to a readonly RowSparse') unless $self->writable;
-    if($other->isa('AI::MXNet::NDArray'))
-    {
-        if($self->handle ne $other->handle)
-        {
-            $other->copyto($self);
-        }
-    }
-    else
-    {
-        my $tmp = __PACKAGE__->array($other, ((not ref $other) ? ( pdl => $self->aspdl) : ()));
-        $tmp->copyto($self);
-    }
-}
-
-use overload '.=' => \&set;
-
-=head2 data
-
-    A deep copy NDArray of the data array of the AI::MXNet::NDArray::RowSparse.
-    This generates a deep copy of the data of the current `row_sparse` matrix.
-
-    Returns
-    -------
-    NDArray
-        This AI::MXNet::NDArray::RowSparse data array.
-=cut
-
-method data()
-{
-    return $self->_data;
-}
-
-=head2 indices
-
-    A deep copy NDArray of the indices array of the AI::MXNet::NDArray::RowSparse.
-    This generates a deep copy of the row indices of the current `row_sparse` matrix.
-
-    Returns
-    -------
-    NDArray
-        This AI::MXNet::NDArray::RowSparse indices array.
-=cut
-
-method indices()
-{
-    return $self->_aux_data(0);
-}
-
-=head2 tostype
-
-    Returns a copy of the array with the chosen storage type.
-
-    Returns
-    -------
-    NDArray or AI::MXNet::NDArray::RowSparse
-        A copy of the array with the chosen storage stype
-=cut
-
-method tostype(Stype $stype)
-{
-    if($stype eq 'csr')
-    {
-        confess("cast_storage from row_sparse to csr is not supported");
-    }
-    return $self->cast_storage(stype => $stype);
-}
-
-
-=head2 copyto
-
-    Copies the value of this array to another array.
-
-    If $other is an AI::MXNet::NDArray or AI::MXNet::NDArray::RowSparse object, then $other->shape and
-    $self->shape should be the same. This function copies the value from
-    $self to $other.
-
-    If $other is a context, a new AI::MXNet::NDArray::RowSparse will first be created on
-    the target context, and the value of $self is copied.
-
-    Parameters
-    ----------
-    $other : AI::MXNet::NDArray or AI::MXNet::NDArray::RowSparse or AI::MXNet::Context
-        The destination array or context.
-
-    Returns
-    -------
-    AI::MXNet::NDArray or AI::MXNet::NDArray::RowSparse
-=cut
-
-method copyto(AI::MXNet::Context|AI::MXNet::NDArray $other)
-{
-    if($other->isa('AI::MXNet::Context'))
-    {
-        return $self->SUPER::copyto($other);
-    }
-    else
-    {
-        my $stype = $other->stype;
-        if($stype eq 'default' or $stype eq 'row_sparse')
-        {
-            return $self->SUPER::copyto($other);
-        }
-        else
-        {
-            confess("copyto does not support destination NDArray stype $stype");
-        }
-    }
-}
-
-package AI::MXNet::NDArray::Sparse;
-
-# Prepare `source_array` so that it can be used to construct NDArray.
-
-package AI::MXNet::NDArray::Sparse;
-
-# Prepare `source_array` so that it can be used to construct NDArray.
-# `source_array` is converted to a `pdl` if it's neither an `NDArray`
-# nor a `pdl`.
-
-method _prepare_src_array($source_array, Dtype $dtype)
-{
-    my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{ $dtype });
-    if(not blessed($source_array))
-    {
-        $source_array = eval {
-            pdl($pdl_type, $source_array);
-        };
-        confess($@) if $@;
-    }
-    elsif($source_array->isa('AI::MXNet::NDArray'))
-    {
-        return $source_array;
-    }
-    $source_array = pdl($pdl_type, [@{ $source_array->unpdl } ? $source_array->unpdl->[0] : 0 ]) unless @{ $source_array->shape->unpdl };
-    return $source_array;
-}
-
-# Prepare the value of dtype if `dtype` is undef. If `src_array` is an NDArray, PDL
-# or PDL::CCS::Nd, return src_array->dtype. float32 is returned otherwise.
-
-method _prepare_default_dtype($src_array, $dtype)
-{
-    if(not defined $dtype)
-    {
-        if(blessed $src_array)
-        {
-            $dtype = $src_array->dtype;
-        }
-        else
-        {
-            $dtype = 'float32';
-        }
-    }
-    return $dtype;
-}
-
-use Data::Dumper;
-method _check_shape($s1, $s2)
-{
-    my ($ps1, $ps2) = map { (blessed($_) and $_->isa('AI::MXNet::NDArray')) ? pdl($_->shape) : blessed($_) ? $_ : pdl($_) } ($s1, $s2);
-    return 1 unless defined $s2;
-    ($ps1 == $ps2)->all
-        or
-    confess("Shape mismatch detected. " . Dumper(blessed($s1) ? $s1->unpdl : $s1) . " v.s. " . Dumper(blessed($s2) ? $s2->unpdl : $s2));
-}
-
-method coo_matrix(@args)
-{
-    my ($data, $row, $col, $shape) = map { blessed $_ ? $_ : pdl($_) } @args;
-    my @which;
-    for (my $i = 0; $i < $row->nelem; $i++)
-    {
-        push @which, [$row->at($i), $col->at($i)];
-    }
-    return PDL::CCS::Nd->newFromWhich(
-        pdl(\@which), $data, pdims => $shape
-    )->xchg(0, 1);
-}
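-
-# Sketch (illustrative, not from the original file): the helper above turns
-# COO triplets into a PDL::CCS::Nd, e.g. entries (0,1)=1 and (3,0)=2 of a
-# 4x3 matrix:
-#
-#   my $ccs = __PACKAGE__->coo_matrix(
-#       [1, 2],    # data
-#       [0, 3],    # row indices
-#       [1, 0],    # column indices
-#       [4, 3]     # dense shape
-#   );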
-
-=head2 csr_matrix
-
-    Creates an AI::MXNet::NDArray::CSR, a 2D array with compressed sparse row (CSR) format.
-
-    The AI::MXNet::NDArray::CSR can be instantiated in several ways:
-
-    - csr_matrix($arg1, Maybe[AI::MXNet::Context] :$ctx=, Maybe[Shape] :$shape=, Maybe[Dtype] :$dtype=)
-      $ctx, $shape, $dtype are optional;
-      $arg1 can be given in the following variants:
-
-    - to construct an AI::MXNet::NDArray::CSR with a dense 2D array $arg1
-        - $arg1 is in AI::MXNet::NDArray::array input format
-
-    - to construct an AI::MXNet::NDArray::CSR with a sparse 2D array $arg1
-        - $arg1 is AI::MXNet::NDArray::CSR or PDL::CCS::Nd
-          A sparse matrix.
-          PDL::CCS::Nd is expected to be converted internally into CSR format;
-          AI::MXNet injects a 'tocsr' method into the PDL and PDL::CCS::Nd modules for this purpose.
-
-    - to construct an empty AI::MXNet::NDArray::CSR with shape $arg1 = [$M, $N]
-        - $M - Number of rows in the matrix
-        - $N - Number of columns in the matrix
-
-    - to construct an AI::MXNet::NDArray::CSR based on the definition of compressed sparse row format
-      using three separate arrays,
-      where the column indices for row i are stored in ``indices[indptr[i]:indptr[i+1]]``
-      and their corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.
-      The column indices for a given row are expected to be **sorted in ascending order**.
-      Duplicate column entries for the same row are not allowed.
-      In this case $arg1 = [$data, $indices, $indptr];
-      $data, $indices, $indptr must be given in the AI::MXNet::NDArray::array input format
-        - $data - holds all the non-zero entries of the matrix in row-major order.
-        - $indices - stores the column index for each non-zero element in $data.
-        - $indptr - stores the offset into $data of the first non-zero element of each
-          row of the matrix.
-
-    - to construct an AI::MXNet::NDArray::CSR based on the COOrdinate format
-      using three separate arrays,
-      where ``row[i]`` is the row index of the element,
-      ``col[i]`` is the column index of the element
-      and ``data[i]`` is the data corresponding to the element. All the missing
-      elements in the input are taken to be zeroes.
-      In this case $arg1 = [$data, [$row, $col]];
-      $data, $row, $col must be given in the AI::MXNet::NDArray::array input format
-        - $data - holds all the non-zero entries of the matrix in COO format.
-        - $row - stores the row index for each non-zero element in $data.
-        - $col - stores the column index for each non-zero element in $data.
-
-    Returns
-    -------
-    AI::MXNet::NDArray::CSR
-        An AI::MXNet::NDArray::CSR with the 'csr' storage representation.
-
-    Example
-    -------
-    >>> $a = mx->nd->sparse->csr_matrix([[1, 2, 3], [1, 0, 2], [0, 1, 2, 2, 3]], shape => [4, 3])
-    >>> $a->aspdl
-    [[ 0 1 0]
-     [ 2 0 0]
-     [ 0 0 0]
-     [ 0 0 3]]
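-
-    A COO-format construction (an illustrative sketch using the same API as above):
-
-    >>> $b = mx->nd->sparse->csr_matrix([[7, 8], [[0, 2], [0, 2]]], shape => [3, 3])
-    >>> $b->aspdl
-    [[ 7 0 0]
-     [ 0 0 0]
-     [ 0 0 8]]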
-
-    See Also
-    --------
-    CSRNDArray : MXNet NDArray in compressed sparse row format.
-=cut
-
-method csr_matrix(
-    $arg1,
-    Maybe[Shape|PDL] :$shape=,
-    Maybe[AI::MXNet::Context] :$ctx=AI::MXNet::Context->current_ctx,
-    Maybe[Dtype] :$dtype=
-)
-{
-    if(not defined $arg1)
-    {
-        return __PACKAGE__->empty('csr', $shape, ctx => $ctx, (defined $dtype ? (dtype => $dtype) : ()));
-    }
-    # construct a csr matrix from (M, N) or (data, indices, indptr)
-    if(ref $arg1 eq 'ARRAY')
-    {
-        my $arg_len = @{ $arg1 };
-        if($arg_len == 2)
-        {
-            # construct a sparse csr matrix from
-            # coo-format input: (data, [row, col])
-            if(ref $arg1->[1] eq 'ARRAY' and @{ $arg1->[1] } == 2)
-            {
-                my $coo = __PACKAGE__->coo_matrix($arg1->[0], @{ $arg1->[1] }, $shape);
-                __PACKAGE__->_check_shape($coo->shape, $shape);
-                return __PACKAGE__->array($coo, ctx => $ctx, dtype => $dtype);
-            }
-            else
-            {
-                # empty matrix with shape
-                __PACKAGE__->_check_shape($arg1, $shape);
-                return __PACKAGE__->empty('csr', $arg1, ctx => $ctx, dtype => $dtype);
-            }
-        }
-        elsif($arg_len == 3)
-        {
-            # data, indices, indptr
-            return __PACKAGE__->_csr_matrix_from_definition(
-                @{ $arg1 }, shape => $shape,
-                ctx => $ctx, dtype => $dtype
-            );
-        }
-        else
-        {
-            confess("Unexpected length of input array: " . Dumper($arg1));
-        }
-    }
-    else
-    {
-        # construct a csr matrix from a sparse / dense one
-        if(blessed $arg1 and ($arg1->isa('AI::MXNet::NDArray::CSR') or $arg1->isa('PDL::CCS::Nd')))
-        {
-            # construct a csr matrix from PDL::CCS::Nd or AI::MXNet::NDArray::CSR
-            __PACKAGE__->_check_shape($arg1->shape, $shape);
-            return __PACKAGE__->array($arg1, ctx => $ctx, dtype => $dtype);
-        }
-        elsif(blessed $arg1 and $arg1->isa('AI::MXNet::NDArray::RowSparse'))
-        {
-            confess("Unexpected input type: AI::MXNet::NDArray::RowSparse");
-        }
-        else
-        {
-            # construct a csr matrix from a dense one
-            # prepare default ctx and dtype since mx.nd.array doesn't use default values
-            # based on source_array
-            $dtype = __PACKAGE__->_prepare_default_dtype($arg1, $dtype);
-            # create dns array with provided dtype. ctx is not passed since copy across
-            # ctx requires dtype to be the same
-            my $dns = __PACKAGE__->array($arg1, dtype => $dtype);
-            if(defined $ctx and $dns->context ne $ctx)
-            {
-                $dns = $dns->as_in_context($ctx);
-            }
-            __PACKAGE__->_check_shape($dns->shape, $shape);
-            return $dns->tostype('csr');
-        }
-    }
-}
-
-# Create an AI::MXNet::NDArray::CSR based on data, indices and indptr
-method _csr_matrix_from_definition(
-    $data, $indices, $indptr,
-    Maybe[Shape|PDL] :$shape=,
-    AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx,
-    Maybe[Dtype] :$dtype=,
-    Maybe[Dtype] :$indices_type=STORAGE_AUX_TYPES->{'csr'}[0],
-    Maybe[Dtype] :$indptr_type=STORAGE_AUX_TYPES->{'csr'}[1]
-)
-{
-    $dtype = __PACKAGE__->_prepare_default_dtype($data, $dtype);
-    # prepare src array and types
-    $data = __PACKAGE__->_prepare_src_array($data, $dtype);
-    $indptr = __PACKAGE__->_prepare_src_array($indptr, $indptr_type);
-    $indices = __PACKAGE__->_prepare_src_array($indices, $indices_type);
-
-    if(not (blessed $data and $data->isa('AI::MXNet::NDArray')))
-    {
-        $data = __PACKAGE__->array($data, ctx => $ctx, dtype => $dtype);
-    }
-    if(not (blessed $indptr and $indptr->isa('AI::MXNet::NDArray')))
-    {
-        $indptr = __PACKAGE__->array($indptr, ctx => $ctx, dtype => $indptr_type);
-    }
-    if(not (blessed $indices and $indices->isa('AI::MXNet::NDArray')))
-    {
-        $indices = __PACKAGE__->array($indices, ctx => $ctx, dtype => $indices_type);
-    }
-    if(not defined $shape)
-    {
-        if($indices->shape->[0] == 0)
-        {
-            confess('invalid shape');
-        }
-        $shape = [@{ $indptr } - 1, $indices->max->asscalar + 1];
-    }
-    elsif(blessed $shape)
-    {
-        $shape = $shape->unpdl;
-    }
-    # verify shapes
-    my $aux_shapes = [$indptr->shape, $indices->shape];
-    if($data->ndim != 1 or $indptr->ndim != 1 or $indices->ndim != 1 or $indptr->shape->[0] == 0 or @{ $shape } != 2)
-    {
-        confess('invalid shape');
-    }
-    my $hdl = __PACKAGE__->_new_alloc_handle(
-        'csr', $shape, $ctx, 0, $dtype,
-        [$indptr_type, $indices_type], $aux_shapes
-    );
-    my $result = AI::MXNet::NDArray::CSR->new(handle => $hdl);
-    check_call(AI::MXNetCAPI::NDArraySyncCopyFromNDArray($result->handle, $data->handle, -1));
-    check_call(AI::MXNetCAPI::NDArraySyncCopyFromNDArray($result->handle, $indptr->handle, 0));
-    check_call(AI::MXNetCAPI::NDArraySyncCopyFromNDArray($result->handle, $indices->handle, 1));
-    return $result;
-}
-
-=head2 row_sparse_array
-
-    Creates an AI::MXNet::NDArray::RowSparse, a multidimensional row sparse array with a set of
-    tensor slices at given indices.
-
-    The AI::MXNet::NDArray::RowSparse can be instantiated in several ways:
-
-    - row_sparse_array($arg1, Maybe[AI::MXNet::Context] :$ctx=, Maybe[Shape] :$shape=, Maybe[Dtype] :$dtype=)
-      $ctx, $shape, $dtype are optional;
-      $arg1 can be given in the following variants:
-
-    - to construct an AI::MXNet::NDArray::RowSparse with a dense array $arg1
-        - $arg1 is in AI::MXNet::NDArray::array input format
-
-    - to construct an AI::MXNet::NDArray::RowSparse with a sparse array $arg1
-        - $arg1 is AI::MXNet::NDArray::RowSparse
-
-    - to construct an empty AI::MXNet::NDArray::RowSparse with shape $arg1 = [$D0, $D1, ... $DN]
-
-    - to construct an AI::MXNet::NDArray::RowSparse based on the definition of row sparse format
-      using two separate arrays,
-      where the $indices stores the indices of the row slices with non-zeros,
-      while the values are stored in $data. The corresponding NDArray dense
-      represented by RowSparse rsp has
-      dense[rsp.indices[i], :, :, :, ...] = rsp.data[i, :, :, :, ...]
-      The row indices are expected to be **sorted in ascending order**.
-      In this case $arg1 = [$data, $indices];
-      $data, $indices must be given in the AI::MXNet::NDArray::array input format
-
-    Returns
-    -------
-    AI::MXNet::NDArray::RowSparse
-        An AI::MXNet::NDArray::RowSparse with the 'row_sparse' storage representation.
-
-    Example
-    -------
-    >>> $a = mx->nd->sparse->row_sparse_array([[[1, 2], [3, 4]], [1, 4]], shape => [6, 2])
-    >>> $a->aspdl
-    [[ 0 0]
-     [ 1 2]
-     [ 0 0]
-     [ 0 0]
-     [ 3 4]
-     [ 0 0]]
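-
-    Note (an illustrative addition): a two-element array ref is read as a
-    shape when both elements are plain scalars, and as [$data, $indices]
-    otherwise:
-
-    >>> mx->nd->sparse->row_sparse_array([3, 2])            # empty 3x2 array
-    >>> mx->nd->sparse->row_sparse_array([[[1, 1]], [1]])   # one non-zero row slice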
-=cut
-
-method row_sparse_array(
-    $arg1,
-    Maybe[Shape] :$shape=,
-    Maybe[AI::MXNet::Context] :$ctx=AI::MXNet::Context->current_ctx,
-    Maybe[Dtype] :$dtype=
-)
-{
-    if(not defined $arg1)
-    {
-        return __PACKAGE__->empty('row_sparse', $shape, ctx => $ctx, (defined $dtype ? (dtype => $dtype) : ()));
-    }
-    # construct a row sparse array from (D0, D1 ..) or (data, indices)
-    if(ref $arg1 eq 'ARRAY')
-    {
-        my $arg_len = @{ $arg1 };
-        if($arg_len < 2)
-        {
-            confess("Unexpected length of input array: $arg_len");
-        }
-        elsif($arg_len > 2)
-        {
-            # empty ndarray with shape
-            __PACKAGE__->_check_shape($arg1, $shape);
-            return __PACKAGE__->empty('row_sparse', $arg1, ctx => $ctx, dtype => $dtype);
-        }
-        else
-        {
-            # len(arg1) == 2: either a shape or (data, indices)
-            if(not ref $arg1->[0] and not ref $arg1->[1])
-            {
-                # empty ndarray with shape
-                __PACKAGE__->_check_shape($arg1, $shape);
-                return __PACKAGE__->empty('row_sparse', $arg1, ctx => $ctx, dtype => $dtype);
-            }
-            else
-            {
-                # data, indices
-                return __PACKAGE__->_row_sparse_ndarray_from_definition(
-                    @{ $arg1 }, shape => $shape, ctx => $ctx, dtype => $dtype
-                );
-            }
-        }
-    }
-    else
-    {
-        # construct a row sparse ndarray from a dense / sparse array
-        if(blessed $arg1 and $arg1->isa('AI::MXNet::NDArray::RowSparse'))
-        {
-            # construct a row sparse ndarray from another AI::MXNet::NDArray::RowSparse
-            __PACKAGE__->_check_shape($arg1->shape, $shape);
-            return __PACKAGE__->array($arg1, ctx => $ctx, dtype => $dtype);
-        }
-        elsif(blessed $arg1 and $arg1->isa('AI::MXNet::NDArray::CSR'))
-        {
-            confess("Unexpected input type: AI::MXNet::NDArray::CSR");
-        }
-        else
-        {
-            # construct a row sparse array from a dense one
-            # prepare default dtype since mx.nd.array doesn't use default values
-            # based on source_array
-            $dtype = __PACKAGE__->_prepare_default_dtype($arg1, $dtype);
-            # create dns array with provided dtype. ctx is not passed since copy across
-            # ctx requires dtype to be the same
-            my $dns = __PACKAGE__->array($arg1, dtype => $dtype);
-            if(defined $ctx and $dns->context ne $ctx)
-            {
-                $dns = $dns->as_in_context($ctx);
-            }
-            __PACKAGE__->_check_shape($dns->shape, $shape);
-            return $dns->tostype('row_sparse');
-        }
-    }
-}
-
-# Create an AI::MXNet::NDArray::RowSparse based on data and indices
-method _row_sparse_ndarray_from_definition(
-    $data, $indices,
-    Maybe[Shape] :$shape=,
-    AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx,
-    Maybe[Dtype] :$dtype=,
-    Maybe[Dtype] :$indices_type=STORAGE_AUX_TYPES->{'row_sparse'}[0]
-)
-{
-    $dtype = __PACKAGE__->_prepare_default_dtype($data, $dtype);
-    # prepare src array and types
-    $data = __PACKAGE__->_prepare_src_array($data, $dtype);
-    $indices = __PACKAGE__->_prepare_src_array($indices, $indices_type);
-
-    if(not (blessed $data and $data->isa('AI::MXNet::NDArray')))
-    {
-        $data = __PACKAGE__->array($data, ctx => $ctx, dtype => $dtype);
-    }
-    if(not (blessed $indices and $indices->isa('AI::MXNet::NDArray')))
-    {
-        $indices = __PACKAGE__->array($indices, ctx => $ctx, dtype => $indices_type);
-    }
-    if(not defined $shape)
-    {
-        my $num_indices = $indices->shape->[0];
-        if($num_indices == 0)
-        {
-            confess('invalid shape');
-        }
-        my $dim0 = $indices->at($num_indices - 1)->asscalar + 1;
-        $shape = [$dim0, @{ $data->shape }[1 .. @{ $data->shape } - 1]];
-    }
-    # verify shapes
-    if($data->ndim != @{ $shape } or $indices->ndim != 1 or product(@{ $shape }[1 .. @{ $shape } - 1]) == 0)
-    {
-        confess("invalid shape");
-    }
-    my $handle = __PACKAGE__->_new_alloc_handle(
-        'row_sparse', $shape, $ctx, 0, $dtype,
-        [$indices_type], [$indices->shape]
-    );
-    my $result = AI::MXNet::NDArray::RowSparse->new(handle => $handle);
-    check_call(AI::MXNetCAPI::NDArraySyncCopyFromNDArray($result->handle, $data->handle, -1));
-    check_call(AI::MXNetCAPI::NDArraySyncCopyFromNDArray($result->handle, $indices->handle, 0));
-    return $result;
-}
-
-=head2 zeros
-
-    Return a new array of given shape and type, filled with zeros.
-
-    Parameters
-    ----------
-    $stype: string
-        The storage type of the empty array, such as 'row_sparse', 'csr', etc.
-    $shape : int or array ref of int
-        The shape of the empty array
-    :$ctx : AI::MXNet::Context, optional
-        An optional device context (default is the current default context)
-    :$dtype : Dtype, optional
-        An optional value type (default is `float32`)
-
-    Returns
-    -------
-    AI::MXNet::NDArray::RowSparse or AI::MXNet::NDArray::CSR
-        A created array
-
-    Examples
-    --------
-    >>> mx->nd->sparse->zeros('csr', [1,2])
-
-    >>> mx->nd->sparse->zeros('row_sparse', [1,2], ctx=>mx->cpu(), dtype=>'float16')->aspdl
-    [[ 0 0]]
-=cut
-
-method zeros(
-    Stype $stype,
-    Shape $shape,
-    AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx,
-    Maybe[Dtype] :$dtype='float32',
-    Maybe[AI::MXNet::NDArray] :$out=,
-    Maybe[Str] :$name=,
-    Maybe[Str] :$__layout__=
-)
-{
-    if($stype eq 'default')
-    {
-        return AI::MXNet::NDArray->zeros(
-            $shape, ctx => $ctx, dtype => $dtype, out => $out, name => $name, __layout__ => $__layout__
-        );
-    }
-    my $aux_types;
-    if($stype eq 'row_sparse' or $stype eq 'csr')
-    {
-        $aux_types = STORAGE_AUX_TYPES->{ $stype };
-    }
-    else
-    {
-        confess("unknown storage type: $stype");
-    }
-    $out //= __PACKAGE__->_ndarray_cls(
-        __PACKAGE__->_new_alloc_handle(
-            $stype, $shape, $ctx, 1, $dtype, $aux_types)
-    );
-    return __PACKAGE__->_zeros(
-        shape => $shape, ctx => $ctx, dtype => $dtype, out => $out,
-        ($__layout__ ? (__layout__ => $__layout__) : ())
-    );
-}
-
-=head2 empty
-
-    Returns a new array of given shape and type, without initializing entries.
-
-    Parameters
-    ----------
-    $stype: string
-        The storage type of the empty array, such as 'row_sparse', 'csr', etc.
-    $shape : int or array ref of int
-        The shape of the empty array.
-    :$ctx : AI::MXNet::Context, optional
-        An optional device context (default is the current default context).
-    :$dtype : Dtype, optional
-        An optional value type (default is `float32`).
-
-    Returns
-    -------
-    AI::MXNet::NDArray::CSR or AI::MXNet::NDArray::RowSparse
-        A created array.
-=cut
-
-method empty(
-    Stype $stype,
-    Shape $shape,
-    Maybe[AI::MXNet::Context] :$ctx=AI::MXNet::Context->current_ctx,
-    Maybe[Dtype] :$dtype='float32'
-)
-{
-    assert(defined $stype);
-    return __PACKAGE__->zeros($stype, $shape, ctx => $ctx, dtype => $dtype);
-}
-
-=head2 array
-
-    Creates a sparse array from any object exposing the array interface.
-
-    Parameters
-    ----------
-    $source_array : AI::MXNet::NDArray::RowSparse, AI::MXNet::NDArray::CSR or PDL::CCS::Nd
-        The source sparse array
-    :$ctx : Context, optional
-        The default context is $source_array->context if $source_array is an NDArray.
-        The current default context otherwise.
-    :$dtype : Dtype, optional
-        The data type of the output array. The default dtype is $source_array->dtype
-        if $source_array is an AI::MXNet::NDArray, PDL or PDL::CCS::Nd,
-        'float32' otherwise.
-
-    Returns
-    -------
-    AI::MXNet::NDArray::RowSparse or AI::MXNet::NDArray::CSR
-        An array with the same contents as the $source_array.
-
-    Examples
-    --------
-    >>> use PDL; use PDL::CCS::Nd;
-    >>> $csr = zeros([100, 2])->tocsr
-    >>> mx->nd->sparse->array($csr)
-
-    >>> mx->nd->sparse->array(mx->nd->sparse->zeros('csr', [3, 2]))
-
-    >>> mx->nd->sparse->array(mx->nd->sparse->zeros('row_sparse', [3, 2]))
-
-=cut
-
-method array(
-    AcceptableInput $source_array,
-    Maybe[AI::MXNet::Context] :$ctx=AI::MXNet::Context->current_ctx,
-    Maybe[Dtype] :$dtype='float32',
-    Maybe[PDL] :$pdl=
-)
-{
-    if(not blessed $source_array or $source_array->isa('PDL') or ($source_array->isa('AI::MXNet::NDArray') and $source_array->stype eq 'default'))
-    {
-        if(not ref $source_array)
-        {
-            $pdl .= $source_array;
-            $source_array = $pdl;
-        }
-        return __PACKAGE__->SUPER::array($source_array, ctx => $ctx, dtype => $dtype);
-    }
-    if($source_array->isa('AI::MXNet::NDArray'))
-    {
-        assert(
-            ($source_array->stype ne 'default'),
-            "Please use tostype to create AI::MXNet::NDArray::RowSparse or AI::MXNet::NDArray::CSR from an AI::MXNet::NDArray"
-        );
-        # prepare dtype and ctx based on source_array, if not provided
-        $dtype = __PACKAGE__->_prepare_default_dtype($source_array, $dtype);
-        # if both dtype and ctx are different from source_array, we cannot copy directly
-        my $arr;
-        if($source_array->dtype ne $dtype and $source_array->context ne $ctx)
-        {
-            $arr = __PACKAGE__->empty($source_array->stype, $source_array->shape, dtype => $dtype);
-            $arr .= $source_array;
-            $arr = $arr->as_in_context($ctx);
-        }
-        else
-        {
-            $arr = __PACKAGE__->empty($source_array->stype, $source_array->shape, dtype => $dtype, ctx => $ctx);
-            $arr .= $source_array;
-        }
-        return $arr;
-    }
-    elsif($source_array->isa('PDL::CCS::Nd'))
-    {
-        $dtype = __PACKAGE__->_prepare_default_dtype($source_array, $dtype);
-        return __PACKAGE__->csr_matrix(
-            [$source_array->data, $source_array->indices, $source_array->indptr],
-            shape => $source_array->shape, dtype => $dtype, ctx => $ctx
-        );
-    }
-}
-
-sub AUTOLOAD {
-    my $sub = $AI::MXNet::NDArray::Sparse::AUTOLOAD;
-    $sub =~ s/.*:://;
-    $sub = "_sparse_$sub";
-    shift;
-    return AI::MXNet::NDArray->$sub(@_);
-}
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NS.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NS.pm
deleted file mode 100644
index 03cd5f195182..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/NS.pm
+++ /dev/null
@@ -1,78 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::NS;
-# this class is similar to Exporter, in that it will add an "import"
-# method to the calling package. It is to allow a package to emulate
-# the python "import mxnet as mx" style aliasing as "use AI::MXNet 'mx'"
-use strict;
-use warnings;
-
-sub _sym : lvalue
-{
-    my ($pkg, $name) = @_;
-    no strict 'refs';
-    *{"$pkg\::$name"};
-}
-
-sub import
-{
-    my (undef, $opt) = @_;
-    my $class = caller();
-    my $func = sub { $class };
-    _sym($class, 'import') = sub {
-        my (undef, @names) = @_;
-        @names = map { s/[^\w:]//sgr } @names;
-        my $target = caller();
-
-        _sym($names[0], '') = _sym($class, '') if
-            @names == 1 and $opt and $opt eq 'global';
-
-        _sym($target, $_) = $func for @names;
-    };
-}
-
-my $autoload_template = q(
-    sub AUTOLOAD
-    {
-        our ($AUTOLOAD, %AUTOLOAD);
-        my $name = $AUTOLOAD =~ s/.*:://sr;
-        my $func = $AUTOLOAD{$name};
-        Carp::carp(qq(Can't locate object method "$name" via package "${\ __PACKAGE__ }"))
-            unless $func;
-        goto $func;
-    }
-);
-
-# using AUTOLOAD here allows for the addition of an AI::MXNet::SomeClass
-# class to coexist with an AI::MXNet->SomeClass() shorthand constructor.
-sub register
-{
-    my ($class, $target) = @_;
-    my $name = $class =~ s/.*:://sr;
-    my $dest = $class->can('new');
-    ${_sym($target, 'AUTOLOAD')}{$name} = sub {
-        splice @_, 0, 1, $class;
-        goto $dest;
-    };
-    return if $target->can('AUTOLOAD');
-    eval sprintf 'package %s { %s }', $target, $autoload_template;
-    die if $@;
-    return;
-}
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm
deleted file mode 100644
index e4d8b5abde0b..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm
+++ /dev/null
@@ -1,1966 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::Optimizer;
-use strict;
-use warnings;
-use AI::MXNet::NS;
-use AI::MXNet::Base;
-use AI::MXNet::NDArray;
-use AI::MXNet::Random;
-use List::Util qw(max);
-
-=head1 NAME
-
-    AI::MXNet::Optimizer - Common Optimization algorithms with regularizations.
-
-=head1 DESCRIPTION
-
-    Common Optimization algorithms with regularizations.
-=cut
-
-use Mouse;
-use AI::MXNet::Function::Parameters;
-my %opt_registry;
-method get_opt_registry()
-{
-    return \%opt_registry;
-}
-
-method register()
-{
-    my $name = $self;
-    ($name) = $name =~ /::(\w+)$/;
-    { no strict 'refs'; *{__PACKAGE__."::$name"} = sub { shift; $self->new(@_) }; }
-    $name = lc $name;
-    if(exists $opt_registry{ $name })
-    {
-        my $existing = $opt_registry{ $name };
-        warn(
-            "WARNING: New optimizer $self.$name "
-            ."is overriding existing optimizer $existing.$name"
-        );
-    }
-    $opt_registry{ $name } = $self;
-}
-
-=head2 create_optimizer
-
-    Creates an optimizer with the specified name.
-
-    Parameters
-    ----------
-    $name: Str
-        Name of the required optimizer. Should be the name
-        of a subclass of Optimizer. Case insensitive.
-
-    :$rescale_grad : Num
-        Rescaling factor on gradient. Normally should be 1/batch_size.
-
-    %kwargs: Hash
-        Parameters for the optimizer
-
-    Returns
-    -------
-    opt : Optimizer
-        The resulting optimizer.
-=cut
-
-method create_optimizer(Str $name, %kwargs)
-{
-    if(exists $opt_registry{ lc $name })
-    {
-        my $rescale_grad = delete($kwargs{rescale_grad})//1;
-        return $opt_registry{ lc $name }->new(
-            rescale_grad => $rescale_grad,
-            %kwargs
-        );
-    }
-    confess("Cannot find optimizer $name");
-}
-
-*create = \&create_optimizer;
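-
-# Usage sketch (an illustrative addition, not from the original file):
-# create a registered optimizer by name and drive it by hand. 'sgd' is the
-# AI::MXNet::SGD subclass registered below; $weight and $grad stand for
-# NDArrays supplied by the caller.
-#
-#   my $opt = AI::MXNet::Optimizer->create_optimizer(
-#       'sgd', learning_rate => 0.1, momentum => 0.9, rescale_grad => 1/128
-#   );
-#   my $state = $opt->create_state(0, $weight);   # per-parameter state
-#   $opt->update(0, $weight, $grad, $state);      # one optimization step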
-
-has 'rescale_grad' => (is => "rw", isa => "Num", default => 1);
-has 'lr' => (is => "rw", isa => "Num");
-has 'learning_rate' => (is => "rw", isa => "Num", default => 0.01);
-has 'lr_scheduler' => (is => "rw", isa => "Maybe[AI::MXNet::LRScheduler]");
-has 'wd' => (is => "rw", isa => "Num", default => 0);
-has 'lr_mult' => (is => "rw", isa => "HashRef", default => sub { +{} });
-has 'wd_mult' => (is => "rw", isa => "HashRef", default => sub { +{} });
-has 'num_update' => (is => "rw", isa => "Int");
-has 'begin_num_update' => (is => "rw", isa => "Int", default => 0);
-has '_index_update_count' => (is => "rw", isa => "HashRef", default => sub { +{} });
-has 'clip_gradient' => (is => "rw", isa => "Maybe[Num]");
-has 'param_idx2name' => (is => "rw", isa => "HashRef[Str]", default => sub { +{} });
-has 'idx2name' => (is => "rw", isa => "HashRef[Str]");
-has 'sym' => (is => "rw", isa => "Maybe[AI::MXNet::Symbol]");
-has 'param_dict' => (is => "rw", isa => "HashRef", default => sub { +{} });
-
-sub BUILD
-{
-    my $self = shift;
-    if($self->lr_scheduler)
-    {
-        $self->lr_scheduler->base_lr($self->learning_rate);
-    }
-    $self->lr($self->learning_rate);
-    $self->num_update($self->begin_num_update);
-    $self->idx2name({ %{ $self->param_idx2name } });
-    $self->set_lr_mult({});
-    $self->set_wd_mult({});
-}
-
-# Create additional optimizer state such as momentum.
-# Override in implementations.
-method create_state($index, $weight){}
-
-# Update the parameters. Override in implementations.
-method update($index, $weight, $grad, $state){}
-
-# set lr scale is deprecated. Use set_lr_mult instead.
-method set_lr_scale($args_lrscale)
-{
-    Carp::cluck("set lr scale is deprecated. Use set_lr_mult instead.");
-}
-
-=head2 set_lr_mult
-
-    Sets an individual learning rate multiplier for each parameter.
-
-    Parameters
-    ----------
-    args_lr_mult : hash ref of string/int to float
-        Sets the lr multiplier for name/index to float.
-        Setting the multiplier by index is supported for backward compatibility,
-        but we recommend using name and symbol.
-=cut
-
-method set_lr_mult(HashRef[Num] $args_lr_mult)
-{
-    $self->lr_mult({});
-    if($self->sym)
-    {
-        my $attr = $self->sym->attr_dict();
-        for my $name (@{ $self->sym->list_arguments() })
-        {
-            if(exists $attr->{ $name } and exists $attr->{ $name }{ __lr_mult__ })
-            {
-                $self->lr_mult->{ $name } = $attr->{ $name }{ __lr_mult__ };
-            }
-        }
-    }
-    $self->lr_mult({ %{ $self->lr_mult }, %{ $args_lr_mult } });
-}
-
-=head2 set_wd_mult
-
-    Sets an individual weight decay multiplier for each parameter.
-    By default the wd multiplier is 0 for all params whose name doesn't
-    end with _weight, if param_idx2name is provided.
-
-    Parameters
-    ----------
-    args_wd_mult : hash ref of string/int to float
-        Sets the wd multiplier for name/index to float.
-        Setting the multiplier by index is supported for backward compatibility,
-        but we recommend using name and symbol.
-=cut
-
-method set_wd_mult(HashRef[Num] $args_wd_mult)
-{
-    $self->wd_mult({});
-    for my $n (values %{ $self->idx2name })
-    {
-        if(not $n =~ /(?:_weight|_gamma)$/)
-        {
-            $self->wd_mult->{ $n } = 0;
-        }
-    }
-    if($self->sym)
-    {
-        my $attr = $self->sym->attr_dict();
-        for my $name (@{ $self->sym->list_arguments() })
-        {
-            if(exists $attr->{ $name } and exists $attr->{ $name }{ __wd_mult__ })
-            {
-                $self->wd_mult->{ $name } = $attr->{ $name }{ __wd_mult__ };
-            }
-        }
-    }
-    $self->wd_mult({ %{ $self->wd_mult }, %{ $args_wd_mult } });
-}
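-
-# Example (an illustrative sketch): per-parameter multipliers are keyed by
-# parameter name; the effective lr and wd are scaled by them in _get_lr and
-# _get_wd below. fc1_weight/fc1_bias are hypothetical parameter names.
-#
-#   $opt->set_lr_mult({ fc1_weight => 0.1 });   # slow this parameter down
-#   $opt->set_wd_mult({ fc1_bias   => 0 });     # no weight decay on the bias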
-
-method _update_count(Index $index)
-{
-    if(not exists $self->_index_update_count->{ $index })
-    {
-        $self->_index_update_count->{ $index } = $self->begin_num_update;
-    }
-    $self->_index_update_count->{ $index } += 1;
-    $self->num_update(max($self->_index_update_count->{ $index }, $self->num_update));
-}
-
-method _get_lr(Index $index)
-{
-    my $lr;
-    if($self->lr_scheduler)
-    {
-        $lr = $self->lr_scheduler->($self->num_update);
-    }
-    else
-    {
-        $lr = $self->lr;
-    }
-
-    if(exists $self->param_dict->{ $index })
-    {
-        $lr *= $self->param_dict->{ $index }->lr_mult;
-    }
-    elsif(exists $self->lr_mult->{ $index })
-    {
-        $lr *= $self->lr_mult->{ $index };
-    }
-    elsif(exists $self->idx2name->{ $index })
-    {
-        $lr *= $self->lr_mult->{ $self->idx2name->{ $index } }//1;
-    }
-    return $lr;
-}
-
-method _get_wd(Index $index)
-{
-    my $wd = $self->wd;
-    if(exists $self->param_dict->{ $index })
-    {
-        $wd *= $self->param_dict->{ $index }->wd_mult;
-    }
-    elsif(exists $self->wd_mult->{ $index })
-    {
-        $wd *= $self->wd_mult->{ $index };
-    }
-    elsif(exists $self->idx2name->{ $index })
-    {
-        $wd *= $self->wd_mult->{ $self->idx2name->{ $index } }//1;
-    }
-    return $wd;
-}
-
-=head1 NAME
-
-    AI::MXNet::SGD - A very simple SGD optimizer with momentum and weight regularization.
-=cut
-
-=head1 DESCRIPTION
-
-    A very simple SGD optimizer with momentum and weight regularization.
-
-    If the storage types of weight and grad are both 'row_sparse', and 'lazy_update' is true,
-    **lazy updates** are applied by::
-
-        for row in grad.indices:
-            rescaled_grad[row] = lr * rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row]
-            state[row] = momentum[row] * state[row] + rescaled_grad[row]
-            weight[row] = weight[row] - state[row]
-
-    The sparse update only updates the momentum for the weights whose row_sparse
-    gradient indices appear in the current batch, rather than updating it for all
-    indices. Compared with the original update, it can provide large
-    improvements in model training throughput for some applications. However, it
-    provides slightly different semantics than the original update, and
-    may lead to different empirical results.
-
-    Otherwise, **standard updates** are applied by::
-
-        rescaled_grad = lr * rescale_grad * clip(grad, clip_gradient) + wd * weight
-        state = momentum * state + rescaled_grad
-        weight = weight - state
-
-    Parameters
-    ----------
-    learning_rate : Num, optional
-        learning rate of SGD
-
-    momentum : Num, optional
-        momentum value
-
-    wd : Num, optional
-        L2 regularization coefficient added to all the weights
-
-    rescale_grad : Num, optional
-        rescaling factor of gradient. Normally should be 1/batch_size.
-
-    clip_gradient : Num, optional
-        clip gradient in range [-clip_gradient, clip_gradient]
-
-    param_idx2name : hash ref of Str/Int to Num, optional
-        special treatment of weight decay for parameters that end with bias, gamma, and beta
-
-    multi_precision: Bool, optional
-        Flag to control the internal precision of the optimizer.
-        False results in using the same precision as the weights (default),
-        True makes an internal 32-bit copy of the weights and applies gradients
-        in 32-bit precision even if actual weights used in the model have lower precision.
-        Turning this on can improve convergence and accuracy when training with float16.
-
-    lazy_update: Bool, optional, default true
-=cut
-
-package AI::MXNet::SGD;
-use Mouse;
-extends 'AI::MXNet::Optimizer';
-
-has 'kwargs' => (is => "rw", isa => "HashRef[Num]");
-has 'momentum' => (is => "rw", isa => "Num", default => 0);
-has 'multi_precision' => (is => "ro", isa => "Bool", default => 0);
-has 'lazy_update' => (is => "ro", isa => "Bool", default => 1);
-
-sub BUILD
-{
-    my $self = shift;
-    $self->kwargs({});
-    if($self->momentum)
-    {
-        $self->kwargs->{momentum} = $self->momentum;
-    }
-    if($self->clip_gradient)
-    {
-        $self->kwargs->{clip_gradient} = $self->clip_gradient;
-    }
-}
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    my $momentum;
-    my $weight_master_copy;
-    my $stype = $self->lazy_update ? $weight->stype : 'default';
-    if($self->multi_precision and $weight->dtype eq 'float16')
-    {
-        $weight_master_copy = AI::MXNet::NDArray->array($weight, ctx => $weight->context, dtype => 'float32');
-        if($self->momentum != 0)
-        {
-            $momentum = AI::MXNet::NDArray->zeros($weight->shape, stype => $stype, ctx => $weight->context, dtype => 'float32');
-        }
-        return [$momentum, $weight_master_copy];
-    }
-    if($weight->dtype eq 'float16' and not $self->multi_precision)
-    {
-        AI::MXNet::Logging->warning(
-            "Accumulating with float16 in optimizer can lead to ".
-            "poor accuracy or slow convergence. ".
-            "Consider using the multi_precision=True option of the ".
-            "SGD optimizer"
-        );
-    }
-    if($self->momentum != 0)
-    {
-        $momentum = AI::MXNet::NDArray->zeros($weight->shape, stype => $stype, ctx => $weight->context, dtype => $weight->dtype);
-    }
-    return $momentum;
-}
- "SGD optimizer" - ); - } - if($self->momentum != 0) - { - $momentum = AI::MXNet::NDArray->zeros($weight->shape, stype => $stype, ctx => $weight->context, dtype => $weight->dtype); - } - return $momentum; -} - -method update( - Index $index, - AI::MXNet::NDArray $weight, - AI::MXNet::NDArray $grad, - Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state -) -{ - $self->_update_count($index); - my $lr = $self->_get_lr($index); - my $wd = $self->_get_wd($index); - my $kwargs = { - out => $weight, - lr => $lr, - wd => $wd, - rescale_grad => $self->rescale_grad, - %{ $self->kwargs } - }; - my $use_multi_precision = ref($state) eq 'ARRAY'; - if(not $use_multi_precision) - { - if(defined $state) - { - AI::MXNet::NDArray->sgd_mom_update( - $weight, $grad, $state, $kwargs - ); - } - else - { - AI::MXNet::NDArray->sgd_update( - $weight, $grad, $kwargs - ); - } - } - else - { - if(defined $state->[0]) - { - AI::MXNet::NDArray->mp_sgd_mom_update( - $weight, $grad, $state->[0], $state->[1], $kwargs - ); - } - else - { - AI::MXNet::NDArray->mp_sgd_update( - $weight, $grad, $state->[1], $kwargs - ); - } - } -} - -__PACKAGE__->register; - -=head1 NAME - - AI::MXNet::Signum - The Signum optimizer that takes the sign of gradient or momentum. -=cut - -=head1 DESCRIPTION - - The optimizer updates the weight by: - - rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight - state = momentum * state + (1-momentum)*rescaled_grad - weight = (1 - lr * wd_lh) * weight - lr * sign(state) - - See the original paper at: https://jeremybernste.in/projects/amazon/signum.pdf - - This optimizer accepts the following parameters in addition to those accepted - by AI::MXNet::Optimizer - - Parameters - ---------- - momentum : Num, optional - The momentum value. - wd_lh : Num, optional - The amount of decoupled weight decay regularization, see details in the original paper at: - https://arxiv.org/abs/1711.05101 -=cut - -package AI::MXNet::Signum; -use Mouse; -extends 'AI::MXNet::Optimizer'; - -has 'momentum' => (is => "rw", isa => "Num", default => 0.9); -has 'wd_lh' => (is => "rw", isa => "Num", default => 0); - -method create_state(Index $index, AI::MXNet::NDArray $weight) -{ - - my $momentum; - if($self->momentum != 0) - { - $momentum = AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context, - dtype=>$weight->dtype, - stype=>$weight->stype - ); - } - return $momentum; -} - -method update( - Index $index, - AI::MXNet::NDArray $weight, - AI::MXNet::NDArray $grad, - Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state -) -{ - $self->_update_count($index); - my $lr = $self->_get_lr($index); - my $wd = $self->_get_wd($index); - my %kwargs = ( - out => $weight, - lr => $lr, - wd => $wd, - rescale_grad => $self->rescale_grad, - ); - if($self->momentum > 0) - { - $kwargs{momentum} = $self->momentum; - } - if($self->clip_gradient) - { - $kwargs{clip_gradient} = $self->clip_gradient; - } - if($self->wd_lh) - { - $kwargs{wd_lh} = $self->wd_lh; - } - if(defined $state) - { - AI::MXNet::NDArray->signum_update( - $weight, $grad, $state, %kwargs - ); - } - else - { - AI::MXNet::NDArray->signsgd_update( - $weight, $grad, %kwargs - ); - } -} - -__PACKAGE__->register; - -=head1 NAME - - AI::MXNet::FTML - The FTML optimizer. -=cut - -=head1 DESCRIPTION - - This class implements the optimizer described in - *FTML - Follow the Moving Leader in Deep Learning*, - available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf. 
-
-    This optimizer accepts the following parameters in addition to those accepted
-    by AI::MXNet::Optimizer.
-
-    Parameters
-    ----------
-    beta1 : Num, optional
-        0 < beta1 < 1. Generally close to 0.5.
-    beta2 : Num, optional
-        0 < beta2 < 1. Generally close to 1.
-    epsilon : Num, optional
-        Small value to avoid division by 0.
-=cut
-
-package AI::MXNet::FTML;
-use Mouse;
-extends 'AI::MXNet::Optimizer';
-
-has 'beta1' => (is => "rw", isa => "Num", default => 0.6);
-has 'beta2' => (is => "rw", isa => "Num", default => 0.999);
-has 'epsilon' => (is => "rw", isa => "Num", default => 1e-8);
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    return [
-        AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype => $weight->dtype), # d_0
-        AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype => $weight->dtype), # v_0
-        AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype => $weight->dtype), # z_0
-    ];
-}
-
-method update(
-    Index $index,
-    AI::MXNet::NDArray $weight,
-    AI::MXNet::NDArray $grad,
-    Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state
-)
-{
-    my $lr = $self->_get_lr($index);
-    my $wd = $self->_get_wd($index);
-    my $t = $self->_update_count($index);
-    my %kwargs = (
-        out => $weight,
-        lr => $lr,
-        wd => $wd,
-        t => $t,
-        beta1 => $self->beta1,
-        beta2 => $self->beta2,
-        epsilon => $self->epsilon,
-        rescale_grad => $self->rescale_grad
-    );
-    if($self->clip_gradient)
-    {
-        $kwargs{clip_grad} = $self->clip_gradient;
-    }
-    AI::MXNet::NDArray->ftml_update($weight, $grad, @{ $state }, \%kwargs);
-}
-
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::LBSGD - The Large Batch SGD optimizer with momentum and weight decay.
-=cut
-
-=head1 DESCRIPTION
-
-    The optimizer updates the weight by::
-
-        state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight
-        weight = weight - state
-
-    Parameters
-    ----------
-    momentum : Num, optional
-        The momentum value.
-    multi_precision: Bool, optional
-        Flag to control the internal precision of the optimizer.
-        0 results in using the same precision as the weights (default),
-        1 makes an internal 32-bit copy of the weights and applies gradients
-        in 32-bit precision even if actual weights used in the model have lower precision.
-        Turning this on can improve convergence and accuracy when training with float16.
-    warmup_strategy: string ('linear', 'power2', 'sqrt' or 'lars'; default: 'linear')
-    warmup_epochs: unsigned, default: 5
-    batch_scale: unsigned, default: 1 (same as batch_size * num_workers)
-    updates_per_epoch: updates per epoch (default: 32; the default might not reflect the true
-        number of batches per epoch. Used for warmup.)
-    begin_epoch: unsigned, default 0, starting epoch.
-=cut
-
-package AI::MXNet::LBSGD;
-use Mouse;
-extends 'AI::MXNet::Optimizer';
-
-has 'momentum' => (is => 'rw', isa => 'Num', default => 0);
-has 'multi_precision' => (is => 'rw', isa => 'Bool', default => 0);
-has 'warmup_strategy' => (is => 'rw', isa => 'Str', default => 'linear');
-has 'warmup_epochs' => (is => 'rw', isa => 'Int', default => 5);
-has 'batch_scale' => (is => 'rw', isa => 'Num', default => 1);
-has 'updates_per_epoch' => (is => 'rw', isa => 'Int', default => 32);
-has 'begin_epoch' => (is => 'rw', isa => 'Int', default => 0);
-has 'num_epochs' => (is => 'rw', isa => 'Int', default => 60);
-has 'beta2' => (is => 'rw', isa => 'Num', default => 0.999);
-has 'epsilon' => (is => 'rw', isa => 'Num', default => 1e-8);
-has [qw/lbmult
-        cumgrads
-        adaptive
-        init_updates
-        admult/] => (is => 'rw', init_arg => undef);
-
-sub BUILD
-{
-    my $self = shift;
-    AI::MXNet::Logging->info('Running Large-Batch SGD Algorithm');
-    AI::MXNet::Logging->info(
-        '(Batch_scale=%f, warmup_epochs=%d, warmup_strategy=%s, updates_per_epoch=%d)',
-        map { $self->$_ } qw/batch_scale warmup_epochs warmup_strategy updates_per_epoch/
-    );
-    $self->init_updates($self->begin_epoch * $self->updates_per_epoch);
-    $self->lbmult(1);
-    $self->cumgrads({});
-    $self->adaptive(0);
-    $self->admult(1);
-}
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    my $momentum;
-    my $weight_master_copy;
-    if($self->multi_precision and $weight->dtype eq 'float16')
-    {
-        $weight_master_copy = AI::MXNet::NDArray->array($weight, ctx => $weight->context, dtype => 'float32');
-        if($self->momentum != 0)
-        {
-            $momentum = AI::MXNet::NDArray->zeros(
-                $weight->shape, ctx => $weight->context, dtype => 'float32',
-                stype => $weight->stype
-            );
-        }
-        return [$momentum, $weight_master_copy];
-    }
-    if($weight->dtype eq 'float16' and not $self->multi_precision)
-    {
-        AI::MXNet::Logging->warning(
-            "Accumulating with float16 in optimizer can lead to "
-            ."poor accuracy or slow convergence. "
-            ."Consider using the multi_precision=True option of the "
-            ."LBSGD optimizer"
-        );
-    }
-    if($self->momentum != 0)
-    {
-        $momentum = AI::MXNet::NDArray->zeros(
-            $weight->shape, ctx => $weight->context, dtype => $weight->dtype,
-            stype => $weight->stype
-        );
-    }
-    return $momentum;
-}
" - ."Consider using multi_precision=True option of the " - ."LBSGD optimizer" - ); - } - if($self->momentum != 0) - { - $momentum = AI::MXNet::NDArray->zeros( - $weight->shape, ctx => $weight->context, dtype => $weight->dtype, - stype => $weight->stype - ); - } - return $momentum; -} - -method _get_lbmult($nup) -{ - my $nwup = $self->warmup_epochs * $self->updates_per_epoch; - my $strategy = $self->warmup_strategy; - my $maxmult = $self->batch_scale; - my $mult; - if($nup >= $nwup) - { - $mult = $maxmult; - } - elsif($nwup <= 1) - { - $mult = 1; - } - else - { - if ($strategy eq 'linear') - { - $mult = 1 + ($maxmult - 1) * $nup / $nwup; - } - elsif($strategy eq 'power2') - { - $mult = 1 + ($maxmult-1) * ($nup*$nup)/($nwup*$nwup); - } - elsif($strategy eq 'sqrt') - { - $mult = 1 + ($maxmult - 1) * sqrt($nup / $nwup); - } - else - { - $mult = 1; - } - } - return $mult; -} - - -method _get_lars($weight, $g, $wd) -{ - my $weight2 = $self->_l2norm($weight); - my $grad2 = $self->_l2norm($g); - my $lars = sqrt($weight2 / ($grad2 + $wd * $weight2 + 1e-18)); - if($lars < 0.01) - { - $lars = 0.01; - } - elsif($lars > 100) - { - $lars = 100; - } - return $lars; -} - -method _l2norm($v) -{ - my $norm = AI::MXNet::NDArray->multiply($v, $v)->aspdl->sum; - return $norm; -} - -method _reset_cum_gradient($index) -{ - $self->cumgrads->{$index}{cum_grad} = 0; -} - -method _get_cum_gradient($index) -{ - if(exists $self->cumgrads->{$index}) - { - return $self->cumgrads->{$index}; - } - else - { - return {} - } -} - -method _put_cum_gradient($index, $cgrad) -{ - $self->cumgrads->{$index} = $cgrad; -} - -method _cumulate_gradient($grad, $index) -{ - my $cgrad = $self->_get_cum_gradient($index); - my ($num_cums, $cum_grad); - if(%{ $cgrad }) - { - my $num_cums = $cgrad->{num_cums}; - if($num_cums > 0) - { - $cum_grad = $cgrad->{cum_grad} + $grad; - $num_cums += 1; - } - else - { - $cum_grad = $grad; - $num_cums = $self->init_updates + 1; - } - } - else - { - $cum_grad = $grad; - $num_cums = $self->init_updates + 1; - } - $cgrad = {cum_grad => $cum_grad, num_cums => $num_cums}; - $self->_put_cum_gradient($index, $cgrad); - return $cgrad; -} - - - -method update( - Index $index, - AI::MXNet::NDArray $weight, - AI::MXNet::NDArray $grad, - Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state -) -{ - my $lr = $self->_get_lr($index); - my $wd = $self->_get_wd($index); - my $t = $self->_update_count($index); - my $cgrad = $self->_cumulate_gradient($grad, $index); - if(($cgrad->{num_cums} % $self->batch_scale) == 0) - { - my $lbmult; - $grad = $cgrad->{cum_grad} / $self->batch_scale; - if($self->warmup_strategy eq 'lars') - { - $lbmult = $self->_get_lars($weight, $grad, $wd); - } - else - { - $lbmult = $self->_get_lbmult($cgrad->{num_cums}); - } - $lr = $lr * $lbmult; - my %kwargs = ( - out => $weight, - lr => $lr, - wd => $wd, - rescale_grad => $self->rescale_grad - ); - if($self->clip_gradient) - { - $kwargs{clip_gradient} = $self->clip_gradient; - } - if($self->momentum > 0) - { - $kwargs{momentum} = $self->momentum; - } - my $use_multi_precision = ref($state) eq 'ARRAY'; - if(not $use_multi_precision) - { - if(defined $state) - { - AI::MXNet::NDArray->sgd_mom_update($weight, $grad, $state, %kwargs); - } - else - { - AI::MXNet::NDArray->sgd_update($weight, $grad, %kwargs); - } - } - else - { - if(defined $state->[0]) - { - AI::MXNet::NDArray->mp_sgd_mom_update($weight, $grad, @{ $state }, %kwargs); - } - else - { - AI::MXNet::NDArray->mp_sgd_update($weight, $grad, $state->[1], %kwargs); - } - } - 
-
-method update(
-    Index $index,
-    AI::MXNet::NDArray $weight,
-    AI::MXNet::NDArray $grad,
-    Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state
-)
-{
-    my $lr = $self->_get_lr($index);
-    my $wd = $self->_get_wd($index);
-    my $t = $self->_update_count($index);
-    my $cgrad = $self->_cumulate_gradient($grad, $index);
-    if(($cgrad->{num_cums} % $self->batch_scale) == 0)
-    {
-        my $lbmult;
-        $grad = $cgrad->{cum_grad} / $self->batch_scale;
-        if($self->warmup_strategy eq 'lars')
-        {
-            $lbmult = $self->_get_lars($weight, $grad, $wd);
-        }
-        else
-        {
-            $lbmult = $self->_get_lbmult($cgrad->{num_cums});
-        }
-        $lr = $lr * $lbmult;
-        my %kwargs = (
-            out => $weight,
-            lr => $lr,
-            wd => $wd,
-            rescale_grad => $self->rescale_grad
-        );
-        if($self->clip_gradient)
-        {
-            $kwargs{clip_gradient} = $self->clip_gradient;
-        }
-        if($self->momentum > 0)
-        {
-            $kwargs{momentum} = $self->momentum;
-        }
-        my $use_multi_precision = ref($state) eq 'ARRAY';
-        if(not $use_multi_precision)
-        {
-            if(defined $state)
-            {
-                AI::MXNet::NDArray->sgd_mom_update($weight, $grad, $state, %kwargs);
-            }
-            else
-            {
-                AI::MXNet::NDArray->sgd_update($weight, $grad, %kwargs);
-            }
-        }
-        else
-        {
-            if(defined $state->[0])
-            {
-                AI::MXNet::NDArray->mp_sgd_mom_update($weight, $grad, @{ $state }, %kwargs);
-            }
-            else
-            {
-                AI::MXNet::NDArray->mp_sgd_update($weight, $grad, $state->[1], %kwargs);
-            }
-        }
-        $self->_reset_cum_gradient($index);
-    }
-    else
-    {
-        AI::MXNet::NDArray->sgd_update($weight, $grad, out => $weight, lr => 0, wd => $wd);
-    }
-}
-
-__PACKAGE__->register;
-
-package AI::MXNet::DCASGD;
-use Mouse;
-use AI::MXNet::Base;
-extends 'AI::MXNet::Optimizer';
-
-=head1 NAME
-
-    AI::MXNet::DCASGD - DCASGD optimizer with momentum and weight regularization.
-=cut
-
-=head1 DESCRIPTION
-
-    DCASGD optimizer with momentum and weight regularization.
-
-    Implements the paper "Asynchronous Stochastic Gradient Descent with
-    Delay Compensation for Distributed Deep Learning".
-
-    Parameters
-    ----------
-    learning_rate : Num, optional
-        learning rate of SGD
-
-    momentum : Num, optional
-        momentum value
-
-    lamda : Num, optional
-        scale of the delay compensation term
-
-    wd : Num, optional
-        L2 regularization coefficient added to all the weights
-
-    rescale_grad : Num, optional
-        rescaling factor of gradient. Normally should be 1/batch_size.
-
-    clip_gradient : Num, optional
-        clip gradient in range [-clip_gradient, clip_gradient]
-
-    param_idx2name : hash ref of Str/Int to Num, optional
-        special treatment of weight decay for parameters that end with bias, gamma, and beta
-=cut
-has 'momentum' => (is => 'ro', isa => 'Num', default => 0);
-has 'lamda' => (is => 'ro', isa => 'Num', default => 0.04);
-has 'weight_previous' => (is => 'rw', init_arg => undef);
-
-sub BUILD
-{
-    my $self = shift;
-    $self->weight_previous({});
-}
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    return [
-        $self->momentum ? AI::MXNet::NDArray->zeros(
-            $weight->shape, ctx => $weight->context, dtype => $weight->dtype
-        ) : undef,
-        $weight->copy
-    ];
-}
-
-method update(
-    Index $index,
-    AI::MXNet::NDArray $weight,
-    AI::MXNet::NDArray $grad,
-    Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state
-)
-{
-    my $lr = $self->_get_lr($index);
-    my $wd = $self->_get_wd($index);
-    $self->_update_count($index);
-    $grad *= $self->rescale_grad;
-    if($self->clip_gradient)
-    {
-        $grad = AI::MXNet::NDArray->clip(
-            $grad,
-            -$self->clip_gradient,
-            $self->clip_gradient
-        );
-    }
-    my ($mom, $weight_previous) = @{ $state };
-    if(defined $mom)
-    {
-        $mom *= $self->momentum;
-        $mom += -$lr * (
-            $grad + $wd * $weight
-            +
-            $self->lamda * $grad * $grad * ($weight - $weight_previous)
-        );
-    }
-    else
-    {
-        assert($self->momentum == 0);
-        $mom = -$lr * (
-            $grad + $wd * $weight
-            +
-            $self->lamda * $grad * $grad * ($weight - $weight_previous)
-        );
-    }
-    $weight_previous .= $weight;
-    $weight += $mom;
-}
-
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::NAG - SGD with Nesterov momentum.
-=cut
-
-=head1 DESCRIPTION
-
-    It is implemented according to
-    https://github.com/torch/optim/blob/master/sgd.lua
-=cut
-
-package AI::MXNet::NAG;
-use Mouse;
-extends 'AI::MXNet::SGD';
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    my $momentum;
-    my $weight_master_copy;
-    my $do_multi_precision = ($self->multi_precision and $weight->dtype eq 'float16');
-    if($do_multi_precision)
-    {
-        if($self->momentum != 0)
-        {
-            $momentum = AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype => 'float32');
-        }
-        $weight_master_copy = AI::MXNet::NDArray->array($weight, ctx => $weight->context, dtype => 'float32');
-        return [$weight_master_copy, $momentum];
-    }
-    else
-    {
-        if($self->momentum != 0)
-        {
-            $momentum = AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype => $weight->dtype);
-        }
-        return $momentum;
-    }
-}
-
-method update($index, $weight, $grad, $state)
-{
-    my $lr = $self->_get_lr($index);
-    my $wd = $self->_get_wd($index);
-    $self->_update_count($index);
-    my $use_multi_precision = (defined $state and not Scalar::Util::blessed($state) and ref($state) eq 'ARRAY');
-    if(not $use_multi_precision)
-    {
-        $grad *= $self->rescale_grad;
-        if(defined $self->clip_gradient)
-        {
-            $grad = AI::MXNet::NDArray->clip($grad, -$self->clip_gradient, $self->clip_gradient);
-        }
-        if($self->momentum == 0)
-        {
-            $weight += -$lr * ($grad + $wd * $weight);
-        }
-        else
-        {
-            $grad += $wd * $weight;
-            my $mom = $state;
-            $mom *= $self->momentum;
-            $mom -= $lr * $grad;
-            $grad *= -$lr;
-            $grad += $self->momentum * $mom;
-            $weight += $grad;
-        }
-    }
-    else
-    {
-        my $grad32 = AI::MXNet::NDArray->array($grad, ctx => $grad->context, dtype => 'float32');
-        $grad32 *= $self->rescale_grad;
-        if(defined $self->clip_gradient)
-        {
-            $grad32 = AI::MXNet::NDArray->clip($grad32, -$self->clip_gradient, $self->clip_gradient);
-        }
-        my $mom = $state->[1];
-        my $weight32 = $state->[0];
-        if($self->momentum == 0)
-        {
-            $weight32 += -$lr * ($grad32 + $wd * $weight32);
-        }
-        else
-        {
-            $grad32 += $wd * $weight32;
-            $mom *= $self->momentum;
-            $mom -= $lr * $grad32;
-            $grad32 *= -$lr;
-            $grad32 += $self->momentum * $mom;
-            $weight32 += $grad32;
-        }
-        my $tmp = $weight32->astype($weight->dtype);
-        $tmp->copyto($weight);
-    }
-}
-
-__PACKAGE__->register;
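-
-# Sketch of the Nesterov step implemented above (standard-precision branch,
-# momentum mu; illustrative commentary, not from the original file):
-#
-#   grad   = rescale_grad * grad + wd * weight   # regularized gradient
-#   mom    = mu * mom - lr * grad                # updated velocity
-#   weight = weight + mu * mom - lr * grad       # look-ahead step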
-
-=head1 NAME
-
-    AI::MXNet::SGLD - Stochastic Gradient Riemannian Langevin Dynamics.
-=cut
-
-=head1 DESCRIPTION
-
-    Stochastic Gradient Riemannian Langevin Dynamics.
-
-    This class implements the optimizer described in the paper *Stochastic Gradient
-    Riemannian Langevin Dynamics on the Probability Simplex*, available at
-    https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf.
-
-    Parameters
-    ----------
-    learning_rate : Num, optional
-        learning rate of SGD
-
-    wd : Num, optional
-        L2 regularization coefficient added to all the weights
-
-    rescale_grad : Num, optional
-        rescaling factor of gradient. Normally should be 1/batch_size.
-
-    clip_gradient : Num, optional
-        clip gradient in range [-clip_gradient, clip_gradient]
-=cut
-
-package AI::MXNet::SGLD;
-use Mouse;
-
-extends 'AI::MXNet::Optimizer';
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    return undef;
-}
-
-method update(
-    Index $index,
-    AI::MXNet::NDArray $weight,
-    AI::MXNet::NDArray $grad,
-    AI::MXNet::NDArray|Undef $state
-)
-{
-    my $lr = $self->_get_lr($index);
-    my $wd = $self->_get_wd($index);
-    $self->_update_count($index);
-    $grad *= $self->rescale_grad;
-    if($self->clip_gradient)
-    {
-        $grad = AI::MXNet::NDArray->clip(
-            $grad,
-            -$self->clip_gradient,
-            $self->clip_gradient
-        );
-    }
-    $weight +=
-        - $lr/2 * ($grad + $wd * $weight)
-        +
-        AI::MXNet::Random->normal(
-            0, sqrt($lr),
-            shape => $weight->shape,
-            ctx => $weight->context,
-            dtype => $weight->dtype
-        );
-}
-
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::Adam - Adam optimizer as described in [King2014].
-=cut
-
-=head1 DESCRIPTION
-
-    Adam optimizer as described in [King2014].
-
-    [King2014] Diederik Kingma, Jimmy Ba,
-    *Adam: A Method for Stochastic Optimization*,
-    http://arxiv.org/abs/1412.6980
-
-    Parameters
-    ----------
-    learning_rate : Num, optional
-        Step size.
-        Default value is set to 0.001.
-    beta1 : Num, optional
-        Exponential decay rate for the first moment estimates.
-        Default value is set to 0.9.
-    beta2 : Num, optional
-        Exponential decay rate for the second moment estimates.
-        Default value is set to 0.999.
-    epsilon : Num, optional
-        Default value is set to 1e-8.
-
-    wd : Num, optional
-        L2 regularization coefficient added to all the weights
-    rescale_grad : Num, optional
-        rescaling factor of gradient. Normally should be 1/batch_size.
-
-    clip_gradient : Num, optional
-        clip gradient in range [-clip_gradient, clip_gradient]
-=cut
-package AI::MXNet::Adam;
-use Mouse;
-
-extends 'AI::MXNet::Optimizer';
-
-has 'kwargs' => (is => "rw", isa => "HashRef[Num]");
-has '+learning_rate' => (default => 0.001);
-has 'beta1' => (is => "rw", isa => "Num", default => 0.9);
-has 'beta2' => (is => "rw", isa => "Num", default => 0.999);
-has 'epsilon' => (is => "rw", isa => "Num", default => 1e-8);
-has 'lazy_update' => (is => 'rw', isa => 'Bool', default => 1);
-
-sub BUILD
-{
-    my $self = shift;
-    $self->kwargs({
-        beta1 => $self->beta1,
-        beta2 => $self->beta2,
-        epsilon => $self->epsilon
-    });
-    if($self->clip_gradient)
-    {
-        $self->kwargs->{clip_gradient} = $self->clip_gradient;
-    }
-}
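-
-# Note (illustrative commentary, not from the original file): the update
-# method below folds Adam's bias correction into the step size, as in
-# Kingma & Ba:
-#
-#   lr_t = lr * sqrt(1 - beta2**t) / (1 - beta1**t)
-#
-# e.g. at t = 1 with the defaults, lr_t = 0.001 * sqrt(0.001) / 0.1 ~= 3.2e-4.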
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    my $stype = $self->lazy_update ? $weight->stype : 'default';
-    return [
-        AI::MXNet::NDArray->zeros(
-            $weight->shape,
-            ctx => $weight->context,
-            dtype => $weight->dtype,
-            stype => $stype
-        ),  # mean
-        AI::MXNet::NDArray->zeros(
-            $weight->shape,
-            ctx => $weight->context,
-            dtype => $weight->dtype,
-            stype => $stype
-        )   # variance
-    ];
-}
-
-method update(
-    Index $index,
-    AI::MXNet::NDArray $weight,
-    AI::MXNet::NDArray $grad,
-    ArrayRef[AI::MXNet::NDArray] $state
-)
-{
-    my $lr = $self->_get_lr($index);
-    my $wd = $self->_get_wd($index);
-    $self->_update_count($index);
-    my $t = $self->_index_update_count->{$index};
-    my $coef1 = 1 - $self->beta1**$t;
-    my $coef2 = 1 - $self->beta2**$t;
-    $lr *= sqrt($coef2)/$coef1;
-    my ($mean, $var) = @{ $state };
-    AI::MXNet::NDArray->adam_update(
-        $weight, $grad, $mean, $var,
-        {
-            out => $weight,
-            lr => $lr,
-            wd => $wd,
-            rescale_grad => $self->rescale_grad,
-            %{ $self->kwargs }
-        }
-    );
-}
-
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::AdaGrad - AdaGrad optimizer of Duchi et al., 2011.
-=cut
-
-=head1 DESCRIPTION
-
-    AdaGrad optimizer of Duchi et al., 2011.
-
-    This code follows the version in http://arxiv.org/pdf/1212.5701v1.pdf Eq(5)
-    by Matthew D. Zeiler, 2012. AdaGrad will help the network to converge faster
-    in some cases.
-
-    Parameters
-    ----------
-    learning_rate : Num, optional
-        Step size.
-        Default value is set to 0.05.
-
-    wd : Num, optional
-        L2 regularization coefficient added to all the weights
-
-    rescale_grad : Num, optional
-        rescaling factor of gradient. Normally should be 1/batch_size.
-
-    epsilon: Num, optional
-        A small float number to keep the update numerically stable.
-        Default value is set to 1e-7.
-
-    clip_gradient : Num, optional
-        clip gradient in range [-clip_gradient, clip_gradient]
-=cut
-package AI::MXNet::AdaGrad;
-use Mouse;
-
-extends 'AI::MXNet::Optimizer';
-
-has 'epsilon' => (is => "rw", isa => "Num", default => 1e-7);
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    return AI::MXNet::NDArray->zeros(
-        $weight->shape,
-        ctx => $weight->context,
-        stype => $weight->stype
-    );  # history
-}
-
-method update(
-    Index $index,
-    AI::MXNet::NDArray $weight,
-    AI::MXNet::NDArray $grad,
-    AI::MXNet::NDArray $state
-)
-{
-    my $lr = $self->_get_lr($index);
-    my $wd = $self->_get_wd($index);
-    $self->_update_count($index);
-    my $is_sparse = $grad->stype eq 'row_sparse' ? 1 : 0;
-    my $history = $state;
-    if($is_sparse)
-    {
-        my %kwargs = (
-            epsilon => $self->epsilon,
-            rescale_grad => $self->rescale_grad
-        );
-        if($self->clip_gradient)
-        {
-            $kwargs{clip_gradient} = $self->clip_gradient;
-        }
-        AI::MXNet::NDArray::Sparse->adagrad_update($weight, $grad, $history, { out => $weight, lr => $lr, wd => $wd, %kwargs });
-    }
-    else
-    {
-        $grad *= $self->rescale_grad;
-        if(defined $self->clip_gradient)
-        {
-            $grad = AI::MXNet::NDArray->clip($grad, -$self->clip_gradient, $self->clip_gradient);
-        }
-        $grad += $wd * $weight;
-        $history += $grad->square;
-        my $div = $grad / ($history->sqrt + $self->epsilon);
-        $weight += $div * -$lr;
-    }
-}
-
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::RMSProp - RMSProp optimizer of Tieleman & Hinton, 2012.
-=cut
-
-=head1 DESCRIPTION
-
-    RMSProp optimizer of Tieleman & Hinton, 2012.
-
-    For centered=False, the code follows the version in
-    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by
-    Tieleman & Hinton, 2012.
-
-    For centered=True, the code follows the version in
-    http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
-
-    Parameters
-    ----------
-    learning_rate : Num, optional
-        Step size.
-        Default value is set to 0.001.
-    rho: Num, optional
-        decay factor of the moving average for gradient^2.
-        Default value is set to 0.9.
-    momentum: Num, optional
-        Default value is set to 0.9.
-        Only used if centered=True.
-    epsilon : Num, optional
-        Default value is set to 1e-8.
-    centered : Bool, optional
-        Use Graves' or Tieleman & Hinton's version of RMSProp
-    wd : Num, optional
-        L2 regularization coefficient added to all the weights
-    rescale_grad : Num, optional
-        rescaling factor of gradient.
-    clip_gradient : Num, optional
-        clip gradient in range [-clip_gradient, clip_gradient]
-    clip_weights : Num, optional
-        clip weights in range [-clip_weights, clip_weights]
-=cut
-
-package AI::MXNet::RMSProp;
-use Mouse;
-
-extends 'AI::MXNet::Optimizer';
-
-has '+learning_rate' => (default => 0.001);
-has 'rho' => (is => "ro", isa => "Num", default => 0.9);
-has 'momentum' => (is => "ro", isa => "Num", default => 0.9);
-has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8);
-has 'centered' => (is => "ro", isa => "Bool", default => 0);
-has 'clip_weights' => (is => "ro", isa => "Num");
-has 'kwargs' => (is => "rw", init_arg => undef);
-
-sub BUILD
-{
-    my $self = shift;
-    $self->kwargs({
-        rho => $self->rho,
-        epsilon => $self->epsilon
-    });
-    if($self->centered)
-    {
-        $self->kwargs->{momentum} = $self->momentum;
-    }
-    if($self->clip_gradient)
-    {
-        $self->kwargs->{clip_gradient} = $self->clip_gradient;
-    }
-    if($self->clip_weights)
-    {
-        $self->kwargs->{clip_weights} = $self->clip_weights;
-    }
-}
-
-# For centered=False: n
-# For centered=True: n, g, delta
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    return [
-        $self->centered
-        ? (
-            AI::MXNet::NDArray->zeros(
-                $weight->shape,
-                ctx => $weight->context,
-                stype => $weight->stype
-            ),  # n
-            AI::MXNet::NDArray->zeros(
-                $weight->shape,
-                ctx => $weight->context,
-                stype => $weight->stype
-            ),  # g
-            AI::MXNet::NDArray->zeros(
-                $weight->shape,
-                ctx => $weight->context,
-                stype => $weight->stype
-            )   # delta
-        )
-        : (
-            AI::MXNet::NDArray->zeros(
-                $weight->shape,
-                ctx => $weight->context,
-                stype => $weight->stype
-            ),  # n
-        )
-    ];
-}
-
-method update(
-    Index $index,
-    AI::MXNet::NDArray $weight,
-    AI::MXNet::NDArray $grad,
-    ArrayRef[AI::MXNet::NDArray] $state
-)
-{
-    my $lr = $self->_get_lr($index);
-    my $wd = $self->_get_wd($index);
-    $self->_update_count($index);
-    my ($n, $g, $delta) = @{ $state };
-    if($self->centered)
-    {
-        AI::MXNet::NDArray->rmspropalex_update(
-            $weight, $grad, $g, $n, $delta,
-            {
-                out => $weight,
-                lr => $lr,
-                wd => $wd,
-                rescale_grad => $self->rescale_grad,
-                %{ $self->kwargs }
-            }
-        );
-    }
-    else
-    {
-        AI::MXNet::NDArray->rmsprop_update(
-            $weight, $grad, $n,
-            {
-                out => $weight,
-                lr => $lr,
-                wd => $wd,
-                rescale_grad => $self->rescale_grad,
-                %{ $self->kwargs }
-            }
-        );
-    }
-}
-
-__PACKAGE__->register;
-
-=head1 NAME
-
-    AI::MXNet::AdaDelta - AdaDelta optimizer.
-=cut
-
-=head1 DESCRIPTION
-
-    AdaDelta optimizer as described in
-    Zeiler, M. D. (2012).
-    *ADADELTA: An adaptive learning rate method.*
-    http://arxiv.org/abs/1212.5701
-
-    Parameters
-    ----------
-    rho: Num
-        Decay rate for both squared gradients and delta x
-    epsilon : Num
-        The constant as described in the paper
-    wd : Num
-        L2 regularization coefficient added to all the weights
-    rescale_grad : Num, optional
-        rescaling factor of gradient. Normally should be 1/batch_size.
- clip_gradient : Num, optional - clip gradient in range [-clip_gradient, clip_gradient] -=cut -package AI::MXNet::AdaDelta; -use Mouse; - -extends 'AI::MXNet::Optimizer'; - -has 'rho' => (is => "rw", isa => "Num", default => 0.9); -has 'epsilon' => (is => "rw", isa => "Num", default => 1e-5); - -method create_state(Index $index, AI::MXNet::NDArray $weight) -{ - return [ - AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context - ), # accumulated g - AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context - ) # accumulated delta - ]; -} - -method update( - Index $index, - AI::MXNet::NDArray $weight, - AI::MXNet::NDArray $grad, - ArrayRef[AI::MXNet::NDArray] $state -) -{ - my $wd = $self->_get_wd($index); - $self->_update_count($index); - $grad *= $self->rescale_grad; - if($self->clip_gradient) - { - $grad = AI::MXNet::NDArray->clip( - $grad, - -$self->clip_gradient, - $self->clip_gradient - ); - } - my ($acc_g, $acc_delta) = @{ $state }; - $acc_g .= $self->rho * $acc_g + (1 - $self->rho) * $grad * $grad; - my $current_delta = ($acc_delta + $self->epsilon)->sqrt - / - ($acc_g + $self->epsilon)->sqrt - * - $grad; - $acc_delta .= $self->rho * $acc_delta + (1 - $self->rho) * $current_delta * $current_delta; - $weight -= $current_delta + $wd * $weight; -} - -__PACKAGE__->register; - -# For test use -package AI::MXNet::Test; -use Mouse; - -extends 'AI::MXNet::Optimizer'; - -# Create a state to duplicate weight -method create_state(Index $index, AI::MXNet::NDArray $weight) -{ - return AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context - ); -} - -# performs w += rescale_grad * grad -method update( - Index $index, - AI::MXNet::NDArray $weight, - AI::MXNet::NDArray $grad, - AI::MXNet::NDArray $state -) -{ - $weight += $grad * $self->rescale_grad; - $state .= $weight; -} - -__PACKAGE__->register; - -package AI::MXNet::Ftrl; - - -=head1 NAME - - AI::MXNet::Ftrl -=cut - -=head1 DESCRIPTION - - Referenced from *Ad Click Prediction: a View from the Trenches*, available at - http://dl.acm.org/citation.cfm?id=2488200. - - The optimizer updates the weight by: - - rescaled_grad = clip(grad * rescale_grad, clip_gradient) - z += rescaled_grad - (sqrt(n + rescaled_grad**2) - sqrt(n)) * weight / learning_rate - n += rescaled_grad**2 - w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1) - - If the storage types of weight, state and grad are all row_sparse, - **sparse updates** are applied by:: - - for row in grad.indices: - rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) - z[row] += rescaled_grad[row] - (sqrt(n[row] + rescaled_grad[row]**2) - sqrt(n[row])) * weight[row] / learning_rate - n[row] += rescaled_grad[row]**2 - w[row] = (sign(z[row]) * lamda1 - z[row]) / ((beta + sqrt(n[row])) / learning_rate + wd) * (abs(z[row]) > lamda1) - - The sparse update only updates the z and n for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all - indices. Compared with the original update, it can provide large - improvements in model training throughput for some applications. However, it - provides slightly different semantics than the original update, and - may lead to different empirical results. - - This optimizer accepts the following parameters in addition to those accepted - by AI::MXNet::Optimizer - - Parameters - ---------- - lamda1 : Num, optional - L1 regularization coefficient. - learning_rate : Num, optional - The initial learning rate. 
-    beta : Num, optional
-        Per-coordinate learning rate correlation parameter.
-=cut
-
-use Mouse;
-extends 'AI::MXNet::Optimizer';
-has '+learning_rate' => (default => 0.1);
-has 'beta' => (is => "ro", isa => "Num", default => 1);
-has 'lamda1' => (is => "ro", isa => "Num", default => 0.01);
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    return [
-        AI::MXNet::NDArray->zeros(
-            $weight->shape,
-            ctx => $weight->context,
-            stype => $weight->stype
-        ), # z
-        AI::MXNet::NDArray->zeros(
-            $weight->shape,
-            ctx => $weight->context,
-            stype => $weight->stype
-        ) # n
-    ];
-}
-
-method update(
-    Index $index,
-    AI::MXNet::NDArray $weight,
-    AI::MXNet::NDArray $grad,
-    ArrayRef[AI::MXNet::NDArray] $state
-)
-{
-    $self->_update_count($index);
-    my $wd = $self->_get_wd($index);
-    my $lr = $self->_get_lr($index);
-    my %kwargs = (lamda1 => $self->lamda1, beta => $self->beta, rescale_grad => $self->rescale_grad);
-    if($self->clip_gradient)
-    {
-        $kwargs{clip_gradient} = $self->clip_gradient;
-    }
-    # z and n accumulators created by create_state
-    my ($z, $n) = @{ $state };
-    AI::MXNet::NDArray->ftrl_update(
-        $weight, $grad, $z, $n,
-        { lr => $lr, wd => $wd, %kwargs, out => $weight }
-    );
-}
-
-__PACKAGE__->register;
-
-package AI::MXNet::Adamax;
-
-=head1 NAME
-
-    AI::MXNet::Adamax
-=cut
-
-=head1 DESCRIPTION
-
-    It is a variant of Adam based on the infinity norm,
-    available at http://arxiv.org/abs/1412.6980 Section 7.
-
-    This optimizer accepts the following parameters in addition to those accepted
-    by AI::MXNet::Optimizer.
-
-    Parameters
-    ----------
-    beta1 : Num, optional
-        Exponential decay rate for the first moment estimates.
-    beta2 : Num, optional
-        Exponential decay rate for the second moment estimates.
-=cut
-
-use Mouse;
-extends 'AI::MXNet::Optimizer';
-has '+learning_rate' => (default => 0.002);
-has 'beta1' => (is => "ro", isa => "Num", default => 0.9);
-has 'beta2' => (is => "ro", isa => "Num", default => 0.999);
-
-method create_state(Index $index, AI::MXNet::NDArray $weight)
-{
-    return [
-        AI::MXNet::NDArray->zeros(
-            $weight->shape,
-            ctx => $weight->context,
-            dtype => $weight->dtype
-        ), # mean
-        AI::MXNet::NDArray->zeros(
-            $weight->shape,
-            ctx => $weight->context,
-            dtype => $weight->dtype
-        ) # variance
-    ];
-}
-
-method update(
-    Index $index,
-    AI::MXNet::NDArray $weight,
-    AI::MXNet::NDArray $grad,
-    ArrayRef[AI::MXNet::NDArray] $state
-)
-{
-    my $wd = $self->_get_wd($index);
-    my $lr = $self->_get_lr($index);
-    $self->_update_count($index);
-    my $t = $self->_index_update_count->{$index};
-    $lr /= (1 - $self->beta1**$t);
-
-    $grad = $grad * $self->rescale_grad + $wd * $weight;
-    if($self->clip_gradient)
-    {
-        $grad = AI::MXNet::NDArray->clip(
-            $grad,
-            -$self->clip_gradient,
-            $self->clip_gradient
-        );
-    }
-
-    # update m_t and u_t
-    my($m_t, $u_t) = @{ $state };
-    $m_t .= $self->beta1 * $m_t + (1 - $self->beta1) * $grad;
-    $u_t .= AI::MXNet::NDArray->maximum($self->beta2 * $u_t, $grad->abs);
-
-    # update weight
-    $weight -= $lr * $m_t / $u_t;
-}
-
-__PACKAGE__->register;
-
-package AI::MXNet::Nadam;
-
-=head1 NAME
-
-    AI::MXNet::Nadam
-=cut
-
-=head1 DESCRIPTION
-
-    The Nesterov Adam optimizer.
-
-    Much like Adam is essentially RMSprop with momentum,
-    Nadam is Adam with Nesterov momentum, available
-    at http://cs229.stanford.edu/proj2015/054_report.pdf.
-
-    This optimizer accepts the following parameters in addition to those accepted
-    by AI::MXNet::Optimizer.
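A usage sketch for the optimizers this file registers (Nadam's parameter list continues below). The module and iterator names are hypothetical; the string names map to the lowercased class names via __PACKAGE__->register, and selection-by-name through Module->fit is the usual AI::MXNet pattern:

```perl
use AI::MXNet qw(mx);

# $net and $train_iter are hypothetical: any Symbol and any data iterator.
my $mod = mx->mod->Module(symbol => $net, context => mx->cpu);
$mod->fit(
    $train_iter,
    optimizer        => 'adamax',   # or 'ftrl', 'nadam', 'rmsprop', ...
    optimizer_params => { learning_rate => 0.002, wd => 1e-4 },
    num_epoch        => 10,
);
```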
- - Parameters - ---------- - beta1 : Num, optional - Exponential decay rate for the first moment estimates. - beta2 : Num, optional - Exponential decay rate for the second moment estimates. - epsilon : Num, optional - Small value to avoid division by 0. - schedule_decay : Num, optional - Exponential decay rate for the momentum schedule -=cut - -use Mouse; -extends 'AI::MXNet::Optimizer'; -has '+learning_rate' => (default => 0.001); -has 'beta1' => (is => "ro", isa => "Num", default => 0.9); -has 'beta2' => (is => "ro", isa => "Num", default => 0.999); -has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8); -has 'schedule_decay' => (is => "ro", isa => "Num", default => 0.004); -has 'm_schedule' => (is => "rw", default => 1, init_arg => undef); - -method create_state(Index $index, AI::MXNet::NDArray $weight) -{ - return [ - AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context, - dtype => $weight->dtype - ), # mean - AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context, - dtype => $weight->dtype - ) # variance - ]; -} - -method update( - Index $index, - AI::MXNet::NDArray $weight, - AI::MXNet::NDArray $grad, - ArrayRef[AI::MXNet::NDArray] $state -) -{ - my $wd = $self->_get_wd($index); - my $lr = $self->_get_lr($index); - $self->_update_count($index); - my $t = $self->_index_update_count->{$index}; - $grad = $grad * $self->rescale_grad + $wd * $weight; - if($self->clip_gradient) - { - $grad = AI::MXNet::NDArray->clip( - $grad, - -$self->clip_gradient, - $self->clip_gradient - ); - } - # warming momentum schedule - my $momentum_t = $self->beta1 * (1 - 0.5 * (0.96**($t * $self->schedule_decay))); - my $momentum_t_1 = $self->beta1 * (1 - 0.5 * (0.96**(($t + 1) * $self->schedule_decay))); - $self->m_schedule = $self->m_schedule * $momentum_t; - my $m_schedule_next = $self->m_schedule * $momentum_t_1; - - # update m_t and v_t - my ($m_t, $v_t) = @{ $state }; - $m_t .= $self->beta1 * $m_t + (1 - $self->beta1) * $grad; - $v_t .= $self->beta2 * $v_t + (1 - $self->beta2) * $grad * $grad; - - my $grad_prime = $grad / (1 - $self->m_schedule); - my $m_t_prime = $m_t / (1 - $m_schedule_next); - my $v_t_prime = $v_t / (1 - $self->beta2**$t); - my $m_t_bar = (1 - $momentum_t) * $grad_prime + $momentum_t_1 * $m_t_prime; - - # update weight - $weight -= $lr * $m_t_bar / (sqrt($v_t_prime) + $self->epsilon); -} - -__PACKAGE__->register; - -=head1 NAME - - AI::MXNet::Updater - Updater for kvstore -=cut - -package AI::MXNet::Updater; -use Mouse; -use Storable qw(thaw freeze); -use overload "&{}" => sub { my $self = shift; sub { $self->call(@_) } }, - fallback => 1; - -has "optimizer" => (is => "rw", isa => "AI::MXNet::Optimizer"); -has "states" => (is => "rw", isa => "HashRef", default => sub { +{} }); -has "states_synced" => (is => "rw", isa => "HashRef", default => sub { +{} }); - -method call(Index $index, AI::MXNet::NDArray $grad, AI::MXNet::NDArray $weight) -{ - if(not exists $self->states->{ $index }) - { - $self->states->{ $index } = $self->optimizer->create_state($index, $weight); - $self->states_synced->{ $index } = 1; - } - elsif(not $self->states_synced->{ $index }) - { - $self->states->{ $index } = $self->sync_state_context($self->states->{ $index }, $weight->context); - $self->states_synced->{ $index } = 1; - } - $self->optimizer->update($index, $weight, $grad, $self->states->{ $index }); -} -*slice = *call; - -method sync_state_context(Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]] $state, AI::MXNet::Context $context) -{ - if(blessed $state) - { - 
-        return $state->as_in_context($context);
-    }
-    elsif(ref $state)
-    {
-        return [map { $self->sync_state_context($_, $context) } @{ $state }];
-    }
-    return $state;
-}
-
-=head2 set_states
-
-    Sets updater states.
-=cut
-
-method set_states($states)
-{
-    my $thawed_states = thaw($states);
-    my ($optimizer);
-    if(ref $thawed_states eq 'ARRAY')
-    {
-        ($thawed_states, $optimizer) = @{ $thawed_states };
-        $self->optimizer($optimizer);
-    }
-    $self->states($thawed_states);
-    %{ $self->states_synced } = map { $_ => 0 } keys %{ $thawed_states };
-}
-
-=head2 get_states
-
-    Gets updater states.
-
-    Parameters
-    ----------
-    dump_optimizer : bool, default 0
-        Whether to also save the optimizer itself. This would also save optimizer
-        information such as learning rate and weight decay schedules.
-=cut
-
-method get_states(Bool $dump_optimizer=0)
-{
-    if($dump_optimizer)
-    {
-        my $param_dict = $self->optimizer->param_dict;
-        $self->optimizer->param_dict({});
-        my $freezed = freeze([$self->states, $self->optimizer]);
-        $self->optimizer->param_dict($param_dict);
-        return $freezed;
-    }
-    return freeze($self->states);
-}
-
-package AI::MXNet::Optimizer;
-
-method get_updater(AI::MXNet::Optimizer $optimizer)
-{
-    return AI::MXNet::Updater->new(optimizer => $optimizer);
-}
-
-1;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Profiler.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Profiler.pm
deleted file mode 100644
index 4a831b1b4dc2..000000000000
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Profiler.pm
+++ /dev/null
@@ -1,83 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::Profiler;
-use strict;
-use warnings;
-use AI::MXNet::Base;
-use AI::MXNet::Function::Parameters;
-
-=head1 NAME
-
-    AI::MXNet::Profiler - Optional profiler feature.
-=cut
-
-=head1 DESCRIPTION
-
-    Optional profiler.
-=cut
-
-=head2 profiler_set_config
-
-    Sets the configuration of the profiler.
-
-    Parameters
-    ----------
-    kwargs : hash ref
-        Indicates configuration parameters with key/value pairs, listed below
-        profile_symbolic : boolean, whether to profile symbolic operators
-        profile_imperative : boolean, whether to profile imperative operators
-        profile_memory : boolean, whether to profile memory usage
-        profile_api : boolean, whether to profile the C API
-        file_name : string, output file for profile data
-        continuous_dump : boolean, whether to periodically dump profiling data to file
-        dump_period : float, seconds between profile data dumps
-=cut
-
-method profiler_set_config(HashRef[Str] $kwargs)
-{
-    check_call(AI::MXNet::SetProfilerConfig(scalar(keys %{ $kwargs }), $kwargs));
-}
-
-=head2 profiler_set_state
-
-    Sets the profiler state for recording operators.
-
-    Parameters
-    ----------
-    state : int, optional
-        Indicating whether to run the profiler, can
-        be 'stop' - 0 or 'run' - 1.
Default is `stop`. -=cut - -method profiler_set_state(Int $state) -{ - check_call(AI::MXNet::SetProfilerState($state)); -} - -=head2 dump_profile - - Dump profile and stop profiler. Use this to save profile - in advance in case your program cannot exit normally -=cut - -method dump_profile() -{ - check_call(AI::MXNetCAPI::DumpProfile()); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/RNN.pm b/perl-package/AI-MXNet/lib/AI/MXNet/RNN.pm deleted file mode 100644 index 6ba9bf431e08..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/RNN.pm +++ /dev/null @@ -1,182 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::RNN; -use strict; -use warnings; -use AI::MXNet::NS; -use AI::MXNet::Function::Parameters; -use AI::MXNet::RNN::IO; -use AI::MXNet::RNN::Cell; -use List::Util qw(max); - -=encoding UTF-8 - -=head1 NAME - - AI::MXNet::RNN - Functions for constructing recurrent neural networks. -=cut - -=head1 SYNOPSIS - - -=head1 DESCRIPTION - - Functions for constructing recurrent neural networks. -=cut - -=head2 save_rnn_checkpoint - - Save checkpoint for model using RNN cells. - Unpacks weight before saving. - - Parameters - ---------- - cells : AI::MXNet::RNN::Cell or array ref of AI::MXNet::RNN::Cell - The RNN cells used by this symbol. - prefix : str - Prefix of model name. - epoch : int - The epoch number of the model. - symbol : Symbol - The input symbol - arg_params : hash ref of str to AI::MXNet::NDArray - Model parameter, hash ref of name to NDArray of net's weights. - aux_params : hash ref of str to AI::MXNet::NDArray - Model parameter, hash ref of name to NDArray of net's auxiliary states. - - Notes - ----- - - prefix-symbol.json will be saved for symbol. - - prefix-epoch.params will be saved for parameters. -=cut - -method save_rnn_checkpoint( - AI::MXNet::RNN::Cell::Base|ArrayRef[AI::MXNet::RNN::Cell::Base] $cells, - Str $prefix, - Int $epoch, - AI::MXNet::Symbol $symbol, - HashRef[AI::MXNet::NDArray] $arg_params, - HashRef[AI::MXNet::NDArray] $aux_params -) -{ - $cells = [$cells] unless ref $cells eq 'ARRAY'; - my %arg_params = %{ $arg_params }; - %arg_params = %{ $_->unpack_weights(\%arg_params) } for @{ $cells }; - AI::MXNet::Module->model_save_checkpoint($prefix, $epoch, $symbol, \%arg_params, $aux_params); -} - - -=head2 load_rnn_checkpoint - - Load model checkpoint from file. - Pack weights after loading. - - Parameters - ---------- - cells : AI::MXNet::RNN::Cell or ir array ref of AI::MXNet::RNN::Cell - The RNN cells used by this symbol. - prefix : str - Prefix of model name. - epoch : int - Epoch number of model we would like to load. - - Returns - ------- - symbol : Symbol - The symbol configuration of computation network. 
- arg_params : hash ref of str to NDArray - Model parameter, dict of name to NDArray of net's weights. - aux_params : hash ref of str to NDArray - Model parameter, dict of name to NDArray of net's auxiliary states. - - Notes - ----- - - symbol will be loaded from prefix-symbol.json. - - parameters will be loaded from prefix-epoch.params. -=cut - -method load_rnn_checkpoint( - AI::MXNet::RNN::Cell::Base|ArrayRef[AI::MXNet::RNN::Cell::Base] $cells, - Str $prefix, - Int $epoch -) -{ - my ($sym, $arg, $aux) = AI::MXNet::Module->load_checkpoint($prefix, $epoch); - $cells = [$cells] unless ref $cells eq 'ARRAY'; - $arg = $_->pack_weights($arg) for @{ $cells }; - return ($sym, $arg, $aux); -} - -=head2 do_rnn_checkpoint - - Make a callback to checkpoint Module to prefix every epoch. - unpacks weights used by cells before saving. - - Parameters - ---------- - cells : subclass of RNN::Cell - RNN cells used by this module. - prefix : str - The file prefix to checkpoint to - period : int - How many epochs to wait before checkpointing. Default is 1. - - Returns - ------- - callback : function - The callback function that can be passed as iter_end_callback to fit. -=cut - -method do_rnn_checkpoint( - AI::MXNet::RNN::Cell::Base|ArrayRef[AI::MXNet::RNN::Cell::Base] $cells, - Str $prefix, - Int $period -) -{ - $period = max(1, $period); - return sub { - my ($iter_no, $sym, $arg, $aux) = @_; - if (($iter_no + 1) % $period == 0) - { - __PACKAGE__->save_rnn_checkpoint($cells, $prefix, $iter_no+1, $sym, $arg, $aux); - } - }; -} - -## In order to closely resemble the Python's usage -method RNNCell(@args) { AI::MXNet::RNN::Cell->new(@args % 2 ? ('num_hidden', @args) : @args) } -method LSTMCell(@args) { AI::MXNet::RNN::LSTMCell->new(@args % 2 ? ('num_hidden', @args) : @args) } -method GRUCell(@args) { AI::MXNet::RNN::GRUCell->new(@args % 2 ? ('num_hidden', @args) : @args) } -method FusedRNNCell(@args) { AI::MXNet::RNN::FusedCell->new(@args % 2 ? ('num_hidden', @args) : @args) } -method SequentialRNNCell(@args) { AI::MXNet::RNN::SequentialCell->new(@args) } -method BidirectionalCell(@args) { AI::MXNet::RNN::BidirectionalCell->new(@args) } -method DropoutCell(@args) { AI::MXNet::RNN::DropoutCell->new(@args) } -method ZoneoutCell(@args) { AI::MXNet::RNN::ZoneoutCell->new(@args) } -method ConvRNNCell(@args) { AI::MXNet::RNN::ConvCell->new(@args) } -method ConvLSTMCell(@args) { AI::MXNet::RNN::ConvLSTMCell->new(@args) } -method ConvGRUCell(@args) { AI::MXNet::RNN::ConvGRUCell->new(@args) } -method ResidualCell(@args) { AI::MXNet::RNN::ResidualCell->new(@args) } -method encode_sentences(@args) { AI::MXNet::RNN::IO->encode_sentences(@args) } -method BucketSentenceIter(@args) -{ - my $sentences = shift(@args); - my $batch_size = shift(@args); - AI::MXNet::BucketSentenceIter->new(sentences => $sentences, batch_size => $batch_size, @args); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/RNN/Cell.pm b/perl-package/AI-MXNet/lib/AI/MXNet/RNN/Cell.pm deleted file mode 100644 index 9dd88cbb029e..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/RNN/Cell.pm +++ /dev/null @@ -1,2025 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::RNN::Params; -use Mouse; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::RNN::Params - A container for holding variables. -=cut - -=head1 DESCRIPTION - - A container for holding variables. - Used by RNN cells for parameter sharing between cells. - - Parameters - ---------- - prefix : str - All variables name created by this container will - be prepended with the prefix -=cut -has '_prefix' => (is => 'ro', init_arg => 'prefix', isa => 'Str', default => ''); -has '_params' => (is => 'rw', init_arg => undef); -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - return $class->$orig(prefix => $_[0]) if @_ == 1; - return $class->$orig(@_); -}; - -sub BUILD -{ - my $self = shift; - $self->_params({}); -} - - -=head2 get - - Get a variable with the name or create a new one if does not exist. - - Parameters - ---------- - $name : str - name of the variable - @kwargs: - more arguments that are passed to mx->sym->Variable call -=cut - -method get(Str $name, @kwargs) -{ - $name = $self->_prefix . $name; - if(not exists $self->_params->{$name}) - { - $self->_params->{$name} = AI::MXNet::Symbol->Variable($name, @kwargs); - } - return $self->_params->{$name}; -} - -package AI::MXNet::RNN::Cell::Base; -=head1 NAME - - AI::MXNet::RNNCell::Base -=cut - -=head1 DESCRIPTION - - Abstract base class for RNN cells - - Parameters - ---------- - prefix : str - prefix for name of layers - (and name of weight if params is undef) - params : AI::MXNet::RNN::Params or undef - container for weight sharing between cells. - created if undef. -=cut - -use AI::MXNet::Base; -use Mouse; -use overload "&{}" => sub { my $self = shift; sub { $self->call(@_) } }; -has '_prefix' => (is => 'rw', init_arg => 'prefix', isa => 'Str', default => ''); -has '_params' => (is => 'rw', init_arg => 'params', isa => 'Maybe[AI::MXNet::RNN::Params]'); -has [qw/_own_params - _modified - _init_counter - _counter - /] => (is => 'rw', init_arg => undef); - -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - return $class->$orig(prefix => $_[0]) if @_ == 1; - return $class->$orig(@_); -}; - -sub BUILD -{ - my $self = shift; - if(not defined $self->_params) - { - $self->_own_params(1); - $self->_params(AI::MXNet::RNN::Params->new($self->_prefix)); - } - else - { - $self->_own_params(0); - } - $self->_modified(0); - $self->reset; -} - -=head2 reset - - Reset before re-using the cell for another graph -=cut - -method reset() -{ - $self->_init_counter(-1); - $self->_counter(-1); -} - -=head2 call - - Construct symbol for one step of RNN. - - Parameters - ---------- - $inputs : mx->sym->Variable - input symbol, 2D, batch * num_units - $states : mx->sym->Variable or ArrayRef[AI::MXNet::Symbol] - state from previous step or begin_state(). - - Returns - ------- - $output : AI::MXNet::Symbol - output symbol - $states : ArrayRef[AI::MXNet::Symbol] - state to next step of RNN. 
- Can be called via overloaded &{}: &{$cell}($inputs, $states); -=cut - -method call(AI::MXNet::Symbol $inputs, AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol] $states) -{ - confess("Not Implemented"); -} - -method _gate_names() -{ - ['']; -} - -=head2 params - - Parameters of this cell -=cut - -method params() -{ - $self->_own_params(0); - return $self->_params; -} - -=head2 state_shape - - shape(s) of states -=cut - -method state_shape() -{ - return [map { $_->{shape} } @{ $self->state_info }]; -} - -=head2 state_info - - shape and layout information of states -=cut - -method state_info() -{ - confess("Not Implemented"); -} - -=head2 begin_state - - Initial state for this cell. - - Parameters - ---------- - :$func : sub ref, default is AI::MXNet::Symbol->can('zeros') - Function for creating initial state. - Can be AI::MXNet::Symbol->can('zeros'), - AI::MXNet::Symbol->can('uniform'), AI::MXNet::Symbol->can('Variable') etc. - Use AI::MXNet::Symbol->can('Variable') if you want to directly - feed the input as states. - @kwargs : - more keyword arguments passed to func. For example - mean, std, dtype, etc. - - Returns - ------- - $states : ArrayRef[AI::MXNet::Symbol] - starting states for first RNN step -=cut - -method begin_state(CodeRef :$func=AI::MXNet::Symbol->can('zeros'), @kwargs) -{ - assert( - (not $self->_modified), - "After applying modifier cells (e.g. DropoutCell) the base " - ."cell cannot be called directly. Call the modifier cell instead." - ); - my @states; - my $func_needs_named_name = $func ne AI::MXNet::Symbol->can('Variable'); - for my $info (@{ $self->state_info }) - { - $self->_init_counter($self->_init_counter + 1); - my @name = (sprintf("%sbegin_state_%d", $self->_prefix, $self->_init_counter)); - my %info = %{ $info//{} }; - if($func_needs_named_name) - { - unshift(@name, 'name'); - } - else - { - if(exists $info{__layout__}) - { - $info{kwargs} = { __layout__ => delete $info{__layout__} }; - } - } - my %kwargs = (@kwargs, %info); - my $state = $func->( - 'AI::MXNet::Symbol', - @name, - %kwargs - ); - push @states, $state; - } - return \@states; -} - -=head2 unpack_weights - - Unpack fused weight matrices into separate - weight matrices - - Parameters - ---------- - $args : HashRef[AI::MXNet::NDArray] - hash ref containing packed weights. - usually from AI::MXNet::Module->get_output() - - Returns - ------- - $args : HashRef[AI::MXNet::NDArray] - hash ref with weights associated with - this cell, unpacked. -=cut - -method unpack_weights(HashRef[AI::MXNet::NDArray] $args) -{ - my %args = %{ $args }; - my $h = $self->_num_hidden; - for my $group_name ('i2h', 'h2h') - { - my $weight = delete $args{ sprintf('%s%s_weight', $self->_prefix, $group_name) }; - my $bias = delete $args{ sprintf('%s%s_bias', $self->_prefix, $group_name) }; - enumerate(sub { - my ($j, $name) = @_; - my $wname = sprintf('%s%s%s_weight', $self->_prefix, $group_name, $name); - $args->{$wname} = $weight->slice([$j*$h,($j+1)*$h-1])->copy; - my $bname = sprintf('%s%s%s_bias', $self->_prefix, $group_name, $name); - $args->{$bname} = $bias->slice([$j*$h,($j+1)*$h-1])->copy; - }, $self->_gate_names); - } - return \%args; -} - -=head2 pack_weights - - Pack fused weight matrices into common - weight matrices - - Parameters - ---------- - args : HashRef[AI::MXNet::NDArray] - hash ref containing unpacked weights. - - Returns - ------- - $args : HashRef[AI::MXNet::NDArray] - hash ref with weights associated with - this cell, packed. 
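A round-trip sketch for the two methods above (the cell and parameter hash are hypothetical; the per-gate suffixes come from _gate_names, e.g. _i/_f/_c/_o for an LSTM):

```perl
use AI::MXNet qw(mx);

my $cell = mx->rnn->LSTMCell(num_hidden => 256, prefix => 'lstm_');
# $arg_params is hypothetical: a hash ref of name => NDArray,
# e.g. as returned by a Module's get_params.
my $per_gate = $cell->unpack_weights($arg_params); # lstm_i2h_i_weight, lstm_i2h_f_weight, ...
my $fused    = $cell->pack_weights($per_gate);     # back to lstm_i2h_weight / lstm_h2h_weight
```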
-=cut
-
-method pack_weights(HashRef[AI::MXNet::NDArray] $args)
-{
-    my %args = %{ $args };
-    my $h = $self->_num_hidden;
-    for my $group_name ('i2h', 'h2h')
-    {
-        my @weight;
-        my @bias;
-        for my $name (@{ $self->_gate_names })
-        {
-            my $wname = sprintf('%s%s%s_weight', $self->_prefix, $group_name, $name);
-            push @weight, delete $args{$wname};
-            my $bname = sprintf('%s%s%s_bias', $self->_prefix, $group_name, $name);
-            push @bias, delete $args{$bname};
-        }
-        $args{ sprintf('%s%s_weight', $self->_prefix, $group_name) } = AI::MXNet::NDArray->concatenate(
-            \@weight
-        );
-        $args{ sprintf('%s%s_bias', $self->_prefix, $group_name) } = AI::MXNet::NDArray->concatenate(
-            \@bias
-        );
-    }
-    return \%args;
-}
-
-=head2 unroll
-
-    Unroll an RNN cell across time steps.
-
-    Parameters
-    ----------
-    :$length : Int
-        number of steps to unroll
-    :$inputs : AI::MXNet::Symbol, array ref of Symbols, or undef
-        if inputs is a single Symbol (usually the output
-        of an Embedding symbol), it should have shape
-        [$batch_size, $length, ...] if layout == 'NTC' (batch, time series),
-        or [$length, $batch_size, ...] if layout == 'TNC' (time series, batch).
-
-        If inputs is an array ref of Symbols (usually the output of
-        a previous unroll), they should all have shape
-        [$batch_size, ...].
-
-        If inputs is undef, placeholder variables are
-        automatically created.
-    :$begin_state : array ref of Symbol
-        input states. Created by begin_state()
-        or output state of another cell. Created
-        from begin_state() if undef.
-    :$input_prefix : str
-        prefix for automatically created input
-        placeholders.
-    :$layout : str
-        layout of the input symbol. Only used if the input
-        is a single Symbol.
-    :$merge_outputs : Bool
-        If 0, returns outputs as an array ref of Symbols.
-        If 1, concatenates the output across the time steps
-        and returns a single symbol with shape
-        [$batch_size, $length, ...] if the layout equals 'NTC',
-        or [$length, $batch_size, ...] if the layout equals 'TNC'.
-        If undef, outputs whatever is faster.
-
-    Returns
-    -------
-    $outputs : array ref of Symbol or Symbol
-        output symbols.
-    $states : Symbol or nested list of Symbol
-        has the same structure as begin_state()
-=cut
-
-
-method unroll(
-    Int $length,
-    Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$inputs=,
-    Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$begin_state=,
-    Str :$input_prefix='',
-    Str :$layout='NTC',
-    Maybe[Bool] :$merge_outputs=
-)
-{
-    $self->reset;
-    my $axis = index($layout, 'T');
-    if(not defined $inputs)
-    {
-        $inputs = [
-            map { AI::MXNet::Symbol->Variable("${input_prefix}t${_}_data") } (0..$length-1)
-        ];
-    }
-    elsif(blessed($inputs))
-    {
-        assert(
-            (@{ $inputs->list_outputs() } == 1),
-            "unroll doesn't allow grouped symbol as input. 
Please " - ."convert to list first or let unroll handle slicing" - ); - $inputs = AI::MXNet::Symbol->SliceChannel( - $inputs, - axis => $axis, - num_outputs => $length, - squeeze_axis => 1 - ); - } - else - { - assert(@$inputs == $length); - } - $begin_state //= $self->begin_state; - my $states = $begin_state; - my $outputs; - my @inputs = @{ $inputs }; - for my $i (0..$length-1) - { - my $output; - ($output, $states) = $self->( - $inputs[$i], - $states - ); - push @$outputs, $output; - } - if($merge_outputs) - { - @$outputs = map { AI::MXNet::Symbol->expand_dims($_, axis => $axis) } @$outputs; - $outputs = AI::MXNet::Symbol->Concat(@$outputs, dim => $axis); - } - return($outputs, $states); -} - -method _get_activation($inputs, $activation, @kwargs) -{ - if(not ref $activation) - { - return AI::MXNet::Symbol->Activation($inputs, act_type => $activation, @kwargs); - } - else - { - return $activation->($inputs, @kwargs); - } -} - -method _cells_state_shape($cells) -{ - return [map { @{ $_->state_shape } } @$cells]; -} - -method _cells_state_info($cells) -{ - return [map { @{ $_->state_info } } @$cells]; -} - -method _cells_begin_state($cells, @kwargs) -{ - return [map { @{ $_->begin_state(@kwargs) } } @$cells]; -} - -method _cells_unpack_weights($cells, $args) -{ - $args = $_->unpack_weights($args) for @$cells; - return $args; -} - -method _cells_pack_weights($cells, $args) -{ - $args = $_->pack_weights($args) for @$cells; - return $args; -} - -package AI::MXNet::RNN::Cell; -use Mouse; -extends 'AI::MXNet::RNN::Cell::Base'; - -=head1 NAME - - AI::MXNet::RNN::Cell -=cut - -=head1 DESCRIPTION - - Simple recurrent neural network cell - - Parameters - ---------- - num_hidden : int - number of units in output symbol - activation : str or Symbol, default 'tanh' - type of activation function - prefix : str, default 'rnn_' - prefix for name of layers - (and name of weight if params is undef) - params : AI::MXNet::RNNParams or undef - container for weight sharing between cells. - created if undef. -=cut - -has '_num_hidden' => (is => 'ro', init_arg => 'num_hidden', isa => 'Int', required => 1); -has 'forget_bias' => (is => 'ro', isa => 'Num'); -has '_activation' => ( - is => 'ro', - init_arg => 'activation', - isa => 'Activation', - default => 'tanh' -); -has '+_prefix' => (default => 'rnn_'); -has [qw/_iW _iB - _hW _hB/] => (is => 'rw', init_arg => undef); - -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - return $class->$orig(num_hidden => $_[0]) if @_ == 1; - return $class->$orig(@_); -}; - -sub BUILD -{ - my $self = shift; - $self->_iW($self->params->get('i2h_weight')); - $self->_iB( - $self->params->get( - 'i2h_bias', - (defined($self->forget_bias) - ? 
(init => AI::MXNet::LSTMBias->new(forget_bias => $self->forget_bias)) - : () - ) - ) - ); - $self->_hW($self->params->get('h2h_weight')); - $self->_hB($self->params->get('h2h_bias')); -} - -method state_info() -{ - return [{ shape => [0, $self->_num_hidden], __layout__ => 'NC' }]; -} - -method call(AI::MXNet::Symbol $inputs, SymbolOrArrayOfSymbols $states) -{ - $self->_counter($self->_counter + 1); - my $name = sprintf('%st%d_', $self->_prefix, $self->_counter); - my $i2h = AI::MXNet::Symbol->FullyConnected( - data => $inputs, - weight => $self->_iW, - bias => $self->_iB, - num_hidden => $self->_num_hidden, - name => "${name}i2h" - ); - my $h2h = AI::MXNet::Symbol->FullyConnected( - data => @{$states}[0], - weight => $self->_hW, - bias => $self->_hB, - num_hidden => $self->_num_hidden, - name => "${name}h2h" - ); - my $output = $self->_get_activation( - $i2h + $h2h, - $self->_activation, - name => "${name}out" - ); - return ($output, [$output]); -} - -package AI::MXNet::RNN::LSTMCell; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::RNN::Cell'; - -=head1 NAME - - AI::MXNet::RNN::LSTMCell -=cut - -=head1 DESCRIPTION - - Long-Short Term Memory (LSTM) network cell. - - Parameters - ---------- - num_hidden : int - number of units in output symbol - prefix : str, default 'lstm_' - prefix for name of layers - (and name of weight if params is undef) - params : AI::MXNet::RNN::Params or None - container for weight sharing between cells. - created if undef. - forget_bias : bias added to forget gate, default 1.0. - Jozefowicz et al. 2015 recommends setting this to 1.0 -=cut - -has '+_prefix' => (default => 'lstm_'); -has '+_activation' => (init_arg => undef); -has '+forget_bias' => (is => 'ro', isa => 'Num', default => 1); - -method state_info() -{ - return [{ shape => [0, $self->_num_hidden], __layout__ => 'NC' } , { shape => [0, $self->_num_hidden], __layout__ => 'NC' }]; -} - -method _gate_names() -{ - [qw/_i _f _c _o/]; -} - -method call(AI::MXNet::Symbol $inputs, SymbolOrArrayOfSymbols $states) -{ - $self->_counter($self->_counter + 1); - my $name = sprintf('%st%d_', $self->_prefix, $self->_counter); - my @states = @{ $states }; - my $i2h = AI::MXNet::Symbol->FullyConnected( - data => $inputs, - weight => $self->_iW, - bias => $self->_iB, - num_hidden => $self->_num_hidden*4, - name => "${name}i2h" - ); - my $h2h = AI::MXNet::Symbol->FullyConnected( - data => $states[0], - weight => $self->_hW, - bias => $self->_hB, - num_hidden => $self->_num_hidden*4, - name => "${name}h2h" - ); - my $gates = $i2h + $h2h; - my @slice_gates = @{ AI::MXNet::Symbol->SliceChannel( - $gates, num_outputs => 4, name => "${name}slice" - ) }; - my $in_gate = AI::MXNet::Symbol->Activation( - $slice_gates[0], act_type => "sigmoid", name => "${name}i" - ); - my $forget_gate = AI::MXNet::Symbol->Activation( - $slice_gates[1], act_type => "sigmoid", name => "${name}f" - ); - my $in_transform = AI::MXNet::Symbol->Activation( - $slice_gates[2], act_type => "tanh", name => "${name}c" - ); - my $out_gate = AI::MXNet::Symbol->Activation( - $slice_gates[3], act_type => "sigmoid", name => "${name}o" - ); - my $next_c = AI::MXNet::Symbol->_plus( - $forget_gate * $states[1], $in_gate * $in_transform, - name => "${name}state" - ); - my $next_h = AI::MXNet::Symbol->_mul( - $out_gate, - AI::MXNet::Symbol->Activation( - $next_c, act_type => "tanh" - ), - name => "${name}out" - ); - return ($next_h, [$next_h, $next_c]); - -} - -package AI::MXNet::RNN::GRUCell; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::RNN::Cell'; - 
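A construction sketch for the cells defined here (names and sizes are illustrative, assuming the usual mx->rnn and mx->sym shorthands). Stepping uses the overloaded &{} described earlier; unroll builds the whole sequence graph at once:

```perl
use AI::MXNet qw(mx);

my $lstm = mx->rnn->LSTMCell(num_hidden => 100, prefix => 'lstm_');

# One step: returns (output, [next_h, next_c]).
my ($out, $states) = &{$lstm}(mx->sym->Variable('t0_data'), $lstm->begin_state);

# Full sequence over 35 steps of an NTC-shaped input.
my ($outputs, $last) = $lstm->unroll(
    35, inputs => mx->sym->Variable('data'), layout => 'NTC', merge_outputs => 1
);
```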
-=head1 NAME
-
-    AI::MXNet::RNN::GRUCell
-=cut
-
-=head1 DESCRIPTION
-
-    Gated Recurrent Unit (GRU) network cell.
-    Note: this is an implementation of the cuDNN version of GRUs
-    (slight modification compared to Cho et al. 2014).
-
-    Parameters
-    ----------
-    num_hidden : int
-        number of units in output symbol
-    prefix : str, default 'gru_'
-        prefix for name of layers
-        (and name of weight if params is undef)
-    params : AI::MXNet::RNN::Params or undef
-        container for weight sharing between cells.
-        created if undef.
-=cut
-
-has '+_prefix' => (default => 'gru_');
-
-method _gate_names()
-{
-    [qw/_r _z _o/];
-}
-
-method call(AI::MXNet::Symbol $inputs, SymbolOrArrayOfSymbols $states)
-{
-    $self->_counter($self->_counter + 1);
-    my $name = sprintf('%st%d_', $self->_prefix, $self->_counter);
-    my $prev_state_h = @{ $states }[0];
-    my $i2h = AI::MXNet::Symbol->FullyConnected(
-        data => $inputs,
-        weight => $self->_iW,
-        bias => $self->_iB,
-        num_hidden => $self->_num_hidden*3,
-        name => "${name}i2h"
-    );
-    my $h2h = AI::MXNet::Symbol->FullyConnected(
-        data => $prev_state_h,
-        weight => $self->_hW,
-        bias => $self->_hB,
-        num_hidden => $self->_num_hidden*3,
-        name => "${name}h2h"
-    );
-    my ($i2h_r, $i2h_z);
-    ($i2h_r, $i2h_z, $i2h) = @{ AI::MXNet::Symbol->SliceChannel(
-        $i2h, num_outputs => 3, name => "${name}_i2h_slice"
-    ) };
-    my ($h2h_r, $h2h_z);
-    ($h2h_r, $h2h_z, $h2h) = @{ AI::MXNet::Symbol->SliceChannel(
-        $h2h, num_outputs => 3, name => "${name}_h2h_slice"
-    ) };
-    my $reset_gate = AI::MXNet::Symbol->Activation(
-        $i2h_r + $h2h_r, act_type => "sigmoid", name => "${name}_r_act"
-    );
-    my $update_gate = AI::MXNet::Symbol->Activation(
-        $i2h_z + $h2h_z, act_type => "sigmoid", name => "${name}_z_act"
-    );
-    my $next_h_tmp = AI::MXNet::Symbol->Activation(
-        $i2h + $reset_gate * $h2h, act_type => "tanh", name => "${name}_h_act"
-    );
-    my $next_h = AI::MXNet::Symbol->_plus(
-        (1 - $update_gate) * $next_h_tmp, $update_gate * $prev_state_h,
-        name => "${name}out"
-    );
-    return ($next_h, [$next_h]);
-}
-
-package AI::MXNet::RNN::FusedCell;
-use Mouse;
-use AI::MXNet::Types;
-use AI::MXNet::Base;
-extends 'AI::MXNet::RNN::Cell::Base';
-
-=head1 NAME
-
-    AI::MXNet::RNN::FusedCell
-=cut
-
-=head1 DESCRIPTION
-
-    Fuses RNN layers across time steps into one kernel.
-    Improves speed but is less flexible. Currently only
-    supported if using cuDNN on GPU.
-=cut - -has '_num_hidden' => (is => 'ro', isa => 'Int', init_arg => 'num_hidden', required => 1); -has '_num_layers' => (is => 'ro', isa => 'Int', init_arg => 'num_layers', default => 1); -has '_dropout' => (is => 'ro', isa => 'Num', init_arg => 'dropout', default => 0); -has '_get_next_state' => (is => 'ro', isa => 'Bool', init_arg => 'get_next_state', default => 0); -has '_bidirectional' => (is => 'ro', isa => 'Bool', init_arg => 'bidirectional', default => 0); -has 'forget_bias' => (is => 'ro', isa => 'Num', default => 1); -has 'initializer' => (is => 'rw', isa => 'Maybe[Initializer]'); -has '_mode' => ( - is => 'ro', - isa => enum([qw/rnn_relu rnn_tanh lstm gru/]), - init_arg => 'mode', - default => 'lstm' -); -has [qw/_parameter - _directions/] => (is => 'rw', init_arg => undef); - -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - return $class->$orig(num_hidden => $_[0]) if @_ == 1; - return $class->$orig(@_); -}; - -sub BUILD -{ - my $self = shift; - if(not $self->_prefix) - { - $self->_prefix($self->_mode.'_'); - } - if(not defined $self->initializer) - { - $self->initializer( - AI::MXNet::Xavier->new( - factor_type => 'in', - magnitude => 2.34 - ) - ); - } - if(not $self->initializer->isa('AI::MXNet::FusedRNN')) - { - $self->initializer( - AI::MXNet::FusedRNN->new( - init => $self->initializer, - num_hidden => $self->_num_hidden, - num_layers => $self->_num_layers, - mode => $self->_mode, - bidirectional => $self->_bidirectional, - forget_bias => $self->forget_bias - ) - ); - } - $self->_parameter($self->params->get('parameters', init => $self->initializer)); - $self->_directions($self->_bidirectional ? [qw/l r/] : ['l']); -} - - -method state_info() -{ - my $b = @{ $self->_directions }; - my $n = $self->_mode eq 'lstm' ? 
2 : 1; - return [map { +{ shape => [$b*$self->_num_layers, 0, $self->_num_hidden], __layout__ => 'LNC' } } 0..$n-1]; -} - -method _gate_names() -{ - return { - rnn_relu => [''], - rnn_tanh => [''], - lstm => [qw/_i _f _c _o/], - gru => [qw/_r _z _o/] - }->{ $self->_mode }; -} - -method _num_gates() -{ - return scalar(@{ $self->_gate_names }) -} - -method _slice_weights($arr, $li, $lh) -{ - my %args; - my @gate_names = @{ $self->_gate_names }; - my @directions = @{ $self->_directions }; - - my $b = @directions; - my $p = 0; - for my $layer (0..$self->_num_layers-1) - { - for my $direction (@directions) - { - for my $gate (@gate_names) - { - my $name = sprintf('%s%s%d_i2h%s_weight', $self->_prefix, $direction, $layer, $gate); - my $size; - if($layer > 0) - { - $size = $b*$lh*$lh; - $args{$name} = $arr->slice([$p,$p+$size-1])->reshape([$lh, $b*$lh]); - } - else - { - $size = $li*$lh; - $args{$name} = $arr->slice([$p,$p+$size-1])->reshape([$lh, $li]); - } - $p += $size; - } - for my $gate (@gate_names) - { - my $name = sprintf('%s%s%d_h2h%s_weight', $self->_prefix, $direction, $layer, $gate); - my $size = $lh**2; - $args{$name} = $arr->slice([$p,$p+$size-1])->reshape([$lh, $lh]); - $p += $size; - } - } - } - for my $layer (0..$self->_num_layers-1) - { - for my $direction (@directions) - { - for my $gate (@gate_names) - { - my $name = sprintf('%s%s%d_i2h%s_bias', $self->_prefix, $direction, $layer, $gate); - $args{$name} = $arr->slice([$p,$p+$lh-1]); - $p += $lh; - } - for my $gate (@gate_names) - { - my $name = sprintf('%s%s%d_h2h%s_bias', $self->_prefix, $direction, $layer, $gate); - $args{$name} = $arr->slice([$p,$p+$lh-1]); - $p += $lh; - } - } - } - assert($p == $arr->size, "Invalid parameters size for FusedRNNCell"); - return %args; -} - -method unpack_weights(HashRef[AI::MXNet::NDArray] $args) -{ - my %args = %{ $args }; - my $arr = delete $args{ $self->_parameter->name }; - my $b = @{ $self->_directions }; - my $m = $self->_num_gates; - my $h = $self->_num_hidden; - my $num_input = int(int(int($arr->size/$b)/$h)/$m) - ($self->_num_layers - 1)*($h+$b*$h+2) - $h - 2; - my %nargs = $self->_slice_weights($arr, $num_input, $self->_num_hidden); - %args = (%args, map { $_ => $nargs{$_}->copy } keys %nargs); - return \%args -} - -method pack_weights(HashRef[AI::MXNet::NDArray] $args) -{ - my %args = %{ $args }; - my $b = @{ $self->_directions }; - my $m = $self->_num_gates; - my @c = @{ $self->_gate_names }; - my $h = $self->_num_hidden; - my $w0 = $args{ sprintf('%sl0_i2h%s_weight', $self->_prefix, $c[0]) }; - my $num_input = $w0->shape->[1]; - my $total = ($num_input+$h+2)*$h*$m*$b + ($self->_num_layers-1)*$m*$h*($h+$b*$h+2)*$b; - my $arr = AI::MXNet::NDArray->zeros([$total], ctx => $w0->context, dtype => $w0->dtype); - my %nargs = $self->_slice_weights($arr, $num_input, $h); - while(my ($name, $nd) = each %nargs) - { - $nd .= delete $args{ $name }; - } - $args{ $self->_parameter->name } = $arr; - return \%args; -} - -method call(AI::MXNet::Symbol $inputs, SymbolOrArrayOfSymbols $states) -{ - confess("AI::MXNet::RNN::FusedCell cannot be stepped. 
Please use unroll"); -} - -method unroll( - Int $length, - Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$inputs=, - Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$begin_state=, - Str :$input_prefix='', - Str :$layout='NTC', - Maybe[Bool] :$merge_outputs= -) -{ - $self->reset; - my $axis = index($layout, 'T'); - $inputs //= AI::MXNet::Symbol->Variable("${input_prefix}data"); - if(blessed($inputs)) - { - assert( - (@{ $inputs->list_outputs() } == 1), - "unroll doesn't allow grouped symbol as input. Please " - ."convert to list first or let unroll handle slicing" - ); - if($axis == 1) - { - AI::MXNet::Logging->warning( - "NTC layout detected. Consider using " - ."TNC for RNN::FusedCell for faster speed" - ); - $inputs = AI::MXNet::Symbol->SwapAxis($inputs, dim1 => 0, dim2 => 1); - } - else - { - assert($axis == 0, "Unsupported layout $layout"); - } - } - else - { - assert(@$inputs == $length); - $inputs = [map { AI::MXNet::Symbol->expand_dims($_, axis => 0) } @{ $inputs }]; - $inputs = AI::MXNet::Symbol->Concat(@{ $inputs }, dim => 0); - } - $begin_state //= $self->begin_state; - my $states = $begin_state; - my @states = @{ $states }; - my %states; - if($self->_mode eq 'lstm') - { - %states = (state => $states[0], state_cell => $states[1]); - } - else - { - %states = (state => $states[0]); - } - my $rnn = AI::MXNet::Symbol->RNN( - data => $inputs, - parameters => $self->_parameter, - state_size => $self->_num_hidden, - num_layers => $self->_num_layers, - bidirectional => $self->_bidirectional, - p => $self->_dropout, - state_outputs => $self->_get_next_state, - mode => $self->_mode, - name => $self->_prefix.'rnn', - %states - ); - my $outputs; - my %attr = (__layout__ => 'LNC'); - if(not $self->_get_next_state) - { - ($outputs, $states) = ($rnn, []); - } - elsif($self->_mode eq 'lstm') - { - my @rnn = @{ $rnn }; - $rnn[1]->_set_attr(%attr); - $rnn[2]->_set_attr(%attr); - ($outputs, $states) = ($rnn[0], [$rnn[1], $rnn[2]]); - } - else - { - my @rnn = @{ $rnn }; - $rnn[1]->_set_attr(%attr); - ($outputs, $states) = ($rnn[0], [$rnn[1]]); - } - if(defined $merge_outputs and not $merge_outputs) - { - AI::MXNet::Logging->warning( - "Call RNN::FusedCell->unroll with merge_outputs=1 " - ."for faster speed" - ); - $outputs = [@ { - AI::MXNet::Symbol->SliceChannel( - $outputs, - axis => 0, - num_outputs => $length, - squeeze_axis => 1 - ) - }]; - } - elsif($axis == 1) - { - $outputs = AI::MXNet::Symbol->SwapAxis($outputs, dim1 => 0, dim2 => 1); - } - return ($outputs, $states); -} - -=head2 unfuse - - Unfuse the fused RNN - - Returns - ------- - $cell : AI::MXNet::RNN::SequentialCell - unfused cell that can be used for stepping, and can run on CPU. 
-=cut - -method unfuse() -{ - my $stack = AI::MXNet::RNN::SequentialCell->new; - my $get_cell = { - rnn_relu => sub { - AI::MXNet::RNN::Cell->new( - num_hidden => $self->_num_hidden, - activation => 'relu', - prefix => shift - ) - }, - rnn_tanh => sub { - AI::MXNet::RNN::Cell->new( - num_hidden => $self->_num_hidden, - activation => 'tanh', - prefix => shift - ) - }, - lstm => sub { - AI::MXNet::RNN::LSTMCell->new( - num_hidden => $self->_num_hidden, - prefix => shift - ) - }, - gru => sub { - AI::MXNet::RNN::GRUCell->new( - num_hidden => $self->_num_hidden, - prefix => shift - ) - }, - }->{ $self->_mode }; - for my $i (0..$self->_num_layers-1) - { - if($self->_bidirectional) - { - $stack->add( - AI::MXNet::RNN::BidirectionalCell->new( - $get_cell->(sprintf('%sl%d_', $self->_prefix, $i)), - $get_cell->(sprintf('%sr%d_', $self->_prefix, $i)), - output_prefix => sprintf('%sbi_%s_%d', $self->_prefix, $self->_mode, $i) - ) - ); - } - else - { - $stack->add($get_cell->(sprintf('%sl%d_', $self->_prefix, $i))); - } - } - return $stack; -} - -package AI::MXNet::RNN::SequentialCell; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::RNN::Cell::Base'; - -=head1 NAME - - AI:MXNet::RNN::SequentialCell -=cut - -=head1 DESCRIPTION - - Sequentially stacking multiple RNN cells - - Parameters - ---------- - params : AI::MXNet::RNN::Params or undef - container for weight sharing between cells. - created if undef. -=cut - -has [qw/_override_cell_params _cells/] => (is => 'rw', init_arg => undef); - -sub BUILD -{ - my ($self, $original_arguments) = @_; - $self->_override_cell_params(defined $original_arguments->{params}); - $self->_cells([]); -} - -=head2 add - - Append a cell to the stack. - - Parameters - ---------- - $cell : AI::MXNet::RNN::Cell::Base -=cut - -method add(AI::MXNet::RNN::Cell::Base $cell) -{ - push @{ $self->_cells }, $cell; - if($self->_override_cell_params) - { - assert( - $cell->_own_params, - "Either specify params for SequentialRNNCell " - ."or child cells, not both." - ); - %{ $cell->params->_params } = (%{ $cell->params->_params }, %{ $self->params->_params }); - } - %{ $self->params->_params } = (%{ $self->params->_params }, %{ $cell->params->_params }); -} - -method state_info() -{ - return $self->_cells_state_info($self->_cells); -} - -method begin_state(@kwargs) -{ - assert( - (not $self->_modified), - "After applying modifier cells (e.g. DropoutCell) the base " - ."cell cannot be called directly. Call the modifier cell instead." 
- ); - return $self->_cells_begin_state($self->_cells, @kwargs); -} - -method unpack_weights(HashRef[AI::MXNet::NDArray] $args) -{ - return $self->_cells_unpack_weights($self->_cells, $args) -} - -method pack_weights(HashRef[AI::MXNet::NDArray] $args) -{ - return $self->_cells_pack_weights($self->_cells, $args); -} - -method call($inputs, $states) -{ - $self->_counter($self->_counter + 1); - my @next_states; - my $p = 0; - for my $cell (@{ $self->_cells }) - { - assert(not $cell->isa('AI::MXNet::BidirectionalCell')); - my $n = scalar(@{ $cell->state_info }); - my $state = [@{ $states }[$p..$p+$n-1]]; - $p += $n; - ($inputs, $state) = $cell->($inputs, $state); - push @next_states, $state; - } - return ($inputs, [map { @$_} @next_states]); -} - -method unroll( - Int $length, - Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$inputs=, - Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$begin_state=, - Str :$input_prefix='', - Str :$layout='NTC', - Maybe[Bool] :$merge_outputs= -) -{ - my $num_cells = @{ $self->_cells }; - $begin_state //= $self->begin_state; - my $p = 0; - my $states; - my @next_states; - enumerate(sub { - my ($i, $cell) = @_; - my $n = @{ $cell->state_info }; - $states = [@{$begin_state}[$p..$p+$n-1]]; - $p += $n; - ($inputs, $states) = $cell->unroll( - $length, - inputs => $inputs, - input_prefix => $input_prefix, - begin_state => $states, - layout => $layout, - merge_outputs => ($i < $num_cells-1) ? undef : $merge_outputs - ); - push @next_states, $states; - }, $self->_cells); - return ($inputs, [map { @{ $_ } } @next_states]); -} - -package AI::MXNet::RNN::BidirectionalCell; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::RNN::Cell::Base'; - -=head1 NAME - - AI::MXNet::RNN::BidirectionalCell -=cut - -=head1 DESCRIPTION - - Bidirectional RNN cell - - Parameters - ---------- - l_cell : AI::MXNet::RNN::Cell::Base - cell for forward unrolling - r_cell : AI::MXNet::RNN::Cell::Base - cell for backward unrolling - output_prefix : str, default 'bi_' - prefix for name of output -=cut - -has 'l_cell' => (is => 'ro', isa => 'AI::MXNet::RNN::Cell::Base', required => 1); -has 'r_cell' => (is => 'ro', isa => 'AI::MXNet::RNN::Cell::Base', required => 1); -has '_output_prefix' => (is => 'ro', init_arg => 'output_prefix', isa => 'Str', default => 'bi_'); -has [qw/_override_cell_params _cells/] => (is => 'rw', init_arg => undef); - -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - if(@_ >= 2 and blessed $_[0] and blessed $_[1]) - { - my $l_cell = shift(@_); - my $r_cell = shift(@_); - return $class->$orig( - l_cell => $l_cell, - r_cell => $r_cell, - @_ - ); - } - return $class->$orig(@_); -}; - -sub BUILD -{ - my ($self, $original_arguments) = @_; - $self->_override_cell_params(defined $original_arguments->{params}); - if($self->_override_cell_params) - { - assert( - ($self->l_cell->_own_params and $self->r_cell->_own_params), - "Either specify params for BidirectionalCell ". - "or child cells, not both." 
- ); - %{ $self->l_cell->params->_params } = (%{ $self->l_cell->params->_params }, %{ $self->params->_params }); - %{ $self->r_cell->params->_params } = (%{ $self->r_cell->params->_params }, %{ $self->params->_params }); - } - %{ $self->params->_params } = (%{ $self->params->_params }, %{ $self->l_cell->params->_params }); - %{ $self->params->_params } = (%{ $self->params->_params }, %{ $self->r_cell->params->_params }); - $self->_cells([$self->l_cell, $self->r_cell]); -} - -method unpack_weights(HashRef[AI::MXNet::NDArray] $args) -{ - return $self->_cells_unpack_weights($self->_cells, $args) -} - -method pack_weights(HashRef[AI::MXNet::NDArray] $args) -{ - return $self->_cells_pack_weights($self->_cells, $args); -} - -method call($inputs, $states) -{ - confess("Bidirectional cannot be stepped. Please use unroll"); -} - -method state_info() -{ - return $self->_cells_state_info($self->_cells); -} - -method begin_state(@kwargs) -{ - assert((not $self->_modified), - "After applying modifier cells (e.g. DropoutCell) the base " - ."cell cannot be called directly. Call the modifier cell instead." - ); - return $self->_cells_begin_state($self->_cells, @kwargs); -} - -method unroll( - Int $length, - Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$inputs=, - Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$begin_state=, - Str :$input_prefix='', - Str :$layout='NTC', - Maybe[Bool] :$merge_outputs= -) -{ - - my $axis = index($layout, 'T'); - if(not defined $inputs) - { - $inputs = [ - map { AI::MXNet::Symbol->Variable("${input_prefix}t${_}_data") } (0..$length-1) - ]; - } - elsif(blessed($inputs)) - { - assert( - (@{ $inputs->list_outputs() } == 1), - "unroll doesn't allow grouped symbol as input. Please " - ."convert to list first or let unroll handle slicing" - ); - $inputs = [ @{ AI::MXNet::Symbol->SliceChannel( - $inputs, - axis => $axis, - num_outputs => $length, - squeeze_axis => 1 - ) }]; - } - else - { - assert(@$inputs == $length); - } - $begin_state //= $self->begin_state; - my $states = $begin_state; - my ($l_cell, $r_cell) = @{ $self->_cells }; - my ($l_outputs, $l_states) = $l_cell->unroll( - $length, inputs => $inputs, - begin_state => [@{$states}[0..@{$l_cell->state_info}-1]], - layout => $layout, - merge_outputs => $merge_outputs - ); - my ($r_outputs, $r_states) = $r_cell->unroll( - $length, inputs => [reverse @{$inputs}], - begin_state => [@{$states}[@{$l_cell->state_info}..@{$states}-1]], - layout => $layout, - merge_outputs => $merge_outputs - ); - if(not defined $merge_outputs) - { - $merge_outputs = ( - blessed $l_outputs and $l_outputs->isa('AI::MXNet::Symbol') - and - blessed $r_outputs and $r_outputs->isa('AI::MXNet::Symbol') - ); - if(not $merge_outputs) - { - if(blessed $l_outputs and $l_outputs->isa('AI::MXNet::Symbol')) - { - $l_outputs = [ - @{ AI::MXNet::Symbol->SliceChannel( - $l_outputs, axis => $axis, - num_outputs => $length, - squeeze_axis => 1 - ) } - ]; - } - if(blessed $r_outputs and $r_outputs->isa('AI::MXNet::Symbol')) - { - $r_outputs = [ - @{ AI::MXNet::Symbol->SliceChannel( - $r_outputs, axis => $axis, - num_outputs => $length, - squeeze_axis => 1 - ) } - ]; - } - } - } - if($merge_outputs) - { - $l_outputs = [@{ $l_outputs }]; - $r_outputs = [@{ AI::MXNet::Symbol->reverse(blessed $r_outputs ? 
$r_outputs : @{ $r_outputs }, axis=>$axis) }]; - } - else - { - $r_outputs = [reverse(@{ $r_outputs })]; - } - my $outputs = []; - for(zip([0..@{ $l_outputs }-1], [@{ $l_outputs }], [@{ $r_outputs }])) { - my ($i, $l_o, $r_o) = @$_; - push @$outputs, AI::MXNet::Symbol->Concat( - $l_o, $r_o, dim=>(1+($merge_outputs?1:0)), - name => $merge_outputs - ? sprintf('%sout', $self->_output_prefix) - : sprintf('%st%d', $self->_output_prefix, $i) - ); - } - if($merge_outputs) - { - $outputs = @{ $outputs }[0]; - } - $states = [$l_states, $r_states]; - return($outputs, $states); -} - -package AI::MXNet::RNN::ConvCell::Base; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::RNN::Cell::Base'; - -=head1 NAME - - AI::MXNet::RNN::Conv::Base -=cut - -=head1 DESCRIPTION - - Abstract base class for Convolutional RNN cells - -=cut - -has '_h2h_kernel' => (is => 'ro', isa => 'Shape', init_arg => 'h2h_kernel'); -has '_h2h_dilate' => (is => 'ro', isa => 'Shape', init_arg => 'h2h_dilate'); -has '_h2h_pad' => (is => 'rw', isa => 'Shape', init_arg => undef); -has '_i2h_kernel' => (is => 'ro', isa => 'Shape', init_arg => 'i2h_kernel'); -has '_i2h_stride' => (is => 'ro', isa => 'Shape', init_arg => 'i2h_stride'); -has '_i2h_dilate' => (is => 'ro', isa => 'Shape', init_arg => 'i2h_dilate'); -has '_i2h_pad' => (is => 'ro', isa => 'Shape', init_arg => 'i2h_pad'); -has '_num_hidden' => (is => 'ro', isa => 'DimSize', init_arg => 'num_hidden'); -has '_input_shape' => (is => 'ro', isa => 'Shape', init_arg => 'input_shape'); -has '_conv_layout' => (is => 'ro', isa => 'Str', init_arg => 'conv_layout', default => 'NCHW'); -has '_activation' => (is => 'ro', init_arg => 'activation'); -has '_state_shape' => (is => 'rw', init_arg => undef); -has [qw/i2h_weight_initializer h2h_weight_initializer - i2h_bias_initializer h2h_bias_initializer/] => (is => 'rw', isa => 'Maybe[Initializer]'); - -sub BUILD -{ - my $self = shift; - assert ( - ($self->_h2h_kernel->[0] % 2 == 1 and $self->_h2h_kernel->[1] % 2 == 1), - "Only support odd numbers, got h2h_kernel= (@{[ $self->_h2h_kernel ]})" - ); - $self->_h2h_pad([ - int($self->_h2h_dilate->[0] * ($self->_h2h_kernel->[0] - 1) / 2), - int($self->_h2h_dilate->[1] * ($self->_h2h_kernel->[1] - 1) / 2) - ]); - # Infer state shape - my $data = AI::MXNet::Symbol->Variable('data'); - my $state_shape = AI::MXNet::Symbol->Convolution( - data => $data, - num_filter => $self->_num_hidden, - kernel => $self->_i2h_kernel, - stride => $self->_i2h_stride, - pad => $self->_i2h_pad, - dilate => $self->_i2h_dilate, - layout => $self->_conv_layout - ); - $state_shape = ($state_shape->infer_shape(data=>$self->_input_shape))[1]->[0]; - $state_shape->[0] = 0; - $self->_state_shape($state_shape); -} - -method state_info() -{ - return [ - { shape => $self->_state_shape, __layout__ => $self->_conv_layout }, - { shape => $self->_state_shape, __layout__ => $self->_conv_layout } - ]; -} - -method call($inputs, $states) -{ - confess("AI::MXNet::RNN::ConvCell::Base is abstract class for convolutional RNN"); -} - -package AI::MXNet::RNN::ConvCell; -use Mouse; -extends 'AI::MXNet::RNN::ConvCell::Base'; - -=head1 NAME - - AI::MXNet::RNN::ConvCell -=cut - -=head1 DESCRIPTION - - Convolutional RNN cells - - Parameters - ---------- - input_shape : array ref of int - Shape of input in single timestep. - num_hidden : int - Number of units in output symbol. - h2h_kernel : array ref of int, default (3, 3) - Kernel of Convolution operator in state-to-state transitions. 
- h2h_dilate : array ref of int, default (1, 1)
-     Dilation of Convolution operator in state-to-state transitions.
- i2h_kernel : array ref of int, default (3, 3)
-     Kernel of Convolution operator in input-to-state transitions.
- i2h_stride : array ref of int, default (1, 1)
-     Stride of Convolution operator in input-to-state transitions.
- i2h_pad : array ref of int, default (1, 1)
-     Pad of Convolution operator in input-to-state transitions.
- i2h_dilate : array ref of int, default (1, 1)
-     Dilation of Convolution operator in input-to-state transitions.
- activation : str or Symbol,
-     default sub { AI::MXNet::Symbol->LeakyReLU(@_, act_type => 'leaky', slope => 0.2) }
-     Type of activation function.
- prefix : str, default 'ConvRNN_'
-     Prefix for name of layers (and name of weight if params is undef).
- params : RNNParams, default undef
-     Container for weight sharing between cells. Created if undef.
- conv_layout : str, default 'NCHW'
-     Layout of the Convolution operator.
-=cut
-
-has '+_h2h_kernel' => (default => sub { [3, 3] });
-has '+_h2h_dilate' => (default => sub { [1, 1] });
-has '+_i2h_kernel' => (default => sub { [3, 3] });
-has '+_i2h_stride' => (default => sub { [1, 1] });
-has '+_i2h_dilate' => (default => sub { [1, 1] });
-has '+_i2h_pad' => (default => sub { [1, 1] });
-has '+_prefix' => (default => 'ConvRNN_');
-has '+_activation' => (default => sub { sub { AI::MXNet::Symbol->LeakyReLU(@_, act_type => 'leaky', slope => 0.2) } });
-has '+i2h_bias_initializer' => (default => 'zeros');
-has '+h2h_bias_initializer' => (default => 'zeros');
-has 'forget_bias' => (is => 'ro', isa => 'Num');
-has [qw/_iW _iB
-        _hW _hB/] => (is => 'rw', init_arg => undef);
-
-
-sub BUILD
-{
-    my $self = shift;
-    $self->_iW($self->_params->get('i2h_weight', init => $self->i2h_weight_initializer));
-    $self->_hW($self->_params->get('h2h_weight', init => $self->h2h_weight_initializer));
-    $self->_iB(
-        $self->params->get(
-            'i2h_bias',
-            (defined($self->forget_bias) and not defined $self->i2h_bias_initializer
-                ?
(init => AI::MXNet::LSTMBias->new(forget_bias => $self->forget_bias)) - : (init => $self->i2h_bias_initializer) - ) - ) - ); - $self->_hB($self->_params->get('h2h_bias', init => $self->h2h_bias_initializer)); -} - -method _num_gates() -{ - scalar(@{ $self->_gate_names() }); -} - -method _gate_names() -{ - return [''] -} - -method _conv_forward($inputs, $states, $name) -{ - my $i2h = AI::MXNet::Symbol->Convolution( - name => "${name}i2h", - data => $inputs, - num_filter => $self->_num_hidden*$self->_num_gates(), - kernel => $self->_i2h_kernel, - stride => $self->_i2h_stride, - pad => $self->_i2h_pad, - dilate => $self->_i2h_dilate, - weight => $self->_iW, - bias => $self->_iB - ); - my $h2h = AI::MXNet::Symbol->Convolution( - name => "${name}h2h", - data => @{ $states }[0], - num_filter => $self->_num_hidden*$self->_num_gates(), - kernel => $self->_h2h_kernel, - stride => [1, 1], - pad => $self->_h2h_pad, - dilate => $self->_h2h_dilate, - weight => $self->_hW, - bias => $self->_hB - ); - return ($i2h, $h2h); -} - -method call(AI::MXNet::Symbol $inputs, AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol] $states) -{ - $self->_counter($self->_counter + 1); - my $name = sprintf('%st%d_', $self->_prefix, $self->_counter); - my ($i2h, $h2h) = $self->_conv_forward($inputs, $states, $name); - my $output = $self->_get_activation($i2h + $h2h, $self->_activation, name => "${name}out"); - return ($output, [$output]); -} - -package AI::MXNet::RNN::ConvLSTMCell; -use Mouse; -extends 'AI::MXNet::RNN::ConvCell'; -has '+forget_bias' => (default => 1); -has '+_prefix' => (default => 'ConvLSTM_'); - -=head1 NAME - - AI::MXNet::RNN::ConvLSTMCell -=cut - -=head1 DESCRIPTION - - Convolutional LSTM network cell. - - Reference: - Xingjian et al. NIPS2015 -=cut - -method _gate_names() -{ - return ['_i', '_f', '_c', '_o']; -} - -method call(AI::MXNet::Symbol $inputs, AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol] $states) -{ - $self->_counter($self->_counter + 1); - my $name = sprintf('%st%d_', $self->_prefix, $self->_counter); - my ($i2h, $h2h) = $self->_conv_forward($inputs, $states, $name); - my $gates = $i2h + $h2h; - my @slice_gates = @{ AI::MXNet::Symbol->SliceChannel( - $gates, - num_outputs => 4, - axis => index($self->_conv_layout, 'C'), - name => "${name}slice" - ) }; - my $in_gate = AI::MXNet::Symbol->Activation( - $slice_gates[0], - act_type => "sigmoid", - name => "${name}i" - ); - my $forget_gate = AI::MXNet::Symbol->Activation( - $slice_gates[1], - act_type => "sigmoid", - name => "${name}f" - ); - my $in_transform = $self->_get_activation( - $slice_gates[2], - $self->_activation, - name => "${name}c" - ); - my $out_gate = AI::MXNet::Symbol->Activation( - $slice_gates[3], - act_type => "sigmoid", - name => "${name}o" - ); - my $next_c = AI::MXNet::Symbol->_plus( - $forget_gate * @{$states}[1], - $in_gate * $in_transform, - name => "${name}state" - ); - my $next_h = AI::MXNet::Symbol->_mul( - $out_gate, $self->_get_activation($next_c, $self->_activation), - name => "${name}out" - ); - return ($next_h, [$next_h, $next_c]); -} - -package AI::MXNet::RNN::ConvGRUCell; -use Mouse; -extends 'AI::MXNet::RNN::ConvCell'; -has '+_prefix' => (default => 'ConvGRU_'); - -=head1 NAME - - AI::MXNet::RNN::ConvGRUCell -=cut - -=head1 DESCRIPTION - - Convolutional GRU network cell. 
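For reviewers tracking what this file removal takes away: the convolutional cells deleted above were all driven through the same generic cell API (construct, begin_state, then call; the cells are callable, as the modifier cells below rely on with $self->base_cell->(...)). A minimal sketch, assuming the usual AI::MXNet constructor conventions; every shape, name, and prefix here is illustrative only:

    use AI::MXNet qw(mx);

    # Hypothetical ConvLSTM cell over 16x16 frames with 3 input channels
    # (conv_layout 'NCHW') and 8 output feature maps.
    my $cell = AI::MXNet::RNN::ConvLSTMCell->new(
        input_shape => [4, 3, 16, 16],   # per-timestep shape: batch, C, H, W
        num_hidden  => 8,
        prefix      => 'convlstm_',      # assumes the cell-base 'prefix' init_arg
    );
    my $frame = mx->sym->Variable('frame');
    # BUILD infers the state shape from input_shape, so begin_state just works
    my ($out, $states) = $cell->($frame, $cell->begin_state());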
-=cut
-
-method _gate_names()
-{
-    return ['_r', '_z', '_o'];
-}
-
-method call(AI::MXNet::Symbol $inputs, AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol] $states)
-{
-    $self->_counter($self->_counter + 1);
-    my $name = sprintf('%st%d_', $self->_prefix, $self->_counter);
-    my ($i2h, $h2h) = $self->_conv_forward($inputs, $states, $name);
-    my ($i2h_r, $i2h_z, $h2h_r, $h2h_z);
-    ($i2h_r, $i2h_z, $i2h) = @{ AI::MXNet::Symbol->SliceChannel($i2h, num_outputs => 3, name => "${name}_i2h_slice") };
-    ($h2h_r, $h2h_z, $h2h) = @{ AI::MXNet::Symbol->SliceChannel($h2h, num_outputs => 3, name => "${name}_h2h_slice") };
-    my $reset_gate = AI::MXNet::Symbol->Activation(
-        $i2h_r + $h2h_r, act_type => "sigmoid",
-        name => "${name}_r_act"
-    );
-    my $update_gate = AI::MXNet::Symbol->Activation(
-        $i2h_z + $h2h_z, act_type => "sigmoid",
-        name => "${name}_z_act"
-    );
-    my $next_h_tmp = $self->_get_activation($i2h + $reset_gate * $h2h, $self->_activation, name => "${name}_h_act");
-    my $next_h = AI::MXNet::Symbol->_plus(
-        (1 - $update_gate) * $next_h_tmp, $update_gate * @{$states}[0],
-        name => "${name}out"
-    );
-    return ($next_h, [$next_h]);
-}
-
-package AI::MXNet::RNN::ModifierCell;
-use Mouse;
-use AI::MXNet::Base;
-extends 'AI::MXNet::RNN::Cell::Base';
-
-=head1 NAME
-
-    AI::MXNet::RNN::ModifierCell
-=cut
-
-=head1 DESCRIPTION
-
-    Base class for modifier cells. A modifier
-    cell takes a base cell, applies modifications
-    to it (e.g. Dropout), and returns a new cell.
-
-    After applying modifiers the base cell should
-    no longer be called directly. The modifier cell
-    should be used instead.
-=cut
-
-has 'base_cell' => (is => 'ro', isa => 'AI::MXNet::RNN::Cell::Base', required => 1);
-
-around BUILDARGS => sub {
-    my $orig = shift;
-    my $class = shift;
-    if(@_%2)
-    {
-        my $base_cell = shift;
-        return $class->$orig(base_cell => $base_cell, @_);
-    }
-    return $class->$orig(@_);
-};
-
-sub BUILD
-{
-    my $self = shift;
-    $self->base_cell->_modified(1);
-}
-
-method params()
-{
-    $self->_own_params(0);
-    return $self->base_cell->params;
-}
-
-method state_info()
-{
-    return $self->base_cell->state_info;
-}
-
-method begin_state(CodeRef :$init_sym=AI::MXNet::Symbol->can('zeros'), @kwargs)
-{
-    assert(
-        (not $self->_modified),
-        "After applying modifier cells (e.g. DropoutCell) the base "
-        ."cell cannot be called directly. Call the modifier cell instead."
- ); - $self->base_cell->_modified(0); - my $begin_state = $self->base_cell->begin_state(func => $init_sym, @kwargs); - $self->base_cell->_modified(1); - return $begin_state; -} - -method unpack_weights(HashRef[AI::MXNet::NDArray] $args) -{ - return $self->base_cell->unpack_weights($args) -} - -method pack_weights(HashRef[AI::MXNet::NDArray] $args) -{ - return $self->base_cell->pack_weights($args) -} - -method call(AI::MXNet::Symbol $inputs, SymbolOrArrayOfSymbols $states) -{ - confess("Not Implemented"); -} - -package AI::MXNet::RNN::DropoutCell; -use Mouse; -extends 'AI::MXNet::RNN::ModifierCell'; -has [qw/dropout_outputs dropout_states/] => (is => 'ro', isa => 'Num', default => 0); - -=head1 NAME - - AI::MXNet::RNN::DropoutCell -=cut - -=head1 DESCRIPTION - - Apply the dropout on base cell -=cut - -method call(AI::MXNet::Symbol $inputs, SymbolOrArrayOfSymbols $states) -{ - my ($output, $states) = $self->base_cell->($inputs, $states); - if($self->dropout_outputs > 0) - { - $output = AI::MXNet::Symbol->Dropout(data => $output, p => $self->dropout_outputs); - } - if($self->dropout_states > 0) - { - $states = [map { AI::MXNet::Symbol->Dropout(data => $_, p => $self->dropout_states) } @{ $states }]; - } - return ($output, $states); -} - -package AI::MXNet::RNN::ZoneoutCell; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::RNN::ModifierCell'; -has [qw/zoneout_outputs zoneout_states/] => (is => 'ro', isa => 'Num', default => 0); -has 'prev_output' => (is => 'rw', init_arg => undef); - -=head1 NAME - - AI::MXNet::RNN::ZoneoutCell -=cut - -=head1 DESCRIPTION - - Apply Zoneout on base cell. -=cut - -sub BUILD -{ - my $self = shift; - assert( - (not $self->base_cell->isa('AI::MXNet::RNN::FusedCell')), - "FusedRNNCell doesn't support zoneout. ". - "Please unfuse first." - ); - assert( - (not $self->base_cell->isa('AI::MXNet::RNN::BidirectionalCell')), - "BidirectionalCell doesn't support zoneout since it doesn't support step. ". - "Please add ZoneoutCell to the cells underneath instead." - ); - assert( - (not $self->base_cell->isa('AI::MXNet::RNN::SequentialCell') or not $self->_bidirectional), - "Bidirectional SequentialCell doesn't support zoneout. ". - "Please add ZoneoutCell to the cells underneath instead." - ); -} - -method reset() -{ - $self->SUPER::reset; - $self->prev_output(undef); -} - -method call(AI::MXNet::Symbol $inputs, SymbolOrArrayOfSymbols $states) -{ - my ($cell, $p_outputs, $p_states) = ($self->base_cell, $self->zoneout_outputs, $self->zoneout_states); - my ($next_output, $next_states) = $cell->($inputs, $states); - my $mask = sub { - my ($p, $like) = @_; - AI::MXNet::Symbol->Dropout( - AI::MXNet::Symbol->ones_like( - $like - ), - p => $p - ); - }; - my $prev_output = $self->prev_output // AI::MXNet::Symbol->zeros(shape => [0, 0]); - my $output = $p_outputs != 0 - ? AI::MXNet::Symbol->where( - $mask->($p_outputs, $next_output), - $next_output, - $prev_output - ) - : $next_output; - my @states; - if($p_states != 0) - { - for(zip($next_states, $states)) { - my ($new_s, $old_s) = @$_; - push @states, AI::MXNet::Symbol->where( - $mask->($p_states, $new_s), - $new_s, - $old_s - ); - } - } - $self->prev_output($output); - return ($output, @states ? \@states : $next_states); -} - -package AI::MXNet::RNN::ResidualCell; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::RNN::ModifierCell'; - -=head1 NAME - - AI::MXNet::RNN::ResidualCell -=cut - -=head1 DESCRIPTION - - Adds residual connection as described in Wu et al, 2016 - (https://arxiv.org/abs/1609.08144). 
- Output of the cell is output of the base cell plus input. -=cut - -method call(AI::MXNet::Symbol $inputs, SymbolOrArrayOfSymbols $states) -{ - my $output; - ($output, $states) = $self->base_cell->($inputs, $states); - $output = AI::MXNet::Symbol->elemwise_add($output, $inputs, name => $output->name.'_plus_residual'); - return ($output, $states) -} - -method unroll( - Int $length, - Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$inputs=, - Maybe[AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]] :$begin_state=, - Str :$input_prefix='', - Str :$layout='NTC', - Maybe[Bool] :$merge_outputs= -) -{ - $self->reset; - $self->base_cell->_modified(0); - my ($outputs, $states) = $self->base_cell->unroll($length, inputs=>$inputs, begin_state=>$begin_state, - layout=>$layout, merge_outputs=>$merge_outputs); - $self->base_cell->_modified(1); - $merge_outputs //= (blessed($outputs) and $outputs->isa('AI::MXNet::Symbol')); - ($inputs) = _normalize_sequence($length, $inputs, $layout, $merge_outputs); - if($merge_outputs) - { - $outputs = AI::MXNet::Symbol->elemwise_add($outputs, $inputs, name => $outputs->name . "_plus_residual"); - } - else - { - my @temp; - for(zip([@{ $outputs }], [@{ $inputs }])) { - my ($output_sym, $input_sym) = @$_; - push @temp, AI::MXNet::Symbol->elemwise_add($output_sym, $input_sym, - name=>$output_sym->name."_plus_residual"); - } - $outputs = \@temp; - } - return ($outputs, $states); -} - -func _normalize_sequence($length, $inputs, $layout, $merge, $in_layout=) -{ - assert((defined $inputs), - "unroll(inputs=>undef) has been deprecated. ". - "Please create input variables outside unroll." - ); - - my $axis = index($layout, 'T'); - my $in_axis = defined $in_layout ? index($in_layout, 'T') : $axis; - if(blessed($inputs)) - { - if(not $merge) - { - assert( - (@{ $inputs->list_outputs() } == 1), - "unroll doesn't allow grouped symbol as input. Please " - ."convert to list first or let unroll handle splitting" - ); - $inputs = [ @{ AI::MXNet::Symbol->split( - $inputs, - axis => $in_axis, - num_outputs => $length, - squeeze_axis => 1 - ) }]; - } - } - else - { - assert(not defined $length or @$inputs == $length); - if($merge) - { - $inputs = [map { AI::MXNet::Symbol->expand_dims($_, axis=>$axis) } @{ $inputs }]; - $inputs = AI::MXNet::Symbol->Concat(@{ $inputs }, dim=>$axis); - $in_axis = $axis; - } - } - - if(blessed($inputs) and $axis != $in_axis) - { - $inputs = AI::MXNet::Symbol->swapaxes($inputs, dim0=>$axis, dim1=>$in_axis); - } - return ($inputs, $axis); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/RNN/IO.pm b/perl-package/AI-MXNet/lib/AI/MXNet/RNN/IO.pm deleted file mode 100644 index be3bdbd373cb..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/RNN/IO.pm +++ /dev/null @@ -1,309 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-package AI::MXNet::RNN::IO;
-use strict;
-use warnings;
-use AI::MXNet::Base;
-use AI::MXNet::Function::Parameters;
-
-=encoding UTF-8
-
-=head1 NAME
-
-    AI::MXNet::RNN::IO - Functions for constructing recurrent neural networks.
-=cut
-
-=head1 DESCRIPTION
-
-    Functions for constructing recurrent neural networks.
-=cut
-
-=head2 encode_sentences
-
-    Encode sentences and (optionally) build a mapping
-    from string tokens to integer indices. Unknown keys
-    will be added to the vocabulary.
-
-    Parameters
-    ----------
-    $sentences : array ref of array refs of str
-        An array ref of sentences to encode. Each sentence
-        should be an array ref of string tokens.
-    :$vocab : undef or hash ref of str -> int
-        Optional input vocabulary.
-    :$invalid_label : int, default -1
-        Index for invalid token, like <end-of-sentence>.
-    :$invalid_key : str, default '\n'
-        Key for invalid token. Uses '\n' for end
-        of sentence by default.
-    :$start_label=0 : int
-        lowest index.
-
-    Returns
-    -------
-    $result : array ref of array refs of int
-        encoded sentences
-    $vocab : hash ref of str -> int
-        result vocabulary
-=cut
-
-
-method encode_sentences(
-    ArrayRef[ArrayRef] $sentences,
-    Maybe[HashRef] :$vocab=,
-    Int :$invalid_label=-1,
-    Str :$invalid_key="\n",
-    Int :$start_label=0
-)
-{
-    my $idx = $start_label;
-    my $new_vocab;
-    if(not defined $vocab)
-    {
-        $vocab = { $invalid_key => $invalid_label };
-        $new_vocab = 1;
-    }
-    else
-    {
-        $new_vocab = 0;
-    }
-    my @res;
-    for my $sent (@{ $sentences })
-    {
-        my @coded;
-        for my $word (@{ $sent })
-        {
-            if(not exists $vocab->{ $word })
-            {
-                assert($new_vocab, "Unknown token: $word");
-                if($idx == $invalid_label)
-                {
-                    $idx += 1;
-                }
-                $vocab->{$word} = $idx;
-                $idx += 1;
-            }
-            push @coded, $vocab->{ $word };
-        }
-        push @res, \@coded;
-    }
-    return (\@res, $vocab);
-}
-
-package AI::MXNet::BucketSentenceIter;
-
-=encoding UTF-8
-
-=head1 NAME
-
-    AI::MXNet::BucketSentenceIter
-=cut
-
-=head1 DESCRIPTION
-
-    Simple bucketing iterator for language models.
-    The label for each step is constructed from the
-    data of the next step.
-=cut
-
-=head2 new
-
-    Parameters
-    ----------
-    sentences : array ref of array refs of int
-        encoded sentences
-    batch_size : int
-        batch_size of data
-    invalid_label : int, default -1
-        key for invalid label, e.g. <end-of-sentence>
-    dtype : str, default 'float32'
-        data type
-    buckets : array ref of int
-        size of data buckets. Automatically generated if undef.
-    data_name : str, default 'data'
-        name of data
-    label_name : str, default 'softmax_label'
-        name of label
-    layout : str
-        format of data and label. 'NT' means (batch_size, length)
-        and 'TN' means (length, batch_size).
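The two pieces above are meant to be used together: encode_sentences turns a tokenized corpus into integer ids, and the iterator implemented just below batches the encoded sentences by length. A hedged sketch with a toy corpus (tokens, sizes, and labels are illustrative; two sentences per length so each auto-generated bucket can fill a batch of two):

    use AI::MXNet qw(mx);

    my $corpus = [
        [qw(the cat sat)],      [qw(a dog ran)],
        [qw(the cat sat down)], [qw(a dog ran off)],
    ];
    my ($encoded, $vocab) = AI::MXNet::RNN::IO->encode_sentences(
        $corpus, invalid_label => 0, start_label => 1
    );
    my $iter = AI::MXNet::BucketSentenceIter->new(
        sentences     => $encoded,
        batch_size    => 2,
        invalid_label => 0,      # also used as the padding value
    );
    while(defined(my $batch = $iter->next))
    {
        # $batch->bucket_key is the padded sentence length of this batch
    }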
-=cut - -use Mouse; -use AI::MXNet::Base; -use List::Util qw(shuffle max); -extends 'AI::MXNet::DataIter'; -has 'sentences' => (is => 'ro', isa => 'ArrayRef[ArrayRef]', required => 1); -has '+batch_size' => (is => 'ro', isa => 'Int', required => 1); -has 'invalid_label' => (is => 'ro', isa => 'Int', default => -1); -has 'data_name' => (is => 'ro', isa => 'Str', default => 'data'); -has 'label_name' => (is => 'ro', isa => 'Str', default => 'softmax_label'); -has 'dtype' => (is => 'ro', isa => 'Dtype', default => 'float32'); -has 'layout' => (is => 'ro', isa => 'Str', default => 'NT'); -has 'buckets' => (is => 'rw', isa => 'Maybe[ArrayRef[Int]]'); -has [qw/data nddata ndlabel - major_axis default_bucket_key - provide_data provide_label - idx curr_idx - /] => (is => 'rw', init_arg => undef); - -sub BUILD -{ - my $self = shift; - if(not defined $self->buckets) - { - my @buckets; - my $p = pdl([map { scalar(@$_) } @{ $self->sentences }]); - enumerate(sub { - my ($i, $j) = @_; - if($j >= $self->batch_size) - { - push @buckets, $i; - } - }, $p->histogram(1,0,$p->max+1)->unpdl); - $self->buckets(\@buckets); - } - @{ $self->buckets } = sort { $a <=> $b } @{ $self->buckets }; - my $ndiscard = 0; - $self->data([map { [] } 0..@{ $self->buckets }-1]); - for my $i (0..@{$self->sentences}-1) - { - my $buck = bisect_left($self->buckets, scalar(@{ $self->sentences->[$i] })); - if($buck == @{ $self->buckets }) - { - $ndiscard += 1; - next; - } - my $buff = AI::MXNet::NDArray->full( - [$self->buckets->[$buck]], - $self->invalid_label, - dtype => $self->dtype - )->aspdl; - $buff->slice([0, @{ $self->sentences->[$i] }-1]) .= pdl($self->sentences->[$i]); - push @{ $self->data->[$buck] }, $buff; - } - $self->data([map { pdl(PDL::Type->new(DTYPE_MX_TO_PDL->{$self->dtype}), $_) } @{$self->data}]); - AI::MXNet::Logging->warning("discarded $ndiscard sentences longer than the largest bucket.") - if $ndiscard; - $self->nddata([]); - $self->ndlabel([]); - $self->major_axis(index($self->layout, 'N')); - $self->default_bucket_key(max(@{ $self->buckets })); - my $shape; - if($self->major_axis == 0) - { - $shape = [$self->batch_size, $self->default_bucket_key]; - } - elsif($self->major_axis == 1) - { - $shape = [$self->default_bucket_key, $self->batch_size]; - } - else - { - confess("Invalid layout ${\ $self->layout }: Must by NT (batch major) or TN (time major)"); - } - $self->provide_data([ - AI::MXNet::DataDesc->new( - name => $self->data_name, - shape => $shape, - dtype => $self->dtype, - layout => $self->layout - ) - ]); - $self->provide_label([ - AI::MXNet::DataDesc->new( - name => $self->label_name, - shape => $shape, - dtype => $self->dtype, - layout => $self->layout - ) - ]); - $self->idx([]); - enumerate(sub { - my ($i, $buck) = @_; - my $buck_len = $buck->shape->at(-1); - for my $j (0..($buck_len - $self->batch_size)) - { - if(not $j%$self->batch_size) - { - push @{ $self->idx }, [$i, $j]; - } - } - }, $self->data); - $self->curr_idx(0); - $self->reset; -} - -method reset() -{ - $self->curr_idx(0); - @{ $self->idx } = shuffle(@{ $self->idx }); - $self->nddata([]); - $self->ndlabel([]); - for my $buck (@{ $self->data }) - { - $buck = pdl_shuffle($buck); - my $label = $buck->zeros; - $label->slice([0, -2], 'X') .= $buck->slice([1, -1], 'X'); - $label->slice([-1, -1], 'X') .= $self->invalid_label; - push @{ $self->nddata }, AI::MXNet::NDArray->array($buck, dtype => $self->dtype); - push @{ $self->ndlabel }, AI::MXNet::NDArray->array($label, dtype => $self->dtype); - } -} - -method next() -{ - return undef 
if($self->curr_idx == @{ $self->idx }); - my ($i, $j) = @{ $self->idx->[$self->curr_idx] }; - $self->curr_idx($self->curr_idx + 1); - my ($data, $label); - if($self->major_axis == 1) - { - $data = $self->nddata->[$i]->slice([$j, $j+$self->batch_size-1])->T; - $label = $self->ndlabel->[$i]->slice([$j, $j+$self->batch_size-1])->T; - } - else - { - $data = $self->nddata->[$i]->slice([$j, $j+$self->batch_size-1]); - $label = $self->ndlabel->[$i]->slice([$j, $j+$self->batch_size-1]); - } - return AI::MXNet::DataBatch->new( - data => [$data], - label => [$label], - bucket_key => $self->buckets->[$i], - pad => 0, - provide_data => [ - AI::MXNet::DataDesc->new( - name => $self->data_name, - shape => $data->shape, - dtype => $self->dtype, - layout => $self->layout - ) - ], - provide_label => [ - AI::MXNet::DataDesc->new( - name => $self->label_name, - shape => $label->shape, - dtype => $self->dtype, - layout => $self->layout - ) - ], - ); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Random.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Random.pm deleted file mode 100644 index ddc0edca1ba8..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Random.pm +++ /dev/null @@ -1,170 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Random; -use strict; -use warnings; -use Scalar::Util qw/blessed/; -use AI::MXNet::NS; -use AI::MXNet::Base; -use AI::MXNet::NDArray::Base; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::Random - Handling of randomization in MXNet. -=cut - -=head1 DESCRIPTION - - Handling of randomization in MXNet. -=cut - -=head2 seed - - Seed the random number generators in mxnet. - - This seed will affect behavior of functions in this module, - as well as results from executors that contains Random number - such as Dropout operators. - - Parameters - ---------- - $seed_state : int - The random number seed. - - :$ctx : [Str|AI::MXNet::Context] - The device context of the generator. The default Str is "all" which means seeding random - number generators of all devices. - Notes - ----- - Random number generators in MXNet are device specific. - mx->random->seed($seed_state) sets the state of each generator using $seed_state and the - device id. Therefore, random numbers generated from different devices can be different - even if they are seeded using the same seed. - - To produce identical random number sequences independent of the device id, - set optional ctx argument. - For example mx->random->seed($seed_state, ctx => mx->gpu(0)) - This produces the same sequence of random numbers independent - of the device id, but the sequence can be different on different kind of devices as MXNet's - random number generators for CPU and GPU use different algorithms. 
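The device-specific behaviour documented above is easiest to see as a round trip. A minimal sketch (seed value and shape are illustrative; uniform() goes through the AUTOLOAD dispatcher defined just below):

    use AI::MXNet qw(mx);

    mx->random->seed(128);
    my $a = mx->random->uniform(0, 1, shape => [2, 2]);
    mx->random->seed(128);
    my $b = mx->random->uniform(0, 1, shape => [2, 2]);
    # $a and $b now hold identical values: same seed, same device.
    # Per the notes above, to get the same sequence regardless of device id:
    # mx->random->seed(128, ctx => mx->gpu(0));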
-=cut - -method seed(Int $seed_state, Str|AI::MXNet::Context :$ctx='all') -{ - if(not ref $ctx) - { - confess("ctx argument could be either string 'all' or AI::MXNet::Context") - unless $ctx eq 'all'; - check_call(AI::MXNetCAPI::RandomSeed($seed_state)); - } - else - { - check_call(AI::MXNetCAPI::RandomSeedContext($seed_state, $ctx->device_type_id, $ctx->device_id)); - } -} - -sub AUTOLOAD { - my $sub = $AI::MXNet::Random::AUTOLOAD; - $sub =~ s/.*:://; - shift; - my %updated; - my %defaults = ( - ctx => AI::MXNet::Context->current_ctx, - shape => 1, - out => 1 - ); - my @args; - my @tmp = @_; - if($sub eq 'randn') - { - $sub = 'normal'; - my @shape; - while(defined $tmp[0] and $tmp[0] =~ /^\d+$/) - { - push @shape, shift(@tmp); - } - if(@shape) - { - push @tmp, (shape => \@shape); - } - %defaults = (%defaults, loc => 0, scale => 1); - } - if(ref $tmp[-1] eq 'HASH') - { - my @kwargs = %{ pop(@tmp) }; - push @tmp, @kwargs; - } - while(@tmp >= 2 and not ref $tmp[-2]) - { - if(exists $defaults{$tmp[-2]}) - { - my $v = pop(@tmp); - my $k = pop(@tmp); - if(defined $v) - { - $updated{$k} = 1; - $defaults{$k} = $v; - } - } - else - { - unshift @args, pop(@tmp); - unshift @args, pop(@tmp); - } - } - unshift @args, @tmp; - if(blessed($defaults{out}) and not exists $updated{shape}) - { - delete $defaults{shape}; - } - delete $defaults{out} unless blessed $defaults{out}; - if($sub eq 'exponential') - { - my $changed = 0; - for my $i (0..@args-1) - { - if(not ref $args[$i] and $args[$i] eq 'scale') - { - $args[$i] = 'lam'; - $args[$i+1] = 1/$args[$i+1]; - $changed = 1; - } - } - $args[0] = 1/$args[0] unless $changed; - } - if(grep { blessed($_) and $_->isa('AI::MXNet::NDArray') } @args) - { - if($sub eq 'normal') - { - my %mapping = qw/loc mu scale sigma/; - @args = map { (not ref $_ and exists $mapping{$_}) ? $mapping{$_} : $_ } @args - } - $sub = "_sample_$sub"; - delete $defaults{shape} if not exists $updated{shape}; - delete $defaults{ctx}; - return AI::MXNet::NDArray->$sub(@args, %defaults); - } - else - { - $sub = "_random_$sub"; - } - return AI::MXNet::NDArray->$sub(@args, %defaults); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/RecordIO.pm b/perl-package/AI-MXNet/lib/AI/MXNet/RecordIO.pm deleted file mode 100644 index 5637260a6b08..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/RecordIO.pm +++ /dev/null @@ -1,357 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::RecordIO; -use strict; -use warnings; -use AI::MXNet::NS; -use AI::MXNet::Function::Parameters; -use AI::MXNet::Types; -use AI::MXNet::Base; -use Mouse; - -=head1 NAME - - AI::MXNet::RecordIO - Read/write RecordIO format data -=cut - -=head2 new - - Parameters - ---------- - uri : Str - uri path to recordIO file. - flag: Str - "r" for reading or "w" writing. 
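A short write-then-read round trip of the class whose constructor is documented above (the path is illustrative; write, read, and close are defined just below):

    use AI::MXNet qw(mx);

    my $record = AI::MXNet::RecordIO->new(uri => '/tmp/example.rec', flag => 'w');
    $record->write('first record');
    $record->write('second record');
    $record->close;

    $record = AI::MXNet::RecordIO->new(uri => '/tmp/example.rec', flag => 'r');
    print $record->read, "\n";   # 'first record'
    print $record->read, "\n";   # 'second record'
    $record->close;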
-=cut - -has 'uri' => (is => 'ro', isa => 'Str', required => 1); -has 'flag' => (is => 'ro', isa => enum([qw/r w/]), required => 1); -has 'handle' => (is => 'rw', isa => 'RecordIOHandle'); -has [qw/writable - is_open/] => (is => 'rw', isa => 'Bool'); - -sub BUILD -{ - my $self = shift; - $self->is_open(0); - $self->open(); -} - -sub DEMOLISH -{ - shift->close; -} - -=head2 open - - Open record file. -=cut - -method open() -{ - my $handle; - if($self->flag eq 'w') - { - $handle = check_call(AI::MXNetCAPI::RecordIOWriterCreate($self->uri)); - $self->writable(1); - } - else - { - $handle = check_call(AI::MXNetCAPI::RecordIOReaderCreate($self->uri)); - $self->writable(0); - } - $self->handle($handle); - $self->is_open(1); -} - -=head2 close - - Close record file. -=cut - -method close() -{ - return if not $self->is_open; - if($self->writable) - { - check_call(AI::MXNetCAPI::RecordIOWriterFree($self->handle)); - } - else - { - check_call(AI::MXNetCAPI::RecordIOReaderFree($self->handle)); - } - $self->is_open(0); -} - -=head2 reset - - Reset pointer to first item. If record is opened with 'w', - this will truncate the file to empty. -=cut - -method reset() -{ - $self->close; - $self->open; -} - -=head2 write - - Write a string buffer as a record. - - Parameters - ---------- - $buf : a buffer to write. -=cut - -method write(Str $buf) -{ - assert($self->writable); - check_call( - AI::MXNetCAPI::RecordIOWriterWriteRecord( - $self->handle, - $buf, - length($buf) - ) - ); -} - -=head2 read - - Read a record as a string. - - Returns - ---------- - $buf : string -=cut - -method read() -{ - assert(not $self->writable); - return scalar(check_call( - AI::MXNetCAPI::RecordIOReaderReadRecord( - $self->handle, - ) - )); -} - -method MXRecordIO(@args) { return AI::MXNet::RecordIO->new(uri => $args[0], flag => $args[1]) } -method MXIndexedRecordIO(@args) -{ - return AI::MXNet::IndexedRecordIO->new( - idx_path => $args[0], uri => $args[1], flag => $args[2] - ) -} - -package AI::MXNet::IRHeader; -use Mouse; -has [qw/flag id id2/] => (is => 'rw', isa => 'Int'); -has 'label' => (is => 'rw', isa => 'AcceptableInput'); -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - if(@_ == 4) - { - return $class->$orig(flag => $_[0], label => $_[1], id => $_[2], id2 => $_[3]); - } - return $class->$orig(@_); -}; -my @order = qw/flag label id id2/; -use overload '@{}' => sub { my $self = shift; [map { $self->$_ } @order] }; - -package AI::MXNet::RecordIO; - -=head2 unpack - - unpack a MXImageRecord to a string - - Parameters - ---------- - s : str - string buffer from MXRecordIO.read - - Returns - ------- - header : AI::MXNet::IRHeader - header of the image record - s : str - unpacked string -=cut - -method unpack(Str $s) -{ - my $h; - my $h_size = 24; - ($h, $s) = (substr($s, 0, $h_size), substr($s, $h_size)); - my $header = AI::MXNet::IRHeader->new(unpack('IfQQ', $h)); - if($header->flag > 0) - { - my $label; - ($label, $s) = (substr($s, 0, 4*$header->flag), substr($s, 4*$header->flag)); - my $pdl_type = PDL::Type->new(DTYPE_MX_TO_PDL->{float32}); - my $pdl = PDL->new_from_specification($pdl_type, $header->flag); - ${$pdl->get_dataref} = $label; - $pdl->upd_data; - $header->label($pdl); - } - return ($header, $s) -} - -=head2 pack - - pack a string into MXImageRecord - - Parameters - ---------- - $header : AI::MXNet::IRHeader or ArrayRef suitable for AI::MXNet::IRHeader->new(@{ ArrayRef }) - header of the image record. - $header->label can be a number or an array ref. 
- s : str - string to pack -=cut - -method pack(AI::MXNet::IRHeader|ArrayRef $header, Str $s) -{ - $header = AI::MXNet::IRHeader->new(@$header) unless blessed $header; - if(not ref $header->label) - { - $header->flag(0); - } - else - { - my $label = AI::MXNet::NDArray->array($header->label, dtype=>'float32')->aspdl; - $header->label(0); - $header->flag($label->nelem); - my $buf = ${$label->get_dataref}; - $s = "$buf$s"; - } - $s = pack('IfQQ', @{ $header }) . $s; - return $s; -} - -package AI::MXNet::IndexedRecordIO; -use Mouse; -use AI::MXNet::Base; -extends 'AI::MXNet::RecordIO'; - -=head1 NAME - - AI::MXNet::IndexedRecordIO - Read/write RecordIO format data supporting random access. -=cut - -=head2 new - - Parameters - ---------- - idx_path : str - Path to index file - uri : str - Path to record file. Only support file types that are seekable. - flag : str - 'w' for write or 'r' for read -=cut - -has 'idx_path' => (is => 'ro', isa => 'Str', required => 1); -has [qw/idx - keys fidx/] => (is => 'rw', init_arg => undef); - -method open() -{ - $self->SUPER::open(); - $self->idx({}); - $self->keys([]); - open(my $f, $self->flag eq 'r' ? '<' : '>', $self->idx_path); - $self->fidx($f); - if(not $self->writable) - { - while(<$f>) - { - chomp; - my ($key, $val) = split(/\t/); - push @{ $self->keys }, $key; - $self->idx->{$key} = $val; - } - } -} - -method close() -{ - return if not $self->is_open; - $self->SUPER::close(); - $self->fidx(undef); -} - -=head2 seek - - Query current read head position. -=cut - -method seek(Int $idx) -{ - assert(not $self->writable); - my $pos = $self->idx->{$idx}; - check_call(AI::MXNetCAPI::RecordIOReaderSeek($self->handle, $pos)); -} - -=head2 tell - - Query current write head position. -=cut - -method tell() -{ - assert($self->writable); - return scalar(check_call(AI::MXNetCAPI::RecordIOWriterTell($self->handle))); -} - -=head2 read_idx - - Read record with the index. - - Parameters: - ----------- - $idx -=cut - -method read_idx(Int $idx) -{ - $self->seek($idx); - return $self->read(); -} - -=head2 write_idx - - Write record with index. - - Parameters: - ----------- - Int $idx - Str $buf -=cut - -method write_idx(Int $idx, Str $buf) -{ - my $pos = $self->tell(); - $self->write($buf); - my $f = $self->fidx; - print $f "$idx\t$pos\n"; - $self->idx->{$idx} = $pos; - push @{ $self->keys }, $idx; -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/RunTime.pm b/perl-package/AI-MXNet/lib/AI/MXNet/RunTime.pm deleted file mode 100644 index 3e4548b4f860..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/RunTime.pm +++ /dev/null @@ -1,117 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
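To round out the RecordIO removal above before the next file: the indexed variant keeps a sidecar index file so individual records can be fetched without scanning. A hedged sketch (paths illustrative), using only the read_idx/write_idx API shown above:

    use AI::MXNet qw(mx);

    my $w = AI::MXNet::IndexedRecordIO->new(
        idx_path => '/tmp/example.idx', uri => '/tmp/example.rec', flag => 'w'
    );
    $w->write_idx($_, "record $_") for 0 .. 4;
    $w->close;

    my $r = AI::MXNet::IndexedRecordIO->new(
        idx_path => '/tmp/example.idx', uri => '/tmp/example.rec', flag => 'r'
    );
    print $r->read_idx(3), "\n";   # 'record 3', fetched by key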
- -package AI::MXNet::RunTime; -use strict; -use warnings; -use AI::MXNet::NS; -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; -use Mouse; - -=encoding utf-8 - -=head1 NAME - - AI::MXNet::RunTime - Runtime querying of compile time features in the native library. -=cut - -=head1 DESCRIPTION - - With this module you can check at runtime which libraries and features were compiled in the library. - - Example usage: - - use AI::MXNet qw(mx); - my $features = mx->runtime->Features(); - - print $features->is_enabled("CUDNN"); - 0 - - print $features->is_enabled("CPU_SSE"); - 1 - - print Dumper($features->features); - $VAR1 = { - 'LAPACK' => 1, - 'F16C' => 1, - 'CPU_SSE2' => 1, - 'BLAS_MKL' => 0, - 'CXX14' => 0, - 'DIST_KVSTORE' => 0, - 'NCCL' => 0, - 'OPENMP' => 1, - 'CUDNN' => 0, - 'CPU_AVX' => 1, - 'CUDA_RTC' => 0, - 'BLAS_OPEN' => 1, - 'CPU_SSE4_2' => 1, - 'CPU_SSE4A' => 0, - 'TVM_OP' => 0, - 'MKLDNN' => 0, - 'TENSORRT' => 0, - 'JEMALLOC' => 1, - 'SSE' => 0, - 'DEBUG' => 0, - 'BLAS_APPLE' => 0, - 'CPU_SSE3' => 1, - 'INT64_TENSOR_SIZE' => 0, - 'CPU_SSE4_1' => 1, - 'CUDA' => 0, - 'OPENCV' => 1, - 'CPU_SSE' => 1, - 'SIGNAL_HANDLER' => 0, - 'BLAS_ATLAS' => 0, - 'CAFFE' => 0, - 'CPU_AVX2' => 0 - }; - - print $features; - [✖ CUDA, ✖ CUDNN, ✖ NCCL, ✖ CUDA_RTC, ✖ TENSORRT, ✔ CPU_SSE, ✔ CPU_SSE2, ✔ CPU_SSE3, - ✔ CPU_SSE4_1, ✔ CPU_SSE4_2, ✖ CPU_SSE4A, ✔ CPU_AVX, ✖ CPU_AVX2, ✔ OPENMP, ✖ SSE, - ✔ F16C, ✔ JEMALLOC, ✔ BLAS_OPEN, ✖ BLAS_ATLAS, ✖ BLAS_MKL, ✖ BLAS_APPLE, ✔ LAPACK, - ✖ MKLDNN, ✔ OPENCV, ✖ CAFFE, ✖ DIST_KVSTORE, ✖ CXX14, ✖ INT64_TENSOR_SIZE, - ✔ SIGNAL_HANDLER, ✔ DEBUG, ✖ TVM_OP] - -=cut -use overload '""' => sub { - my $self = shift; - my $s = join(', ', map { - sprintf("%s %s", $self->features->{ $_ } ? '✔' : '✖', $_) - } sort keys %{ $self->features }); - return "[$s]"; -}; - -has 'features' => (is => 'rw', init_arg => undef, default => sub { - return scalar(check_call(AI::MXNetCAPI::LibInfoFeatures())); -}); - -method is_enabled(Str $feature) -{ - confess("Feature $feature does not exist") - unless exists $self->features->{ $feature }; - return $self->features->{ $feature }; -} - -my $features; -method Features() -{ - $features //= __PACKAGE__->new; - return $features; -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm deleted file mode 100644 index 6a472c1e9d8e..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm +++ /dev/null @@ -1,1535 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Symbol; - -=head1 NAME - - AI::MXNet::Symbol - Symbolic interface of MXNet. 
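Since this deletion removes the entire Perl symbolic API, a two-line reminder of what composition looked like; a minimal sketch built only from Variable, the overloaded operators declared just below, and list_arguments:

    use AI::MXNet qw(mx);

    my $a = mx->sym->Variable('a');
    my $b = mx->sym->Variable('b');
    my $c = ($a + $b) / 2;    # each operator builds a new AI::MXNet::Symbol
    print join(', ', @{ $c->list_arguments }), "\n";   # prints: a, b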
-=cut
-
-use strict;
-use warnings;
-use AI::MXNet::NS;
-use AI::MXNet::Base;
-use AI::MXNet::Symbol::Base;
-use AI::MXNet::Symbol::Random;
-use AI::MXNet::RunTime;
-use AI::MXNet::Types;
-use Mouse;
-use AI::MXNet::Function::Parameters;
-use overload
-    '""' => \&stringify,
-    '+' => \&add,
-    '-' => \&subtract,
-    '*' => \&multiply,
-    '/' => \&divide,
-    '/=' => \&idivide,
-    '**' => \&power,
-    '%' => \&mod,
-    '==' => \&equal,
-    '!=' => \&not_equal,
-    '>' => \&greater,
-    '>=' => \&greater_equal,
-    '<' => \&lesser,
-    '<=' => \&lesser_equal,
-    '&{}' => sub { my $self = shift; sub { $self->call(@_) } },
-    '@{}' => sub { my $self = shift; [map { $self->slice($_) } @{ $self->list_outputs }] };
-
-extends 'AI::MXNet::Symbol::Base';
-has 'handle' => (is => 'rw', isa => 'SymbolHandle', required => 1);
-
-sub DEMOLISH
-{
-    check_call(AI::NNVMCAPI::SymbolFree(shift->handle));
-}
-
-method STORABLE_freeze($cloning)
-{
-    return $self->tojson();
-}
-
-method STORABLE_thaw($cloning, $json)
-{
-    my $handle = check_call(
-        AI::MXNetCAPI::SymbolCreateFromJSON(
-            $json
-        )
-    );
-    $self->handle($handle);
-}
-
-method stringify($other=, $reverse=)
-{
-    my $name = $self->name;
-    sprintf(
-        "<%s %s%s>",
-        ref($self),
-        $name ? ($name, '') : ('group [', join(', ', map { $_->name } @{ $self }) . ']')
-    );
-}
-
-method add(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_Plus _PlusScalar/
-    );
-}
-
-method subtract(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_Minus _MinusScalar _RMinusScalar/,
-        $reverse
-    );
-}
-
-method multiply(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_Mul _MulScalar/
-    );
-}
-
-method divide(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_Div _DivScalar _RDivScalar/,
-        $reverse
-    );
-}
-
-method power(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_Power _PowerScalar _RPowerScalar/,
-        $reverse
-    );
-}
-
-method equal(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_equal _equal_scalar/
-    );
-}
-
-method not_equal(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_not_equal _not_equal_scalar/
-    );
-}
-
-method greater(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_greater _greater_scalar _lesser_scalar/,
-        $reverse
-    );
-}
-
-method greater_equal(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_greater_equal _greater_equal_scalar _lesser_equal_scalar/,
-        $reverse
-    );
-}
-
-method lesser(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_lesser _lesser_scalar _greater_scalar/,
-        $reverse
-    );
-}
-
-method lesser_equal(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_lesser_equal _lesser_equal_scalar _greater_equal_scalar/,
-        $reverse
-    );
-}
-
-method true_divide(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return $self->divide($other, $reverse);
-}
-
-method mod(AI::MXNet::Symbol|Num $other, $reverse=)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_Mod _ModScalar _RModScalar/,
-        $reverse
-    );
-}
-
-method maximum(AI::MXNet::Symbol|Num $other)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_Maximum _MaximumScalar/
-    );
-}
-
-method minimum(AI::MXNet::Symbol|Num $other)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_Minimum _MinimumScalar/
-    );
-}
-
-method hypot(AI::MXNet::Symbol|Num $other)
-{
-    return _ufunc_helper(
-        $self,
-        $other,
-        qw/_Hypot _HypotScalar/
-    );
-}
-
-method reshape(@args)
-{
-    if(@args%2)
-    {
-        unshift @args, 'shape';
-    }
-    return $self->SUPER::reshape(@args);
-}
-
-
-method deepcopy()
-{
-    my $handle = check_call(AI::MXNetCAPI::SymbolCopy($self->handle));
-    return __PACKAGE__->new(handle => $handle);
-}
-
-method call(@args)
-{
-    my $s = $self->deepcopy();
-    $s->_compose(@args);
-    return $s;
-}
-
-method slice(@slices)
-{
-    confess("No slices supplied") unless @slices;
-    if(@slices > 1)
-    {
-        return $self->SUPER::slice(@slices);
-    }
-    my $index = $slices[0];
-    ## __getitem__ tie needs to die
-    if(not find_type_constraint('Index')->check($index))
-    {
-        my $i = 0;
-        my $idx;
-        for my $name (@{ $self->list_outputs() })
-        {
-            if($name eq $index)
-            {
-                if(defined $idx)
-                {
-                    confess(qq/There are multiple outputs with name "$index"/);
-                }
-                $idx = $i;
-            }
-            $i++;
-        }
-        confess(qq/Cannot find output that matches name "$index"/) unless defined $idx;
-        $index = $idx;
-    }
-    elsif($index >= @{ $self->list_outputs() })
-    {
-        confess("Index: [$index] is outside of the range of the symbol: $self outputs");
-    }
-    my $handle = check_call(AI::MXNetCAPI::SymbolGetOutput($self->handle, $index));
-    return __PACKAGE__->new(handle => $handle);
-}
-
-=head2 name
-
-    Get the name string from the symbol. This function only works for a non-grouped symbol.
-
-    Returns
-    -------
-    value : str
-        The name of this symbol; returns undef for a grouped symbol.
-=cut
-
-method name()
-{
-    my ($name, $success) = check_call(AI::MXNetCAPI::SymbolGetName($self->handle));
-    return $success ? $name : undef;
-}
-
-=head2 attr
-
-    Get an attribute string from the symbol. This function only works for a non-grouped symbol.
-
-    Parameters
-    ----------
-    key : str
-        The key to get the attribute from.
-
-    Returns
-    -------
-    value : str
-        The attribute value for the key; returns undef if the attribute does not exist.
-=cut
-
-
-method attr(Str $key)
-{
-    my ($attr, $success) = check_call(
-        AI::MXNetCAPI::SymbolGetAttr($self->handle, $key)
-    );
-    return $success ? $attr : undef;
-}
-
-=head2 list_attr
-
-    Get all attributes from the symbol.
-
-    Returns
-    -------
-    ret : hash ref of str to str
-        A hash ref mapping attribute keys to values.
-=cut
-
-method list_attr()
-{
-    my %ret;
-    my @attrs = @{ check_call(AI::MXNetCAPI::SymbolListAttrShallow($self->handle)) };
-    while(@attrs)
-    {
-        my $k = shift(@attrs);
-        my $v = shift(@attrs);
-        $ret{ $k } = $v;
-    }
-    return \%ret;
-}
-
-=head2 attr_dict
-
-    Recursively get all attributes from the symbol and its children.
-
-    Returns
-    -------
-    ret : hash ref of str to hash ref.
-        Returns a hash ref whose keys are names of the symbol and its children.
-        Values of the returned hash ref are hash refs that map attribute keys to values.
-=cut
-
-method attr_dict()
-{
-    my %ret;
-    my @attrs = @{ check_call(AI::MXNetCAPI::SymbolListAttr($self->handle)) };
-    my $size = @attrs/2;
-    for (my $i = 0; $i < $size; $i++)
-    {
-        my ($name, $key) = split(/\$/, $attrs[$i*2]);
-        my $val = $attrs[$i*2+1];
-        $ret{ $name }{ $key } = $val;
-    }
-    return \%ret;
-}
-
-method _set_attr(Str @args)
-{
-    my %kwargs = @args;
-    while(my ($key, $val) = each(%kwargs))
-    {
-        check_call(
-            AI::MXNetCAPI::SymbolSetAttr(
-                $self->handle, $key, $val
-            )
-        );
-    }
-}
-
-=head2 get_internals
-
-    Get a new grouped symbol whose output contains all the internal outputs of this symbol.
-
-    Returns
-    -------
-    sgroup : AI::MXNet::Symbol
-        The internal symbol of the symbol.
-=cut
-
-method get_internals()
-{
-    my $handle = check_call(AI::MXNetCAPI::SymbolGetInternals($self->handle));
-    return __PACKAGE__->new(handle => $handle);
-}
-
-=head2 get_children
-
-    Get a new grouped symbol whose output contains
-    the inputs to the output nodes of the original symbol.
-
-    Returns
-    -------
-    sgroup : Symbol or undef
-        The children of the head node. If the symbol has no
-        inputs, undef will be returned.
-=cut
-
-
-method get_children()
-{
-    my $handle = check_call(AI::MXNetCAPI::SymbolGetChildren($self->handle));
-    my $ret = __PACKAGE__->new(handle => $handle);
-    return undef unless @{ $ret->list_outputs };
-    return $ret;
-}
-
-=head2 list_arguments
-
-    List all the arguments in the symbol.
-
-    Returns
-    -------
-    args : array ref of strings
-=cut
-
-method list_arguments()
-{
-    return scalar(check_call(AI::MXNetCAPI::SymbolListArguments($self->handle)));
-}
-
-=head2 list_outputs()
-
-    List all outputs in the symbol.
-
-    Returns
-    -------
-    $out : array ref of strings.
-=cut
-
-method list_outputs()
-{
-    return scalar(check_call(AI::MXNetCAPI::SymbolListOutputs($self->handle)));
-}
-
-
-=head2 list_auxiliary_states()
-
-    List all auxiliary states in the symbol.
-
-    Returns
-    -------
-    aux_states : array ref of string
-        List the names of the auxiliary states.
-
-    Notes
-    -----
-    Auxiliary states are special states of symbols that do not correspond to an argument
-    and do not have a gradient, but are still useful for specific operations.
-    A common example of auxiliary states is the moving_mean and moving_variance in BatchNorm.
-    Most operators do not have auxiliary states.
-=cut
-
-method list_auxiliary_states()
-{
-    return scalar(check_call(AI::MXNetCAPI::SymbolListAuxiliaryStates($self->handle)));
-}
-
-
-=head2 list_inputs
-
-    Lists all arguments and auxiliary states of this Symbol.
-
-    Returns
-    -------
-    inputs : array ref of str
-        List of all inputs.
-
-    Examples
-    --------
-    >>> my $bn = mx->sym->BatchNorm(name=>'bn');
-=cut
-
-method list_inputs()
-{
-    return scalar(check_call(AI::NNVMCAPI::SymbolListInputNames($self->handle, 0)));
-}
-
-=head2 infer_type
-
-    Infer the types of outputs and arguments, given the known types of some arguments.
-
-    The known types can be passed in either positionally or as keyword arguments.
-    A list of undefs is returned if there is not enough information passed in.
-    An error will be raised if there is an inconsistency in the known types passed in.
-
-    Parameters
-    ----------
-    args : Array
-        Provide the types of arguments in a positional way.
-        Unknown types can be marked as undef.
-
-    kwargs : Hash ref, must be supplied as the sole argument to the method.
-        Provide keyword arguments of known types.
-
-    Returns
-    -------
-    arg_types : array ref of Dtype or undef
-        List of types of arguments.
-        The order is the same as in list_arguments().
-    out_types : array ref of Dtype or undef
-        List of types of outputs.
-        The order is the same as in list_outputs().
-    aux_types : array ref of Dtype or undef
-        List of types of auxiliary states.
-        The order is the same as in list_auxiliary_states().
-=cut
-
-
-method infer_type(Maybe[Str] @args)
-{
-    my ($positional_arguments, $kwargs, $kwargs_order) = _parse_arguments("Dtype", @args);
-    my $sdata = [];
-    my $keys = [];
-    if(@$positional_arguments)
-    {
-        @{ $sdata } = map { defined($_) ?
DTYPE_STR_TO_MX->{ $_ } : -1 } @{ $positional_arguments }; - } - else - { - @{ $keys } = @{ $kwargs_order }; - @{ $sdata } = map { DTYPE_STR_TO_MX->{ $_ } } @{ $kwargs }{ @{ $kwargs_order } }; - } - my ($arg_type, $out_type, $aux_type, $complete) = check_call(AI::MXNetCAPI::SymbolInferType( - $self->handle, - scalar(@{ $sdata }), - $keys, - $sdata - ) - ); - if($complete) - { - return ( - [ map { DTYPE_MX_TO_STR->{ $_ } } @{ $arg_type }], - [ map { DTYPE_MX_TO_STR->{ $_ } } @{ $out_type }], - [ map { DTYPE_MX_TO_STR->{ $_ } } @{ $aux_type }] - ); - } - else - { - return (undef, undef, undef); - } -} - -=head2 infer_shape - - Infer the shape of outputs and arguments of given known shapes of arguments. - - User can either pass in the known shapes in positional way or keyword argument way. - Tuple of Nones is returned if there is not enough information passed in. - An error will be raised if there is inconsistency found in the known shapes passed in. - - Parameters - ---------- - *args : - Provide shape of arguments in a positional way. - Unknown shape can be marked as undef - - **kwargs : - Provide keyword arguments of known shapes. - - Returns - ------- - arg_shapes : array ref of Shape or undef - List of shapes of arguments. - The order is in the same order as list_arguments() - out_shapes : array ref of Shape or undef - List of shapes of outputs. - The order is in the same order as list_outputs() - aux_shapes : array ref of Shape or undef - List of shapes of outputs. - The order is in the same order as list_auxiliary() -=cut - -method infer_shape(Maybe[Str|Shape] @args) -{ - my @res = $self->_infer_shape_impl(0, @args); - if(not defined $res[1]) - { - my ($arg_shapes) = $self->_infer_shape_impl(1, @args); - my $arg_names = $self->list_arguments; - my @unknowns; - for(zip($arg_names, $arg_shapes)) { - my ($name, $shape) = @$_; - if(not ref $shape or not @$shape or not product(@$shape)) - { - if(@unknowns >= 10) - { - $unknowns[10] = '...'; - } - else - { - my @shape = eval { @$shape }; - push @unknowns, "$name @shape"; - } - } - } - AI::MXNet::Logging->warning( - "Cannot decide shape for the following arguments " - ."(0s in shape means unknown dimensions). " - ."Consider providing them as input:\n\t" - ."\n\t" - .join(", ", @unknowns) - ); - } - return @res; -} - -=head2 infer_shape_partial - - Partially infer the shape. The same as infer_shape, except that the partial - results can be returned. -=cut - -method infer_shape_partial(Maybe[Str|Shape] @args) -{ - $self->_infer_shape_impl(1, @args) -} - -# The actual implementation for calling shape inference API. -method _infer_shape_impl(Maybe[Str|Shape] @args) -{ - my $partial = shift(@args); - my ($positional_arguments, $kwargs, $kwargs_order) = _parse_arguments("Shape", @args); - my $sdata = []; - my $indptr = [0]; - my $keys = []; - if(@{ $positional_arguments }) - { - for my $shape (grep { defined } @{ $positional_arguments }) - { - push @{ $sdata }, @{ $shape }; - push @{ $indptr }, scalar(@{ $sdata }); - } - } - { - for my $k (@{ $kwargs_order }) - { - push @{ $keys }, $k; - push @{ $sdata }, @{ $kwargs->{ $k } }; - push @{ $indptr }, scalar(@{ $sdata }); - } - } - my $is64 = AI::MXNet::RunTime->Features()->is_enabled('INT64_TENSOR_SIZE'); - my $infer_func = $partial - ? ( - $is64 ? \&AI::MXNetCAPI::SymbolInferShapePartialEx64 - : \&AI::MXNetCAPI::SymbolInferShapePartialEx - ) - : ( - $is64 ? 
\&AI::MXNetCAPI::SymbolInferShapeEx64
-              : \&AI::MXNetCAPI::SymbolInferShapeEx
-        );
-    my ($arg_shapes, $out_shapes, $aux_shapes, $complete) = check_call(
-        $infer_func->(
-            $self->handle,
-            scalar(@{ $indptr }) - 1,
-            $keys,
-            $indptr,
-            $sdata,
-        )
-    );
-    if($complete)
-    {
-        return $arg_shapes, $out_shapes, $aux_shapes;
-    }
-    else
-    {
-        return (undef, undef, undef);
-    }
-}
-
-=head2 debug_str
-
-    The debug string.
-
-    Returns
-    -------
-    debug_str : string
-        Debug string of the symbol.
-=cut
-
-method debug_str()
-{
-    return scalar(check_call(AI::MXNetCAPI::SymbolPrint($self->handle)));
-}
-
-=head2 save
-
-    Save the symbol into a file.
-
-    You can also use Storable to do the job if you only work with Perl.
-    The advantage of load/save is that the file is language agnostic.
-    This means a file saved with save can be loaded by other language bindings of MXNet.
-    You also get the benefit of being able to directly load/save from cloud storage (S3, HDFS).
-
-    Parameters
-    ----------
-    fname : str
-        The name of the file
-        - s3://my-bucket/path/my-s3-symbol
-        - hdfs://my-bucket/path/my-hdfs-symbol
-        - /path-to/my-local-symbol
-
-    See Also
-    --------
-    load : Used to load symbol from file.
-=cut
-
-method save(Str $fname)
-{
-    check_call(AI::MXNetCAPI::SymbolSaveToFile($self->handle, $fname));
-}
-
-=head2 tojson
-
-    Save the symbol into a JSON string.
-
-    See Also
-    --------
-    load_json : Used to load symbol from JSON string.
-=cut
-
-method tojson()
-{
-    return scalar(check_call(AI::MXNetCAPI::SymbolSaveToJSON($self->handle)));
-}
-
-method _get_ndarray_inputs(
-    Str $arg_key,
-    HashRef[AI::MXNet::NDArray]|ArrayRef[AI::MXNet::NDArray] $args,
-    ArrayRef[Str] $arg_names,
-    Bool $allow_missing=0
-)
-{
-    my ($arg_handles, $arg_arrays) = ([], []);
-    if(ref $args eq 'ARRAY')
-    {
-        confess("Length of $arg_key does not match the number of arguments")
-            unless @$args == @$arg_names;
-        @{ $arg_handles } = map { $_->handle } @{ $args };
-        $arg_arrays = $args;
-    }
-    else
-    {
-        my %tmp = ((map { $_ => undef } @$arg_names), %$args);
-        if(not $allow_missing and grep { not defined } values %tmp)
-        {
-            my ($missing) = grep { not defined $tmp{ $_ } } (keys %tmp);
-            confess("key $missing is missing in $arg_key");
-        }
-        for my $name (@$arg_names)
-        {
-            push @$arg_handles, defined($tmp{ $name }) ? $tmp{ $name }->handle : undef;
-            push @$arg_arrays, defined($tmp{ $name }) ? $tmp{ $name } : undef;
-        }
-    }
-    return ($arg_handles, $arg_arrays);
-}
-
-=head2 simple_bind
-
-    Bind the current symbol to get an executor, allocating all the NDArrays needed.
-    Allows specifying data types.
-
-    This function asks the user to pass in NDArrays for the positions they would
-    like to bind, and automatically allocates the NDArrays for the arguments and
-    auxiliary states that the user did not specify explicitly.
-
-    Parameters
-    ----------
-    :$ctx : AI::MXNet::Context
-        The device context the generated executor will run on.
-
-    :$grad_req: string
-        {'write', 'add', 'null'}, or array ref of str or hash ref of str to str, optional
-        Specifies how we should update the gradient to the args_grad.
-        - 'write' means the gradient is written to the specified args_grad NDArray every time.
-        - 'add' means the gradient is added to the specified NDArray every time.
-        - 'null' means no action is taken; the gradient may not be calculated.
- - :$type_dict : hash ref of str->Dtype - Input type map, name->dtype - - :$type_dict : hash ref of str->Stype - Storage type map, name->stype (for sparse operations) - - :$group2ctx : hash ref of string to AI::MXNet::Context - The mapping of the ctx_group attribute to the context assignment. - - :$shapes : hash ref of str->Shape - Input shape map, name->shape - - :$shared_arg_names : Maybe[ArrayRef[Str]] - The argument names whose 'NDArray' of shared_exec can be reused for initializing - the current executor. - - :$shared_exec : Maybe[AI::MXNet::Executor] - The executor whose arg_arrays, arg_arrays, grad_arrays, and aux_arrays can be - reused for initializing the current executor. - - :$shared_buffer : Maybe[HashRef[AI::MXNet::NDArray]] - The dict mapping argument names to the `NDArray` that can be reused for initializing - the current executor. This buffer will be checked for reuse if one argument name - of the current executor is not found in `shared_arg_names`. - - Returns - ------- - $executor : AI::MXNet::Executor - The generated Executor -=cut - -method simple_bind( - AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, - GradReq|ArrayRef[GradReq]|HashRef[GradReq] :$grad_req='write', - Maybe[HashRef[Shape]] :$shapes=, - Maybe[HashRef[Dtype]] :$type_dict=, - Maybe[HashRef[Stype]] :$stype_dict=, - Maybe[HashRef[AI::MXNet::Context]] :$group2ctx=, - Maybe[ArrayRef[Str]] :$shared_arg_names=, - Maybe[AI::MXNet::Executor] :$shared_exec=, - Maybe[HashRef[AI::MXNet::NDArray]] :$shared_buffer= -) -{ - my $num_provided_arg_types = 0; - my @provided_arg_type_names; - my @provided_arg_type_data; - if(defined $type_dict) - { - while(my ($k, $v) = each %{ $type_dict }) - { - push @provided_arg_type_names, $k; - push @provided_arg_type_data, DTYPE_STR_TO_MX->{$v}; - } - $num_provided_arg_types = @provided_arg_type_names; - } - my $num_provided_arg_stypes = 0; - my @provided_arg_stype_names; - my @provided_arg_stype_data; - if(defined $stype_dict) - { - while(my ($k, $v) = each %{ $stype_dict }) - { - push @provided_arg_stype_names, $k; - push @provided_arg_stype_data, STORAGE_TYPE_STR_TO_ID->{$v}; - } - $num_provided_arg_stypes = @provided_arg_stype_names; - } - my @provided_arg_shape_data; - # argument shape index in sdata, - # e.g. 
[sdata[indptr[0]], sdata[indptr[1]]) is the shape of the first arg - my @provided_arg_shape_idx = (0); - my @provided_arg_shape_names; - while(my ($k, $v) = each %{ $shapes//{} }) - { - push @provided_arg_shape_names, $k; - push @provided_arg_shape_data, @{ $v }; - push @provided_arg_shape_idx, scalar(@provided_arg_shape_data); - } - $num_provided_arg_types = @provided_arg_type_names; - - my $provided_req_type_list_len = 0; - my @provided_grad_req_types; - my @provided_grad_req_names; - if(defined $grad_req) - { - if(not ref $grad_req) - { - push @provided_grad_req_types, $grad_req; - } - elsif(ref $grad_req eq 'ARRAY') - { - assert((@{ $grad_req } != 0), 'grad_req in simple_bind cannot be an empty list'); - @provided_grad_req_types = @{ $grad_req }; - $provided_req_type_list_len = @provided_grad_req_types; - } - elsif(ref $grad_req eq 'HASH') - { - assert((keys %{ $grad_req } != 0), 'grad_req in simple_bind cannot be an empty hash'); - while(my ($k, $v) = each %{ $grad_req }) - { - push @provided_grad_req_names, $k; - push @provided_grad_req_types, $v; - } - $provided_req_type_list_len = @provided_grad_req_types; - } - } - my $num_ctx_map_keys = 0; - my @ctx_map_keys; - my @ctx_map_dev_types; - my @ctx_map_dev_ids; - if(defined $group2ctx) - { - while(my ($k, $v) = each %{ $group2ctx }) - { - push @ctx_map_keys, $k; - push @ctx_map_dev_types, $v->device_type_id; - push @ctx_map_dev_ids, $v->device_id; - } - $num_ctx_map_keys = @ctx_map_keys; - } - - my @shared_arg_name_list; - if(defined $shared_arg_names) - { - @shared_arg_name_list = @{ $shared_arg_names }; - } - my %shared_data; - if(defined $shared_buffer) - { - while(my ($k, $v) = each %{ $shared_buffer }) - { - $shared_data{$k} = $v->handle; - } - } - my $shared_exec_handle = defined $shared_exec ? $shared_exec->handle : undef; - my ( - $updated_shared_data, - $in_arg_handles, - $arg_grad_handles, - $aux_state_handles, - $exe_handle - ); - my $sub = AI::MXNet::RunTime->Features()->is_enabled('INT64_TENSOR_SIZE') - ? \&AI::MXNetCAPI::ExecutorSimpleBindEx64 - : \&AI::MXNetCAPI::ExecutorSimpleBindEx; - eval { - ($updated_shared_data, $in_arg_handles, $arg_grad_handles, $aux_state_handles, $exe_handle) - = - check_call( - $sub->( - $self->handle, - $ctx->device_type_id, - $ctx->device_id, - $num_ctx_map_keys, - \@ctx_map_keys, - \@ctx_map_dev_types, - \@ctx_map_dev_ids, - $provided_req_type_list_len, - \@provided_grad_req_names, - \@provided_grad_req_types, - scalar(@provided_arg_shape_names), - \@provided_arg_shape_names, - \@provided_arg_shape_data, - \@provided_arg_shape_idx, - $num_provided_arg_types, - \@provided_arg_type_names, - \@provided_arg_type_data, - $num_provided_arg_stypes, - \@provided_arg_stype_names, - \@provided_arg_stype_data, - scalar(@shared_arg_name_list), - \@shared_arg_name_list, - defined $shared_buffer ? \%shared_data : undef, - $shared_exec_handle - ) - ); - }; - if($@) - { - confess( - "simple_bind failed: Error: $@; Arguments: ". - Data::Dumper->new( - [$shapes//{}] - )->Purity(1)->Deepcopy(1)->Terse(1)->Dump - ); - } - if(defined $shared_buffer) - { - while(my ($k, $v) = each %{ $updated_shared_data }) - { - $shared_buffer->{$k} = AI::MXNet::NDArray->_ndarray_cls($v); - } - } - my @arg_arrays = map { AI::MXNet::NDArray->_ndarray_cls($_) } @{ $in_arg_handles }; - my @grad_arrays = map { defined $_ ? 
AI::MXNet::NDArray->_ndarray_cls($_) : undef } @{ $arg_grad_handles }; - my @aux_arrays = map { AI::MXNet::NDArray->_ndarray_cls($_) } @{ $aux_state_handles }; - my $executor = AI::MXNet::Executor->new( - handle => $exe_handle, - symbol => $self, - ctx => $ctx, - grad_req => $grad_req, - group2ctx => $group2ctx - ); - $executor->arg_arrays(\@arg_arrays); - $executor->grad_arrays(\@grad_arrays); - $executor->aux_arrays(\@aux_arrays); - return $executor; -} - -=head2 bind - - Bind the current symbol to get an executor. - - Parameters - ---------- - :$ctx : AI::MXNet::Context - The device context the generated executor will run on. - - :$args : HashRef[AI::MXNet::NDArray]|ArrayRef[AI::MXNet::NDArray] - Input arguments to the symbol. - - If type is array ref of NDArray, the position is in the same order as list_arguments. - - If type is hash ref of str to NDArray, then it maps the name of arguments - to the corresponding NDArray. - - In either case, all the arguments must be provided. - - :$args_grad : Maybe[HashRef[AI::MXNet::NDArray]|ArrayRef[AI::MXNet::NDArray]] - When specified, args_grad provides NDArrays to hold - the resulting gradient values in backward. - - If type is array ref of NDArray, the position is in the same order as list_arguments. - - If type is hash ref of str to NDArray, then it maps the name of arguments - to the corresponding NDArray. - - When the type is hash ref of str to NDArray, users only need to provide the entries - for the needed argument gradients. - Only the specified argument gradient will be calculated. - - :$grad_req : {'write', 'add', 'null'}, or array ref of str or hash ref of str to str, optional - Specifies how we should update the gradient to the args_grad. - - 'write' means the gradient is written to the specified args_grad NDArray every time. - - 'add' means the gradient is added to the specified NDArray every time. - - 'null' means no action is taken, the gradient may not be calculated. - - :$aux_states : array ref of NDArray, or hash ref of str to NDArray, optional - Input auxiliary states to the symbol, only needed when - list_auxiliary_states is not empty. - - If type is array ref of NDArray, the position is in the same order as list_auxiliary_states - - If type is hash ref of str to NDArray, then it maps the name of auxiliary_states - to the corresponding NDArray. - - In either case, all the auxiliary_states need to be provided. - - :$group2ctx : hash ref of string to AI::MXNet::Context - The mapping of the ctx_group attribute to the context assignment. - - :$shared_exec : AI::MXNet::Executor - Executor to share memory with. This is intended for runtime reshaping, variable length - sequences, etc. The returned executor shares state with shared_exec, and should not be - used in parallel with it. - - Returns - ------- - $executor : AI::MXNet::Executor - The generated Executor - - Notes - ----- - Auxiliary states are special states of symbols that do not correspond to an argument - and do not have a gradient, but are still useful for specific operations. - A common example of auxiliary state is the moving_mean and moving_variance in BatchNorm. - Most operators do not have auxiliary states and this parameter can be safely ignored. - - A user can give up gradients by using a hash ref for args_grad and only specifying - the gradients they're interested in. 
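 Example
 -------
 A minimal sketch of typical usage (the variable names, the [2,3] shape, and the
 CPU context are illustrative only; mx is the usual AI::MXNet alias):

     my $a = mx->symbol->Variable('a');
     my $b = mx->symbol->Variable('b');
     my $c = $a + $b;
     my $exe = $c->bind(
         ctx  => mx->cpu,
         args => { a => mx->nd->ones([2,3]), b => mx->nd->ones([2,3]) }
     );
     $exe->forward(0);                       # inference-mode forward pass
     print $exe->outputs->[0]->aspdl;        # every element equals 2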
-=cut - -method bind( - AI::MXNet::Context :$ctx, - HashRef[AI::MXNet::NDArray]|ArrayRef[AI::MXNet::NDArray] :$args, - Maybe[HashRef[AI::MXNet::NDArray]|ArrayRef[AI::MXNet::NDArray]] :$args_grad=, - Str|HashRef[Str]|ArrayRef[Str] :$grad_req='write', - Maybe[HashRef[AI::MXNet::NDArray]|ArrayRef[AI::MXNet::NDArray]] :$aux_states=, - Maybe[HashRef[AI::MXNet::Context]] :$group2ctx=, - Maybe[AI::MXNet::Executor] :$shared_exec= -) -{ - $grad_req //= 'write'; - my $listed_arguments = $self->list_arguments(); - my ($args_handle, $args_grad_handle, $aux_args_handle) = ([], [], []); - ($args_handle, $args) = $self->_get_ndarray_inputs('args', $args, $listed_arguments); - if(not defined $args_grad) - { - @$args_grad_handle = ((undef) x (@$args)); - } - else - { - ($args_grad_handle, $args_grad) = $self->_get_ndarray_inputs( - 'args_grad', $args_grad, $listed_arguments, 1 - ); - } - - if(not defined $aux_states) - { - $aux_states = []; - } - ($aux_args_handle, $aux_states) = $self->_get_ndarray_inputs( - 'aux_states', $aux_states, $self->list_auxiliary_states() - ); - - # setup requirements - my $req_map = { null => 0, write => 1, add => 3 }; - my $req_array = []; - if(not ref $grad_req) - { - confess('grad_req must be one of "null", "write" or "add"') - unless exists $req_map->{ $grad_req }; - @{ $req_array } = (($req_map->{ $grad_req }) x @{ $listed_arguments }); - } - elsif(ref $grad_req eq 'ARRAY') - { - @{ $req_array } = map { $req_map->{ $_ } } @{ $grad_req }; - } - else - { - for my $name (@{ $listed_arguments }) - { - if(exists $grad_req->{ $name }) - { - push @{ $req_array }, $req_map->{ $grad_req->{ $name } }; - } - else - { - push @{ $req_array }, 0; - } - } - } - - my $ctx_map_keys = []; - my $ctx_map_dev_types = []; - my $ctx_map_dev_ids = []; - - if(defined $group2ctx) - { - while(my ($key, $val) = each %{ $group2ctx }) - { - push @{ $ctx_map_keys } , $key; - push @{ $ctx_map_dev_types }, $val->device_type_id; - push @{ $ctx_map_dev_ids }, $val->device_id; - } - } - # avoid the unreliable 'my ... if' construct; use a ternary instead - my $shared_handle = $shared_exec ? $shared_exec->handle : undef; - my $handle = check_call(AI::MXNetCAPI::ExecutorBindEX( - $self->handle, - $ctx->device_type_id, - $ctx->device_id, - scalar(@{ $ctx_map_keys }), - $ctx_map_keys, - $ctx_map_dev_types, - $ctx_map_dev_ids, - scalar(@{ $args }), - $args_handle, - $args_grad_handle, - $req_array, - scalar(@{ $aux_states }), - $aux_args_handle, - $shared_handle - ) - ); - my $executor = AI::MXNet::Executor->new( - handle => $handle, - symbol => $self, - ctx => $ctx, - grad_req => $grad_req, - group2ctx => $group2ctx - ); - $executor->arg_arrays($args); - $executor->grad_arrays($args_grad); - $executor->aux_arrays($aux_states); - return $executor; -} - -=head2 eval - - Evaluate a symbol given arguments. - - The `eval` method combines a call to `bind` (which returns an executor) - with a call to `forward` (executor method). - For the common use case, where you might repeatedly evaluate with the same arguments, - eval is slow. - In that case, you should call `bind` once and then repeatedly call forward. - Eval allows simpler syntax for less cumbersome introspection. - - Parameters - ---------- - :$ctx : Context - The device context the generated executor will run on. - Optional, defaults to cpu(0) - - :$args array ref of NDArray or hash ref of NDArray - - - If the type is an array ref of NDArray, the position is in the same order as list_arguments. - If the type is a hash of str to NDArray, then it maps the name of the argument - to the corresponding NDArray. 
- - In either case, all arguments must be provided. - - Returns - ------- - result : an array ref of NDArrays corresponding to the values - taken by each symbol when evaluated on given args. - When called on a single symbol (not a group), - the result will be an array ref with one element. - - Examples: - my $result = $symbol->eval(ctx => mx->gpu, args => {data => mx->nd->ones([5,5])}); - my $result = $symbol->eval(args => {data => mx->nd->ones([5,5])}); - -=cut - -method eval(:$ctx=AI::MXNet::Context->cpu, HashRef[AI::MXNet::NDArray]|ArrayRef[AI::MXNet::NDArray] :$args) -{ - return $self->bind(ctx => $ctx, args => $args)->forward; -} - -=head2 grad - - Get the autodiff of the current symbol. - This function can only be used if the current symbol is a loss function. - - Parameters - ---------- - $wrt : array ref of str - The names of the arguments with respect to which the gradients are taken. - - Returns - ------- - grad : AI::MXNet::Symbol - A gradient Symbol whose outputs are the corresponding gradients. -=cut - -method grad(ArrayRef[Str] $wrt) -{ - my $handle = check_call(AI::MXNetCAPI::SymbolGrad( - $self->handle, - scalar(@$wrt), - $wrt - ) - ); - return __PACKAGE__->new(handle => $handle); -} - -=head2 Variable - - Create a symbolic variable with the specified name. - - Parameters - ---------- - name : str - Name of the variable. - attr : hash ref of string -> string - Additional attributes to set on the variable. - shape : array ref of positive integers - Optionally, one can specify the shape of a variable. This will be used during - shape inference. If user specified a different shape for this variable using - keyword argument when calling shape inference, this shape information will be ignored. - lr_mult : float - Specify learning rate multiplier for this variable. - wd_mult : float - Specify weight decay multiplier for this variable. - dtype : Dtype - Similar to shape, we can specify dtype for this variable. - init : initializer (mx->init->*) - Specify initializer for this variable to override the default initializer - kwargs : hash ref - Other additional attributes to set on the variable. - Returns - ------- - variable : Symbol - The created variable symbol. -=cut - -method Variable( - Str $name, - HashRef[Str] :$attr={}, - Maybe[Shape] :$shape=, - Maybe[Num] :$lr_mult=, - Maybe[Num] :$wd_mult=, - Maybe[Dtype] :$dtype=, - Maybe[Stype] :$stype=, - Maybe[Initializer] :$init=, - HashRef[Str] :$kwargs={}, - Maybe[Str] :$__layout__= -) -{ - my $handle = check_call(AI::MXNetCAPI::SymbolCreateVariable($name)); - my $ret = __PACKAGE__->new(handle => $handle); - $attr = AI::MXNet::Symbol::AttrScope->current->get($attr); - $attr->{__shape__} = "(".join(',', @{ $shape }).")" if $shape; - $attr->{__lr_mult__} = $lr_mult if defined $lr_mult; - $attr->{__wd_mult__} = $wd_mult if defined $wd_mult; - $attr->{__dtype__} = DTYPE_STR_TO_MX->{ $dtype } if $dtype; - $attr->{__init__} = "$init" if defined $init; - $attr->{__layout__} = $__layout__ if defined $__layout__; - $attr->{__storage_type__} = STORAGE_TYPE_STR_TO_ID->{$stype} if defined $stype; - while(my ($k, $v) = each %{ $kwargs }) - { - if($k =~ /^__/ and $k =~ /__$/) - { - $attr->{$k} = "$v"; - } - else - { - confess("Attribute name=$k is not supported.". - ' Additional attributes must start and end with double underscores,'. - ' e.g. __yourattr__' - ); - } - } - $ret->_set_attr(%{ $attr }); - return $ret; -} - -=head2 var - - A synonym for Variable. -=cut - -*var = \&Variable; - -=head2 Group - - Create a symbol that groups symbols together. 
- - Parameters - ---------- - symbols : array ref - List of symbols to be grouped. - - Returns - ------- - sym : Symbol - The created group symbol. -=cut - -method Group(ArrayRef[AI::MXNet::Symbol] $symbols) -{ - my @handles = map { $_->handle } @{ $symbols }; - my $handle = check_call(AI::MXNetCAPI::SymbolCreateGroup(scalar(@handles), \@handles)); - return __PACKAGE__->new(handle => $handle); } - -=head2 load - - Load symbol from a JSON file. - - You can also use Storable to do the job if you only work with Perl. - The advantage of load/save is that the file is language agnostic. - This means the file saved using save can be loaded by other language bindings of mxnet. - You also get the benefit of being able to directly load/save from cloud storage (S3, HDFS) - - Parameters - ---------- - fname : str - The name of the file, examples: - - - `s3://my-bucket/path/my-s3-symbol` - - `hdfs://my-bucket/path/my-hdfs-symbol` - - `/path-to/my-local-symbol` - - Returns - ------- - sym : Symbol - The loaded symbol. - - See Also - -------- - AI::MXNet::Symbol->save : Used to save symbol into file. -=cut - -method load(Str $fname) -{ - my $handle = check_call(AI::MXNetCAPI::SymbolCreateFromFile($fname)); - return __PACKAGE__->new(handle => $handle); -} - -=head2 load_json - - Load symbol from a JSON string. - - Parameters - ---------- - json_str : str - A JSON string. - - Returns - ------- - sym : Symbol - The loaded symbol. - - See Also - -------- - AI::MXNet::Symbol->tojson : Used to save symbol into a JSON string. -=cut - -method load_json(Str $json) -{ - my $handle = check_call(AI::MXNetCAPI::SymbolCreateFromJSON($json)); - return __PACKAGE__->new(handle => $handle); -} - -method zeros(Shape :$shape, Dtype :$dtype='float32', Maybe[Str] :$name=, Maybe[Str] :$__layout__=) -{ - return __PACKAGE__->_zeros({ shape => $shape, dtype => $dtype, name => $name, ($__layout__ ? (__layout__ => $__layout__) : ()) }); -} - -method ones(Shape :$shape, Dtype :$dtype='float32', Maybe[Str] :$name=, Maybe[Str] :$__layout__=) -{ - return __PACKAGE__->_ones({ shape => $shape, dtype => $dtype, name => $name, ($__layout__ ? (__layout__ => $__layout__) : ()) }); -} - -=head2 arange - - Similar to the function of the same name in the MXNet ndarray module and to numpy.arange. - See Also https://docs.scipy.org/doc/numpy/reference/generated/numpy.arange.html. - - Parameters - ---------- - :$start=0 : number - Start of interval. The interval includes this value. The default start value is 0. - :$stop= : number, optional - End of interval. The interval does not include this value. - :$step=1.0 : number, optional - Spacing between values. - :$repeat=1 : int, optional - The number of times each element is repeated. - E.g. with repeat=3, the element a will be repeated three times --> a, a, a. - :$infer_range=0 : Bool - When set to 1, infer stop position from start, step, repeat, and - output tensor size. - :$dtype='float32' : type, optional - The value type of the NDArray, defaults to 'float32' - - Returns - ------- - out : Symbol - The created Symbol -=cut - -method arange(Index :$start=0, Index :$stop=, Num :$step=1.0, Index :$repeat=1, Bool :$infer_range=0, Maybe[Str] :$name=, Dtype :$dtype='float32') -{ - return __PACKAGE__->_arange({ - start => $start, (defined $stop ? 
(stop => $stop) : ()), - step => $step, repeat => $repeat, name => $name, dtype => $dtype, - infer_range => $infer_range - }); -} - - -sub _parse_arguments -{ - my $type = shift; - my @args = @_; - my $type_c = find_type_constraint($type); - my $str_c = find_type_constraint("Str"); - my @positional_arguments; - my %kwargs; - my @kwargs_order; - my $only_dtypes_and_undefs = (@args == grep { not defined($_) or $type_c->check($_) } @args); - my $only_dtypes_and_strs = (@args == grep { $type_c->check($_) or $str_c->check($_) } @args); - if(@args % 2 and $only_dtypes_and_undefs) - { - @positional_arguments = @args; - } - else - { - if($only_dtypes_and_undefs) - { - @positional_arguments = @args; - } - elsif($only_dtypes_and_strs) - { - my %tmp = @args; - if(values(%tmp) == grep { $type_c->check($_) } values(%tmp)) - { - %kwargs = %tmp; - my $i = 0; - @kwargs_order = grep { $i ^= 1 } @args; - } - else - { - confess("Argument needs to be of type $type"); - } - } - else - { - confess("Argument needs to be one type $type"); - } - } - return (\@positional_arguments, \%kwargs, \@kwargs_order); -} - -sub _ufunc_helper -{ - my ($lhs, $rhs, $fn_symbol, $lfn_scalar, $rfn_scalar, $reverse) = @_; - ($rhs, $lhs) = ($lhs, $rhs) if $reverse and $rfn_scalar; - if(not ref $lhs) - { - if(not $rfn_scalar) - { - return __PACKAGE__->can($lfn_scalar)->(__PACKAGE__, $rhs, { "scalar" => $lhs }); - } - else - { - return __PACKAGE__->can($rfn_scalar)->(__PACKAGE__, $rhs, { "scalar" => $lhs }); - } - } - elsif(not ref $rhs) - { - return __PACKAGE__->can($lfn_scalar)->(__PACKAGE__, $lhs, { "scalar" => $rhs }); - } - else - { - return __PACKAGE__->can($fn_symbol)->(__PACKAGE__, $lhs, $rhs); - } -} - -method histogram(@args) { __PACKAGE__->_histogram(@args%2 ? ('data', @args) : @args) } - -sub contrib { 'AI::MXNet::Contrib::Symbol' } -sub random { 'AI::MXNet::Symbol::Random' } -sub sparse { 'AI::MXNet::Symbol::Sparse' } -sub linalg { 'AI::MXNet::LinAlg::Symbol' } -sub image { 'AI::MXNet::Image::Symbol' } - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/AttrScope.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/AttrScope.pm deleted file mode 100644 index 549939f006aa..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/AttrScope.pm +++ /dev/null @@ -1,94 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Symbol::AttrScope; -use strict; -use warnings; -use Mouse; -use AI::MXNet::NS; -use AI::MXNet::Function::Parameters; -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - return $class->$orig(attr => {@_}); -}; - -=head1 NAME - - AI::MXNet::Symbol::AttrScope - Attribute manager for local scoping. - -=head1 DESCRIPTION - - Attribute manager for scoping. - - User can also inherit this object to change naming behavior. 
- - Parameters - ---------- - kwargs - The attributes to set for all symbol creations in the scope. -=cut - -has 'attr' => ( - is => 'ro', - isa => 'HashRef[Str]', -); - -=head2 current - - Get the attribute hash ref given the attribute set by the symbol. - - Returns - ------- - $attr : current value of the class singleton object -=cut - -method current() -{ - $AI::MXNet::AttrScope; -} - -method set_current(AI::MXNet::Symbol::AttrScope $new) -{ - $AI::MXNet::AttrScope = $new; -} - -=head2 get - - Get the attribute hash ref given the attribute set by the symbol. - - Parameters - ---------- - $attr : Maybe[HashRef[Str]] - The attribute passed in by user during symbol creation. - - Returns - ------- - $attr : HashRef[Str] - The attributes updated to include another the scope related attributes. -=cut - -method get(Maybe[HashRef[Str]] $attr=) -{ - return bless($attr//{}, 'AI::MXNet::Util::Printable') unless %{ $self->attr }; - my %ret = (%{ $self->attr }, %{ $attr//{} }); - return bless (\%ret, 'AI::MXNet::Util::Printable'); -} - -__PACKAGE__->AI::MXNet::NS::register('AI::MXNet'); - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Base.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Base.pm deleted file mode 100644 index d668decc6918..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Base.pm +++ /dev/null @@ -1,210 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Symbol::Base; -use strict; -use warnings; -use AI::MXNet::Base; -use AI::MXNet::Symbol::AttrScope; -use AI::MXNet::Symbol::Doc; -use AI::MXNet::Symbol::NameManager; -use Mouse; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::Symbol::Base -=cut - -=head1 DESCRIPTION - - A convenience class that loads all C++ symbol related functions at runtime. 
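 For instance, once this module has loaded the operators, every backend op is
 callable as a class method on the symbol class (a sketch; num_hidden => 128 is
 an arbitrary illustrative value, cf. mlp2 in AI::MXNet::TestUtils):

     my $data = AI::MXNet::Symbol->Variable('data');
     my $fc1  = AI::MXNet::Symbol->FullyConnected(data => $data, name => 'fc1', num_hidden => 128);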
-=cut - -my %function_meta; -method function_meta($code) -{ - return $function_meta{$code}; -} - -method function_meta_hash() -{ - return \%function_meta; -} - -sub _compose -{ - my $self = shift; - my (@args, %kwargs); - while(ref $_[0]) - { - push @args, shift(@_); - } - %kwargs = @_; - my $name = delete $kwargs{'name'}; - if(@args and %kwargs) - { - confess("_compose only accept input Symbols \ - either as positional or keyword arguments, not both"); - } - if(grep { not blessed($_) or not $_->isa(__PACKAGE__) } (@args, values %kwargs)) - { - confess("_compose expect 'Symbol' as arguments"); - } - - my $num_args = scalar(@args) + scalar(keys %kwargs); - my $keys = []; - my $args = []; - for my $key (keys %kwargs) - { - push @$keys, $key; - push @$args, $kwargs{ $key }->handle; - } - @$args = map { $_->handle } @args if @args; - check_call( - AI::NNVMCAPI::SymbolCompose( - $self->handle, $name, $num_args, $keys, $args - ) - ); -} - -# Create an atomic symbol function by handle and funciton name -func _make_atomic_symbol_function($handle, $name) -{ - my ($real_name, $desc, $arg_names, - $arg_types, $arg_descs, $key_var_num_args, - $ret_type) = @{ check_call(AI::MXNetCAPI::SymbolGetAtomicSymbolInfo($handle)) }; - $ret_type //= ''; - my $func_name = $name; - my @arguments; - my %arguments = map { $_ => 1 } qw/name attr lr_mult wd_mult - init __layout__ dtype shape/; - for my $i (0..@{ $arg_names }-1) - { - push @arguments, $arg_names->[$i]; - $arguments{ $arg_names->[$i] } = 1; - } - my $doc_str = build_doc($func_name, - $desc, - $arg_names, - $arg_types, - $arg_descs, - $key_var_num_args, - $ret_type - ); - my $creator = sub { - my $class = ref($_[0]) || shift; - my (@args, %kwargs); - if( - @_ - and - ref $_[-1] eq 'HASH' - and - not (@_ >= 2 and not blessed $_[-2] and $_[-2] eq 'attr') - ) - { - %kwargs = %{ pop(@_) }; - @args = @_; - } - elsif(blessed $_[0] and $_[0]->isa(__PACKAGE__)) - { - - while(blessed $_[0] and $_[0]->isa(__PACKAGE__)) - { - push @args, shift(@_); - } - %kwargs = @_; - } - else - { - while(@_ >= 2 and not ref $_[-2] - and (exists $arguments{ $_[-2] } or (blessed $_[-1] and $_[-1]->isa(__PACKAGE__)))) - { - my $v = pop(@_); - my $k = pop(@_); - $kwargs{ $k } = $v; - } - @kwargs{ @arguments[0..@args-1] } = @args; - } - if(blessed $class and $class->isa(__PACKAGE__)) - { - $kwargs{data} = $class; - } - my $params = {}; - my $symbol_kwargs = {}; - my $attr = delete $kwargs{ 'attr' }; - %kwargs = (%kwargs, % { AI::MXNet::Symbol::AttrScope->current->get($attr) }); - $name = delete $kwargs{ 'name' }; - if($key_var_num_args and not exists $kwargs { $key_var_num_args }) - { - $params->{ $key_var_num_args } = scalar(@args); - } - for my $key (keys %kwargs) - { - $kwargs{ $key } = "(" .join(", ", map { defined($_) ? 
$_ : 'None' } @{ $kwargs{ $key } }) .")" - if ref $kwargs{ $key } eq 'ARRAY'; - } - while(my ($k, $v) = each %kwargs) - { - if(blessed($v) and $v->isa(__PACKAGE__)) - { - $symbol_kwargs->{ $k } = $v; - } - else - { - $params->{ $k } = "$v"; - } - } - # create atomic symbol - my $sym_handle = check_call( - AI::MXNetCAPI::SymbolCreateAtomicSymbol( - $handle, - scalar(keys %$params), - $params - ) - ); - my $s = $class->new(handle => $sym_handle); - my $hint = lc($func_name); - $name = AI::MXNet::Symbol::NameManager->current->get($name, $hint); - $s->_compose(@args, name => $name, %$symbol_kwargs); - return $s; - }; - $function_meta{ $creator }{__name__} = $func_name; - $function_meta{ $creator }{__doc__} = $doc_str; - return $creator; -} - -method _init_symbol_module() -{ - my $op_names = check_call(AI::MXNetCAPI::ListAllOpNames()); - for my $name (@$op_names) - { - my $handle = check_call(AI::NNVMCAPI::GetOpHandle($name)); - my $function = _make_atomic_symbol_function($handle, $name); - { - no strict 'refs'; - { - *{__PACKAGE__."::$name"} = $function; - } - } - } -} - - -__PACKAGE__->_init_symbol_module; - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Doc.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Doc.pm deleted file mode 100644 index 1d9a2c1288ea..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Doc.pm +++ /dev/null @@ -1,61 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Symbol::Doc; -use strict; -use warnings; -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; -use Exporter; -use base qw(Exporter); -@AI::MXNet::Symbol::Doc::EXPORT = qw/build_doc/; - -method get_output_shape(AI::MXNet::Symbol $sym, %input_shapes) -{ - my $s_outputs = $sym->infer_shape(%input_shapes); - my %ret; - @ret{ @{ $sym->list_outputs() } } = @$s_outputs; - return bless \%ret, 'AI::MXNet::Util::Printable'; -} - -func build_doc( - Str $func_name, - Str $desc, - ArrayRef[Str] $arg_names, - ArrayRef[Str] $arg_types, - ArrayRef[Str] $arg_desc, - Str $key_var_num_args=, - Str $ret_type= -) -{ - my $param_str = build_param_doc($arg_names, $arg_types, $arg_desc); - if($key_var_num_args) - { - $desc .= "\nThis function support variable length of positional input." - } - my $doc_str = sprintf("%s\n\n" . - "%s\n" . - "name : string, optional.\n" . - " Name of the resulting symbol.\n\n" . - "Returns\n" . - "-------\n" . - "symbol: Symbol\n" . 
- " The result symbol.", $desc, $param_str); - return $doc_str; -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/NameManager.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/NameManager.pm deleted file mode 100644 index 2238a4366789..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/NameManager.pm +++ /dev/null @@ -1,117 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Symbol::NameManager; -use strict; -use warnings; -use Mouse; -use AI::MXNet::Function::Parameters; - -=head1 NAME - - AI::MXNet::Symbol::NameManager - Automated symbol naming. - -=head1 DESCRIPTION - - NameManager that does an automatic naming. - - A user can also inherit this object to change the naming behavior. -=cut - -has 'counter' => ( - is => 'ro', - isa => 'HashRef', - default => sub { +{} } -); - -our $current; - -=head2 get - - Get the canonical name for a symbol. - - This is default implementation. - When user specified a name, - the user specified name will be used. - - When user did not, we will automatically generate a - name based on hint string. - - Parameters - ---------- - name : str or undef - The name the user has specified. - - hint : str - A hint string, which can be used to generate name. - - Returns - ------- - full_name : str - A canonical name for the symbol. -=cut - -method get(Maybe[Str] $name, Str $hint) -{ - return $name if $name; - if(not exists $self->counter->{ $hint }) - { - $self->counter->{ $hint } = 0; - } - $name = sprintf("%s%d", $hint, $self->counter->{ $hint }); - $self->counter->{ $hint }++; - return $name; -} - -method current() -{ - $AI::MXNet::NameManager; -} - -method set_current(AI::MXNet::Symbol::NameManager $new) -{ - $AI::MXNet::NameManager = $new; -} - -package AI::MXNet::Symbol::Prefix; -use Mouse; - -=head1 NAME - - AI::MXNet::Symbol::Prefix -=cut - -extends 'AI::MXNet::Symbol::NameManager'; - -=head1 DESCRIPTION - - A name manager that always attaches a prefix to all names. -=cut - -has prefix => ( - is => 'ro', - isa => 'Str', - required => 1 -); - -method get(Maybe[Str] $name, Str $hint) -{ - $name = $self->SUPER::get($name, $hint); - return $self->prefix . $name; -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Random.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Random.pm deleted file mode 100644 index 795fe44825e4..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Random.pm +++ /dev/null @@ -1,58 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Symbol::Random; -use strict; -use warnings; -use Scalar::Util qw/blessed/; - -sub AUTOLOAD { - my $sub = $AI::MXNet::Symbol::Random::AUTOLOAD; - $sub =~ s/.*:://; - shift; - my @args = @_; - if($sub eq 'exponential') - { - my $changed = 0; - for my $i (0..@args-1) - { - if(not ref $args[$i] and $args[$i] eq 'scale') - { - $args[$i] = 'lam'; - $args[$i+1] = 1/$args[$i+1]; - $changed = 1; - } - } - $args[0] = 1/$args[0] unless $changed; - } - if(grep { blessed($_) and $_->isa('AI::MXNet::Symbol') } @args) - { - if($sub eq 'normal') - { - my %mapping = qw/loc mu scale sigma/; - @args = map { (not ref $_ and exists $mapping{$_}) ? $mapping{$_} : $_ } @args - } - $sub = "_sample_$sub"; - } - else - { - $sub = "_random_$sub"; - } - return AI::MXNet::Symbol->$sub(@args); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Sparse.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Sparse.pm deleted file mode 100644 index b81cba9010b9..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol/Sparse.pm +++ /dev/null @@ -1,33 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Symbol::Sparse; -use strict; -use warnings; -use Mouse; -extends 'AI::MXNet::Symbol'; - -sub AUTOLOAD { - my $sub = $AI::MXNet::Symbol::Sparse::AUTOLOAD; - $sub =~ s/.*:://; - shift; - my @args = @_; - $sub = "_sparse_$sub"; - return AI::MXNet::Symbol->$sub(@args); -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/TestUtils.pm b/perl-package/AI-MXNet/lib/AI/MXNet/TestUtils.pm deleted file mode 100644 index 45f13dbf4e53..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/TestUtils.pm +++ /dev/null @@ -1,819 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::TestUtils; -use strict; -use warnings; -use AI::MXNet 'mx'; -use PDL; -use Carp qw(confess); -use Scalar::Util qw(blessed); -use List::Util qw(shuffle); -use AI::MXNet::Function::Parameters; -use AI::MXNet::Base; -use Exporter; -use base qw(Exporter); -our @EXPORT_OK = qw(same reldiff almost_equal GetMNIST_ubyte - GetCifar10 pdl_maximum pdl_minimum mlp2 dies_ok - check_consistency zip assert enumerate same_array dies_like allclose rand_shape_2d - rand_shape_3d rand_sparse_ndarray random_arrays rand_ndarray randint pdl); -use constant default_numerical_threshold => 1e-6; -=head1 NAME - - AI::MXNet::TestUtils - Convenience subs used in tests. - -=head2 same - - Test if two pdl arrays are the same - - Parameters - ---------- - a : pdl - b : pdl -=cut - -func isNaN(PDL $a) -{ - # Only NaN is not greater, equal to, or less than zero. - return !(($a > 0) + ($a <= 0)); -} - -func same(PDL $a, PDL $b) -{ - my $rv = ($a != $b) - isNaN($a) * isNaN($b); - return $rv->sum == 0; -} - -=head2 allclose - - Test if all elements of two pdl arrays are almost equal - - Parameters - ---------- - a : pdl - b : pdl -=cut - -func allclose(PDL $a, PDL $b, Maybe[Num] $threshold=) -{ - return (($a - $b)->abs <= ($threshold//default_numerical_threshold))->all; -} - -=head2 reldiff - - Calculate the relative difference between two input arrays - - Calculated as |a - b|_1 / (|a|_1 + |b|_1) - - Parameters - ---------- - a : pdl - b : pdl -=cut - -func reldiff(PDL $a, PDL $b) -{ - my $diff = sum(abs($a - $b)); - my $norm = sum(abs($a)) + sum(abs($b)); - if($diff == 0) - { - return 0; - } - my $ret = $diff / $norm; - return $ret; -} - -=head2 almost_equal - - Test if two pdl arrays are almost equal. 
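 For illustration, with made-up values (the default threshold is 1e-6):

     use PDL;
     use AI::MXNet::TestUtils qw(almost_equal);
     almost_equal(pdl([1, 2.0000001]), pdl([1, 2]));   # true:  reldiff ~ 1.7e-8
     almost_equal(pdl([1, 2.5]), pdl([1, 2]), 1e-3);   # false: reldiff ~ 0.077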
-=cut - -func almost_equal(PDL $a, PDL $b, Maybe[Num] $threshold=) -{ - $threshold //= default_numerical_threshold; - my $rel = reldiff($a, $b); - return $rel <= $threshold; -} - -func GetMNIST_ubyte() -{ - if(not -d "data") - { - mkdir "data"; - } - if ( - not -f 'data/train-images-idx3-ubyte' - or - not -f 'data/train-labels-idx1-ubyte' - or - not -f 'data/t10k-images-idx3-ubyte' - or - not -f 'data/t10k-labels-idx1-ubyte' - ) - { - `wget http://data.mxnet.io/mxnet/data/mnist.zip -P data`; - chdir 'data'; - `unzip -u mnist.zip`; - chdir '..'; - } -} - -func GetCifar10() -{ - if(not -d "data") - { - mkdir "data"; - } - if (not -f 'data/cifar10.zip') - { - `wget http://data.mxnet.io/mxnet/data/cifar10.zip -P data`; - chdir 'data'; - `unzip -u cifar10.zip`; - chdir '..'; - } -} - -func _pdl_compare(PDL $a, PDL|Num $b, Str $criteria) -{ - if(not blessed $b) - { - my $tmp = $b; - $b = $a->copy; - $b .= $tmp; - } - my $mask = { - 'max' => sub { $_[0] < $_[1] }, - 'min' => sub { $_[0] > $_[1] }, - }->{$criteria}->($a, $b); - my $c = $a->copy; - $c->where($mask) .= $b->where($mask); - $c; -} - -func pdl_maximum(PDL $a, PDL|Num $b) -{ - _pdl_compare($a, $b, 'max'); -} - -func pdl_minimum(PDL $a, PDL|Num $b) -{ - _pdl_compare($a, $b, 'min'); -} - -func mlp2() -{ - my $data = AI::MXNet::Symbol->Variable('data'); - my $out = AI::MXNet::Symbol->FullyConnected(data=>$data, name=>'fc1', num_hidden=>1000); - $out = AI::MXNet::Symbol->Activation(data=>$out, act_type=>'relu'); - $out = AI::MXNet::Symbol->FullyConnected(data=>$out, name=>'fc2', num_hidden=>10); - return $out; -} - -=head2 check_consistency - - Check symbol gives the same output for different running context - - Parameters - ---------- - sym : Symbol or list of Symbols - symbol(s) to run the consistency test - ctx_list : list - running context. See example for more detail. - scale : float, optional - standard deviation of the inner normal distribution. Used in initialization - grad_req : str or list of str or dict of str to str - gradient requirement. 
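 A sketch of a typical call (the symbol, shapes, and contexts are made up; each
 ctx_list entry is a hash ref of named arguments that is passed straight to
 simple_bind, so input shapes go under its 'shapes' key):

     my $sym = mx->symbol->FullyConnected(data => mx->symbol->Variable('data'), num_hidden => 3);
     check_consistency(
         sym      => $sym,
         ctx_list => [
             { ctx => mx->cpu(0), shapes => { data => [2, 4] } },
             { ctx => mx->gpu(0), shapes => { data => [2, 4] } },  # assumes a GPU is available
         ]
     );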
-=cut - -my %dtypes = ( - float32 => 0, - float64 => 1, - float16 => 2, - uint8 => 3, - int32 => 4 -); - -func check_consistency( - SymbolOrArrayOfSymbols :$sym, - ArrayRef :$ctx_list, - Num :$scale=1, - Str|ArrayRef[Str]|HashRef[Str] :$grad_req='write', - Maybe[HashRef[AI::MXNet::NDArray]] :$arg_params=, - Maybe[HashRef[AI::MXNet::NDArray]] :$aux_params=, - Maybe[HashRef[Num]|Num] :$tol=, - Bool :$raise_on_err=1, - Maybe[AI::MXNet::NDArray] :$ground_truth= -) -{ - $tol //= { - float16 => 1e-1, - float32 => 1e-3, - float64 => 1e-5, - uint8 => 0, - int32 => 0 - }; - $tol = { - float16 => $tol, - float32 => $tol, - float64 => $tol, - uint8 => $tol, - int32 => $tol - } unless ref $tol; - - Test::More::ok(@$ctx_list > 1); - if(blessed $sym) - { - $sym = [($sym)x@$ctx_list]; - } - else - { - Test::More::ok(@$sym == @$ctx_list); - } - my $output_names = $sym->[0]->list_outputs; - my $arg_names = $sym->[0]->list_arguments; - my @exe_list; - zip(sub { - my ($s, $ctx) = @_; - Test::More::is_deeply($s->list_arguments, $arg_names); - Test::More::is_deeply($s->list_outputs, $output_names); - push @exe_list, $s->simple_bind(grad_req=>$grad_req, %$ctx); - }, $sym, $ctx_list); - $arg_params //= {}; - $aux_params //= {}; - my %arg_dict = %{ $exe_list[0]->arg_dict }; - while(my ($n, $arr) = each %arg_dict) - { - if(not exists $arg_params->{$n}) - { - $arg_params->{$n} = random(reverse @{ $arr->shape })*$scale; - } - } - my %aux_dict = %{ $exe_list[0]->aux_dict }; - while(my ($n, $arr) = each %aux_dict) - { - if(not exists $aux_params->{$n}) - { - $aux_params->{$n} = 0; - } - } - for my $exe(@exe_list) - { - %arg_dict = %{ $exe->arg_dict }; - while(my ($name, $arr) = each %arg_dict) - { - $arr .= $arg_params->{$name}; - } - %aux_dict = %{ $exe->aux_dict }; - while(my ($name, $arr) = each %aux_dict) - { - $arr .= $aux_params->{$name}; - } - } - my @dtypes = map { $_->outputs->[0]->dtype } @exe_list; - my $max_idx = pdl(map { $dtypes{$_} } @dtypes)->maximum_ind; - my $gt = $ground_truth; - if(not defined $gt) - { - $gt = { %{ $exe_list[$max_idx]->output_dict } }; - if($grad_req ne 'null') - { - %{$gt} = (%{$gt}, %{ $exe_list[$max_idx]->grad_dict }); - } - } - - # test - for my $exe (@exe_list) - { - $exe->forward(0); - } - enumerate(sub { - my ($i, $exe) = @_; - if($i == $max_idx) - { - return; - } - zip(sub { - my ($name, $arr) = @_; - my $gtarr = $gt->{$name}->astype($dtypes[$i])->aspdl; - $arr = $arr->aspdl; - Test::More::ok( - almost_equal( - $arr, $gtarr, - $tol->{$dtypes[$i]} - ) - ); - }, $output_names, $exe->outputs); - }, \@exe_list); - - # train - if ($grad_req ne 'null') - { - for my $exe (@exe_list) - { - $exe->forward(1); - $exe->backward($exe->outputs); - } - enumerate(sub { - my ($i, $exe) = @_; - return if($i == $max_idx); - zip(sub { - my ($name, $arr) = @_; - if (not defined $gt->{$name}) - { - Test::More::ok(not defined $arr); - return; - } - my $gtarr = $gt->{$name}->astype($dtypes[$i])->aspdl; - $arr = $arr->aspdl; - Test::More::ok( - almost_equal( - $arr, $gtarr, - $tol->{$dtypes[$i]} - ) - ); - }, [@$output_names, @$arg_names], [@{ $exe->outputs }, @{ $exe->grad_arrays }]); - }, \@exe_list); - } - return $gt; -} - -=head2 same_array - - Check whether two NDArrays share the same memory block - - Parameters - ---------- - - array1 : NDArray - First NDArray to be checked - array2 : NDArray - Second NDArray to be checked - - Returns - ------- - bool - Whether two NDArrays share the same memory -=cut - -func same_array( - AI::MXNet::NDArray $array1, - AI::MXNet::NDArray $array2 -) -{ - 
$array1 += 1; - if(not same($array1->aspdl, $array2->aspdl)) - { - $array1 -= 1; - return 0 - } - $array1 -= 1; - return same($array1->aspdl, $array2->aspdl); -} - -func dies_like($code, $regexp) -{ - eval { $code->() }; - if($@ =~ $regexp) - { - return 1; - } - else - { - warn $@; - return 0; - } -} - -func random_arrays(@shapes) -{ - my @arrays = map { random(reverse(@$_))->float } @shapes; - if(@arrays > 1) - { - return @arrays; - } - else - { - return $arrays[0]; - } -} - - -func _validate_csr_generation_inputs( - $num_rows, $num_cols, $density, - $distribution="uniform" -) -{ - my $total_nnz = int($num_rows * $num_cols * $density); - if($density < 0 or $density > 1) - { - confess("density has to be between 0 and 1"); - } - if($num_rows <= 0 or $num_cols <= 0) - { - confess("num_rows or num_cols should be greater than 0"); - } - if($distribution eq "powerlaw") - { - if($total_nnz < 2 * $num_rows) - { - confess( - "not supported for this density: $density" - ." for this shape ($num_rows, $num_cols)" - ." Please keep :" - ." num_rows * num_cols * density >= 2 * num_rows" - ); - } - } -} - -# Shuffle CSR column indices per row -# This allows validation of unordered column indices, which is not a requirement -# for a valid CSR matrix - -func shuffle_csr_column_indices($csr) -{ - my $row_count = @{ $csr->indptr } - 1; - for my $i (0..$row_count-1) - { - my $start_index = $csr->indptr->[$i]; - my $end_index = $csr->indptr->[$i + 1]; - my @sublist = @{$csr->indices}[$start_index .. $end_index]; - @sublist = shuffle(@sublist); - @{$csr->indices}[$start_index .. $end_index] = @sublist; - } -} - - -func _get_uniform_dataset_csr( - $num_rows, $num_cols, $density=0.1, $dtype='float32', - $data_init=, $shuffle_csr_indices=0) -{ - # Returns CSRNDArray with uniform distribution - # This generates a csr matrix with totalnnz unique randomly chosen numbers - # from num_rows*num_cols and arranges them in the 2d array in the - # following way: - # row_index = (random_number_generated / num_rows) - # col_index = random_number_generated - row_index * num_cols - - _validate_csr_generation_inputs( - $num_rows, $num_cols, $density, - "uniform" - ); - my $csr = rand_sparse($num_rows, $num_cols, $density, $dtype, "csr"); - if(defined $data_init) - { - $csr->data->fill($data_init); - } - if($shuffle_csr_indices) - { - shuffle_csr_column_indices($csr); - } - return mx->nd->sparse->csr_matrix( - [$csr->data, $csr->indices, $csr->indptr], - shape => [$num_rows, $num_cols], dtype => $dtype - ); -} - -func _get_powerlaw_dataset_csr($num_rows, $num_cols, $density=0.1, $dtype='float32') -{ - # Returns CSRNDArray with powerlaw distribution - # with exponentially increasing number of non zeros in each row. - # Not supported for cases where total_nnz < 2*num_rows. This is because - # the algorithm first tries to ensure that there are rows with no zeros by - # putting non zeros at beginning of each row. - - _validate_csr_generation_inputs($num_rows, $num_cols, $density, - "powerlaw"); - - my $total_nnz = int($num_rows * $num_cols * $density); - - my $unused_nnz = $total_nnz; - my $output_arr = zeros($num_cols, $num_rows); - # Start with ones on each row so that no row is empty - for my $row (0..$num_rows-1) - { - $output_arr->slice(0, $row) .= 1 + rand(2); - $unused_nnz--; - if($unused_nnz <= 0) - { - return mx->nd->array($output_arr)->tostype("csr"); - } - } - # Populate rest of matrix with 2^i items in ith row. 
- # if we have used all total nnz return the sparse matrix - # else if we reached max column size then fill up full columns until we use all nnz - my $col_max = 2; - for my $row (0..$num_rows-1) - { - my $col_limit = List::Util::min($num_cols, $col_max); - # In case col_limit reached assign same value to all elements, which is much faster - if($col_limit == $num_cols and $unused_nnz > $col_limit) - { - $output_arr->slice('X', $row) .= 1 + rand(2); - $unused_nnz = $unused_nnz - $col_limit + 1; - if($unused_nnz <= 0) - { - return mx->nd->array($output_arr)->tostype("csr"); - } - } - else - { - for my $col_index (1..$col_limit-1) - { - $output_arr->slice($col_index, $row) .= 1 + rand(2); - $unused_nnz--; - if($unused_nnz <= 0) - { - return mx->nd->array($output_arr)->tostype("csr"); - } - } - $col_max *= 2; - } - } - - if($unused_nnz > 0) - { - warn $unused_nnz; - confess( - "not supported for this density: $density" - ." for this shape ($num_rows,$num_cols)" - ); - } - else - { - return mx->nd->array($output_arr)->tostype("csr"); - } -} - - -func assign_each($input, $function=) -{ - my $res = pdl($input); - if(defined $function) - { - return $function->($res); - } - return $res; -} - -func assign_each2($input1, $input2, $function=) -{ - my $res = pdl($input1); - if(defined $function) - { - return $function->($res, pdl($input2)); - } - return $res; -} - -=head2 rand_sparse_ndarray - - Generate a random sparse ndarray. Returns the ndarray, value(np) and indices(np) - - Parameters - ---------- - shape: list or tuple - stype: str, valid values: "csr" or "row_sparse" - density, optional: float, should be between 0 and 1 - distribution, optional: str, valid values: "uniform" or "powerlaw" - dtype, optional: numpy.dtype, default value is None - - Returns - ------- - Result of type CSRNDArray or RowSparseNDArray - - Examples - -------- - Below is an example of the powerlaw distribution with csr as the stype. - It calculates the nnz using the shape and density. - It fills up the ndarray with exponentially increasing number of elements. - If there are enough unused_nnzs, n+1th row will have twice more nnzs compared to nth row. - else, remaining unused_nnzs will be used in n+1th row - If number of cols is too small and we have already reached column size it will fill up - all following columns in all followings rows until we reach the required density. 
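 In this module's Perl API, the call walked through below might be written as
 (a sketch; density and distribution are named parameters, while the [5, 16]
 shape and 'csr' stype are positional, per the signature that follows this POD):

     my ($csr_arr) = rand_sparse_ndarray(
         [5, 16], 'csr',
         density => 0.5, distribution => 'powerlaw'
     );
     my $indptr = $csr_arr->indptr->aspdl;   # row pointers of the CSR result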
- - >>> csr_arr, _ = rand_sparse_ndarray(shape=(5, 16), stype="csr", - density=0.50, distribution="powerlaw") - >>> indptr = csr_arr.indptr.asnumpy() - >>> indices = csr_arr.indices.asnumpy() - >>> data = csr_arr.data.asnumpy() - >>> row2nnz = len(data[indptr[1]:indptr[2]]) - >>> row3nnz = len(data[indptr[2]:indptr[3]]) - >>> assert(row3nnz == 2*row2nnz) - >>> row4nnz = len(data[indptr[3]:indptr[4]]) - >>> assert(row4nnz == 2*row3nnz) -=cut - -func rand_sparse_ndarray( - $shape, $stype, :$density=rand, :$dtype='float32', :$distribution='uniform', - :$data_init=, :$rsp_indices=, :$modifier_func=, - :$shuffle_csr_indices=0 -) -{ - if($stype eq 'row_sparse') - { - assert ( - ($distribution eq "uniform"), - "Distribution $distribution not supported for row_sparse" - ); - # sample index - my $indices; - if(defined $rsp_indices) - { - $indices = $rsp_indices; - assert($indices->nelem <= $shape->[0]); - } - else - { - my $idx_sample = random($shape->[0]); - $indices = which($idx_sample < $density); - } - if($indices->shape(-1)->at(0) == 0) - { - my $result = mx->nd->zeros($shape, stype=>'row_sparse', dtype=>$dtype); - return ($result, [pdl([]), pdl([])]); - } - # generate random values - my $val = random(PDL::Type->new(DTYPE_MX_TO_PDL->{ $dtype }), reverse($indices->shape(-1)->at(0), @{ $shape }[1..@{ $shape }-1])); - - # Allow caller to override or adjust random values - if(defined $data_init) - { - $val .= $data_init; - } - if(defined $modifier_func) - { - $val = assign_each($val, $modifier_func); - } - my $arr = mx->nd->sparse->row_sparse_array([$val, $indices], shape=>$shape, dtype=>$dtype); - return ($arr, [$val, $indices]); - } - elsif($stype eq 'csr') - { - assert(@{ $shape } == 2); - my $csr; - if($distribution eq "uniform") - { - $csr = _get_uniform_dataset_csr( - @{ $shape }, $density, $dtype, - $data_init, $shuffle_csr_indices - ); - return ($csr, [$csr->indptr, $csr->indices, $csr->data]); - } - elsif($distribution eq "powerlaw") - { - $csr = _get_powerlaw_dataset_csr(@{ $shape }, $density, $dtype); - return ($csr, [$csr->indptr, $csr->indices, $csr->data]); - } - else - { - confess("Distribution not supported: $distribution"); - } - } - else - { - confess("unknown storage type"); - } -} - -func rand_ndarray( - $shape, $stype, $density=rand, $dtype='float32', - $modifier_func=, $shuffle_csr_indices=0, $distribution='uniform' -) -{ - my $arr; - if($stype eq 'default') - { - $arr = mx->nd->array(random_arrays($shape), dtype=>$dtype); - } - else - { - ($arr) = rand_sparse_ndarray( - $shape, $stype, density => $density, dtype => $dtype, - modifier_func => $modifier_func, - shuffle_csr_indices => $shuffle_csr_indices, distribution => $distribution - ); - } - return $arr; -} - - -func create_sparse_array( - $shape, $stype, $data_init=, $rsp_indices=, - $dtype=, $modifier_func=, $density=0.5, - $shuffle_csr_indices=0 -) -{ - my $arr_data; - if($stype eq 'row_sparse') - { - my $arr_indices; - if(defined $rsp_indices) - { - $arr_indices = pdl($rsp_indices); - $arr_indices->inplace->qsort; - } - ($arr_data) = rand_sparse_ndarray( - $shape, $stype, - $density, $dtype, - $data_init, - $arr_indices, - $modifier_func - ); - } - elsif($stype eq 'csr') - { - ($arr_data) = rand_sparse_ndarray( - $shape, - $stype, - $density, $dtype, - $data_init, - $modifier_func, - $shuffle_csr_indices - ); - } - else - { - confess("Unknown storage type: $stype"); - } - return $arr_data; -} - - -func create_sparse_array_zd( - $shape, $stype, $density, $data_init=, - $rsp_indices=, $dtype=, $modifier_func=, - 
$shuffle_csr_indices=0 -) -{ - if($stype eq 'row_sparse') - { - $density = 0; - if(defined $rsp_indices) - { - assert($rsp_indices->len <= $shape->[0]); - } - } - return create_sparse_array( - $shape, $stype, - $data_init, - $rsp_indices, - $dtype, - $modifier_func, - $density, - $shuffle_csr_indices - ); -} - -func rand_shape_2d($dim0=10, $dim1=10) -{ - [int(rand($dim0)+1), int(rand($dim1)+1)]; -} - - -func rand_shape_3d($dim0=10, $dim1=10, $dim2=10) -{ - [int(rand($dim0)+1), int(rand($dim1)+1), int(rand($dim1)+1)]; -} - - -func rand_shape_nd($num_dim, $dim=10) -{ - (random($num_dim)*$dim+1)->floor->unpdl; -} - -func randint($low=0, $high=10) -{ - my $value = int(rand($high)); - return $value < $low ? $low : $value; -} - -sub dies_ok -{ - my $sub = shift; - eval { $sub->() }; - if($@) - { - Test::More::ok(1); - } - else - { - Test::More::ok(0); - } -} - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Types.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Types.pm deleted file mode 100644 index 835d36c81c1b..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Types.pm +++ /dev/null @@ -1,85 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -package AI::MXNet::Types; -use strict; -use warnings; -use Mouse::Util::TypeConstraints; -use Exporter; -use base qw(Exporter); -@AI::MXNet::Types::EXPORT = qw(find_type_constraint enum); - -class_type 'PDL'; -class_type 'PDL::Matrix'; -class_type 'PDL::CCS::Nd'; -class_type 'AI::MXNet::NDArray'; -class_type 'AI::MXNet::Symbol'; -class_type 'AI::MXNet::NDArray::Slice'; -class_type 'AI::MXNet::Executor'; -class_type 'AI::MXNet::DataDesc'; -class_type 'AI::MXNet::Callback'; -class_type 'AI::MXNet::EvalMetric'; -class_type 'AI::MXNet::DataParallelExecutorGroup'; -class_type 'AI::MXNet::Optimizer'; -class_type 'AI::MXNet::Initializer'; -class_type 'AI::MXNet::KVStore'; -class_type 'AI::MXNet::InitDesc'; -class_type 'AI::MXNet::IRHeader'; -class_type 'AI::MXNet::Updater'; -class_type 'AI::MXNet::KVStore'; -class_type 'AI::MXNet::Gluon::Block'; -class_type 'AI::MXNet::Gluon::Data::Set'; -class_type 'AI::MXNet::Gluon::RNN::HybridRecurrentCell'; -class_type 'AI::MXNet::Symbol::NameManager'; -class_type 'AI::MXNet::NDarray::CSR'; -class_type 'AI::MXNet::NDArray::RowSparse'; -class_type 'PDL::CCS::Nd'; -subtype "AcceptableInput" => as "Num|PDL|PDL::Matrix|PDL::CCS::Nd|AI::MXNet::NDArray|AI::MXNet::NDArray::Slice|ArrayRef|AI::MXNet::NDarray::CSR|AI::MXNet::NDArray::RowSparse"; -subtype "Index" => as "Int"; -subtype "DimSize" => as "Int" => where { $_ >= 0 }; -subtype "Dropout" => as "Num" => where { $_ >= 0 and $_ <= 1 }; -subtype "Shape" => as "ArrayRef[DimSize]"; -subtype "CudaKernelShape" => as "Shape" => where { @$_ == 3 }; -subtype "WholeDim" => as "Str" => where { $_ eq 'X' }; -subtype "Slice" => as "ArrayRef[Index]|WholeDim|Index" => where { ref $_ ? @$_ > 0 : 1 }; -subtype "Dtype" => as enum([qw[float32 float64 float16 uint8 int8 int32 int64]]); -subtype "GluonClass" => as enum([qw[AI::MXNet::NDArray AI::MXNet::Symbol]]); -subtype "GluonInput" => as "AI::MXNet::NDArray|AI::MXNet::Symbol|ArrayRef[AI::MXNet::NDArray|AI::MXNet::Symbol]"; -subtype "GradReq" => as enum([qw[add write null]]); -subtype "KVStoreStr" => as enum([qw[local device dist dist_sync dist_async]]); -subtype "PoolType" => as enum([qw[max avg sum]]); -subtype "NameShape" => as "ArrayRef" => where { - find_type_constraint("Str")->check($_->[0]) - and - find_type_constraint("Shape")->check($_->[1]) -}; -subtype "Callback" => as "CodeRef|ArrayRef[Coderef]|AI::MXNet::Callback|ArrayRef[AI::MXNet::Callback]"; -subtype "EvalMetric" => as "AI::MXNet::EvalMetric|Str|CodeRef"; -subtype "Metric" => as "Maybe[EvalMetric]"; -subtype "Optimizer" => as "AI::MXNet::Optimizer|Str"; -subtype "Initializer" => as "AI::MXNet::Initializer|Str"; -subtype "Updater" => as "AI::MXNet::Updater|CodeRef"; -subtype "KVStore" => as "AI::MXNet::KVStore|KVStoreStr"; -subtype "Activation" => as "AI::MXNet::Symbol|Str|CodeRef"; -subtype "SymbolOrArrayOfSymbols" => as "AI::MXNet::Symbol|ArrayRef[AI::MXNet::Symbol]"; -subtype "NameShapeOrDataDesc" => as "NameShape|AI::MXNet::DataDesc"; -subtype "AdvancedSlice" => as "ArrayRef[ArrayRef|PDL|PDL::Matrix|AI::MXNet::NDArray]"; -subtype "InternalSlice" => as enum([qw[begin end step]]); -subtype "Stype" => as enum([qw[default csr row_sparse]]); -subtype "AuxTypes" => as "ArrayRef[Dtype]"; - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Util/Printable.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Util/Printable.pm deleted file mode 100644 index 1ae6c2d26c96..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Util/Printable.pm +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under 
one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNet::Util::Printable; -use strict; -use warnings; -use Data::Dumper qw(); -use overload '""' => sub { Data::Dumper->new([shift])->Purity(1)->Deepcopy(1)->Terse(1)->Dump }; - -1; diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm deleted file mode 100644 index 3abfda15eba0..000000000000 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm +++ /dev/null @@ -1,456 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License.
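The overload deleted above gives any class that inherits AI::MXNet::Util::Printable a Data::Dumper stringification. A self-contained sketch of the same idiom, with hypothetical package names:

    package My::Printable;
    use strict;
    use warnings;
    use Data::Dumper qw();
    # Objects blessed into a subclass stringify as a Data::Dumper dump.
    use overload '""' => sub {
        Data::Dumper->new([shift])->Purity(1)->Deepcopy(1)->Terse(1)->Dump
    };

    package My::Point;
    use parent -norequire, 'My::Printable';
    sub new { my ($class, %args) = @_; return bless { %args }, $class }

    package main;
    my $point = My::Point->new(x => 1, y => 2);
    print "$point";    # prints something like { 'x' => 1, 'y' => 2 }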
- -package AI::MXNet::Visualization; -use strict; -use warnings; -use AI::MXNet::NS; -use AI::MXNet::Base; -use AI::MXNet::Function::Parameters; -use JSON::PP; - -=encoding UTF-8 - -=head1 NAME - - AI::MXNet::Visualization - Visualization support for Perl interface to MXNet machine learning library - -=head1 SYNOPSIS - - use strict; - use warnings; - use AI::MXNet qw(mx); - - ### model - my $data = mx->symbol->Variable('data'); - my $conv1= mx->symbol->Convolution(data => $data, name => 'conv1', num_filter => 32, kernel => [3,3], stride => [2,2]); - my $bn1 = mx->symbol->BatchNorm(data => $conv1, name => "bn1"); - my $act1 = mx->symbol->Activation(data => $bn1, name => 'relu1', act_type => "relu"); - my $mp1 = mx->symbol->Pooling(data => $act1, name => 'mp1', kernel => [2,2], stride =>[2,2], pool_type=>'max'); - - my $conv2= mx->symbol->Convolution(data => $mp1, name => 'conv2', num_filter => 32, kernel=>[3,3], stride=>[2,2]); - my $bn2 = mx->symbol->BatchNorm(data => $conv2, name=>"bn2"); - my $act2 = mx->symbol->Activation(data => $bn2, name=>'relu2', act_type=>"relu"); - my $mp2 = mx->symbol->Pooling(data => $act2, name => 'mp2', kernel=>[2,2], stride=>[2,2], pool_type=>'max'); - - - my $fl = mx->symbol->Flatten(data => $mp2, name=>"flatten"); - my $fc1 = mx->symbol->FullyConnected(data => $fl, name=>"fc1", num_hidden=>30); - my $act3 = mx->symbol->Activation(data => $fc1, name=>'relu3', act_type=>"relu"); - my $fc2 = mx->symbol->FullyConnected(data => $act3, name=>'fc2', num_hidden=>10); - - ## creates the image file in the working directory - mx->viz->plot_network($fc2, save_format => 'png')->render("network.png"); - -=head1 DESCRIPTION - - Visualization support for Perl interface to MXNet machine learning library - -=head1 Class methods - -=head2 print_summary - - prints a detailed, layer-by-layer summary of the symbol - - Parameters - ---------- - symbol: AI::MXNet::Symbol - symbol to be visualized - shape: hashref - hashref of input shapes, name => shape (arrayref[int]) - line_length: int - total length of printed lines - positions: arrayref[float] - relative or absolute positions of log elements in each line - Returns - ------ - nothing -=cut - -method print_summary( - AI::MXNet::Symbol $symbol, - Maybe[HashRef[Shape]] $shape=, - Int $line_length=120, - ArrayRef[Num] $positions=[.44, .64, .74, 1] -) -{ - my $show_shape; - my %shape_dict; - if(defined $shape) - { - $show_shape = 1; - my $internals = $symbol->get_internals; - my (undef, $out_shapes, undef) = $internals->infer_shape(%{ $shape }); - Carp::confess("Input shape is incomplete") - unless defined $out_shapes; - @shape_dict{ @{ $internals->list_outputs } } = @{ $out_shapes }; - } - my $conf = decode_json($symbol->tojson); - my $nodes = $conf->{nodes}; - my %heads = map { $_ => 1 } @{ $conf->{heads}[0] }; - if($positions->[-1] <= 1) - { - $positions = [map { int($line_length * $_) } @{ $positions }]; - } - # header names for the different log elements - my $to_display = ['Layer (type)', 'Output Shape', 'Param #', 'Previous Layer']; - my $print_row = sub { my ($fields, $positions) = @_; - my $line = ''; - enumerate(sub { - my ($i, $field) = @_; - $line .= $field//''; - $line = substr($line, 0, $positions->[$i]); - $line .= ' ' x ($positions->[$i] - length($line)); - - }, $fields); - print $line,"\n"; - }; - print('_' x $line_length,"\n"); - $print_row->($to_display, $positions); - print('=' x $line_length,"\n"); - my $print_layer_summary = sub { my ($node, $out_shape) = @_; - my $op = $node->{op}; - my $pre_node = []; - my $pre_filter = 0; -
if($op ne 'null') - { - my $inputs = $node->{inputs}; - for my $item (@{ $inputs }) - { - my $input_node = $nodes->[$item->[0]]; - my $input_name = $input_node->{name}; - if($input_node->{op} ne 'null' or exists $heads{ $item->[0] }) - { - push @{ $pre_node }, $input_name; - if($show_shape) - { - my $key = $input_name; - $key .= '_output' if $input_node->{op} ne 'null'; - if(exists $shape_dict{ $key }) - { - $pre_filter = $pre_filter + int($shape_dict{$key}[1]//0); - } - } - } - } - } - my $cur_param = 0; - if($op eq 'Convolution') - { - my $num_filter = $node->{attrs}{num_filter}; - $cur_param = $pre_filter * $num_filter; - while($node->{attrs}{kernel} =~ /(\d+)/g) - { - $cur_param *= $1; - } - $cur_param += $num_filter; - } - elsif($op eq 'FullyConnected') - { - $cur_param = ($pre_filter + 1) * $node->{attrs}{num_hidden}; - } - elsif($op eq 'BatchNorm') - { - my $key = "$node->{name}_output"; - if($show_shape) - { - my $num_filter = $shape_dict{$key}[1]; - $cur_param = $num_filter * 2; - } - } - elsif($op eq 'Embedding') - { - $cur_param = $node->{attrs}{input_dim} * $node->{attrs}{output_dim}; - } - my $first_connection; - if(not @{ $pre_node }) - { - $first_connection = ''; - } - else - { - $first_connection = $pre_node->[0]; - } - my $fields = [ - $node->{name} . '(' . $op . ')', - join('x', @{ $out_shape }), - $cur_param, - $first_connection - ]; - $print_row->($fields, $positions); - if(@{ $pre_node } > 1) - { - for my $i (1..@{ $pre_node }-1) - { - $fields = ['', '', '', $pre_node->[$i]]; - $print_row->($fields, $positions); - } - } - return $cur_param; - }; - my $total_params = 0; - enumerate(sub { - my ($i, $node) = @_; - my $out_shape = []; - my $op = $node->{op}; - return if($op eq 'null' and $i > 0); - if($op ne 'null' or exists $heads{$i}) - { - if($show_shape) - { - my $key = $node->{name}; - $key .= '_output' if $op ne 'null'; - if(exists $shape_dict{ $key }) - { - my $end = @{ $shape_dict{ $key } }; - @{ $out_shape } = @{ $shape_dict{ $key } }[1..$end-1]; - } - } - } - $total_params += $print_layer_summary->($nodes->[$i], $out_shape); - if($i == @{ $nodes } - 1) - { - print('=' x $line_length, "\n"); - } - else - { - print('_' x $line_length, "\n"); - } - }, $nodes); - print("Total params: $total_params\n"); - print('_' x $line_length, "\n"); -} - -=head2 plot_network - - convert symbol to dot object for visualization - - Parameters - ---------- - title: str - title of the dot graph - symbol: AI::MXNet::Symbol - symbol to be visualized - shape: HashRef[Shape] - If supplied, the visualization will include the shape - of each tensor on the edges between nodes.
- node_attrs: HashRef of node attributes, - for example: - {shape => "oval", fixedsize => "false"} - plots the network nodes as ovals of variable size - hide_weights: Bool - if True (default) then inputs with names like `*_weight` - or `*_bias` will be hidden - - Returns - ------ - dot: Digraph - dot object of symbol -=cut - -method plot_network( - AI::MXNet::Symbol $symbol, - Str :$title='plot', - Str :$save_format='ps', - Maybe[HashRef[Shape]] :$shape=, - HashRef[Str] :$node_attrs={}, - Bool :$hide_weights=1 -) -{ - eval { require GraphViz; }; - Carp::confess("plot_network requires GraphViz module") if $@; - my $draw_shape; - my %shape_dict; - if(defined $shape) - { - $draw_shape = 1; - my $internals = $symbol->get_internals; - my (undef, $out_shapes, undef) = $internals->infer_shape(%{ $shape }); - Carp::confess("Input shape is incomplete") - unless defined $out_shapes; - @shape_dict{ @{ $internals->list_outputs } } = @{ $out_shapes }; - } - my $conf = decode_json($symbol->tojson); - my $nodes = $conf->{nodes}; - my %node_attr = ( - qw/ shape box fixedsize true - width 1.3 height 0.8034 style filled/, - %{ $node_attrs } - ); - my $dot = AI::MXNet::Visualization::PythonGraphviz->new( - graph => GraphViz->new(name => $title), - format => $save_format, - ); - # color map - my @cm = ( - "#8dd3c7", "#fb8072", "#ffffb3", "#bebada", "#80b1d3", - "#fdb462", "#b3de69", "#fccde5" - ); - # make nodes - my %hidden_nodes; - for my $node (@{ $nodes }) - { - my $op = $node->{op}; - my $name = $node->{name}; - # input data - my %attr = %node_attr; - my $label = $name; - if($op eq 'null') - { - if($name =~ /(?:_weight|_bias|_beta|_gamma|_moving_var|_moving_mean|running_var|running_mean)$/) - { - if($hide_weights) - { - $hidden_nodes{$name} = 1; - } - # else we don't render a node, but - # don't add it to the hidden_nodes set - # so it gets rendered as an empty oval - next; - } - $attr{shape} = 'ellipse'; # inputs get their own shape - $label = $name; - $label = 'plus' if $label =~ /plus\d+$/; - $attr{fillcolor} = $cm[0]; - } - elsif($op eq 'Convolution') - { - my @k = $node->{attrs}{kernel} =~ /(\d+)/g; - my @stride = ($node->{attrs}{stride}//'') =~ /(\d+)/g; - $stride[0] //= 1; - $label = "Convolution\n".join('x',@k).'/'.join('x',@stride).", $node->{attrs}{num_filter}"; - $attr{fillcolor} = $cm[1]; - } - elsif($op eq 'FullyConnected') - { - $label = "FullyConnected\n$node->{attrs}{num_hidden}"; - $attr{fillcolor} = $cm[1]; - } - elsif($op eq 'BatchNorm') - { - $attr{fillcolor} = $cm[3]; - $label = $op; - } - elsif($op eq 'Flatten') - { - $label = $op; - $attr{fillcolor} = $cm[5]; - } - elsif($op eq 'elemwise_add' or $op eq 'clip' or $op eq 'Concat') - { - $label = $op; - $attr{fillcolor} = $cm[5]; - } - elsif($op eq 'Dropout') - { - $label = "$op ($node->{attrs}{p})"; - } - elsif($op eq 'Reshape') - { - $label = "$op $node->{attrs}{shape}"; - $attr{fillcolor} = $cm[5]; - } - elsif($op eq 'Activation' or $op eq 'LeakyReLU') - { - $label = "$op\n$node->{attrs}{act_type}"; - $attr{fillcolor} = $cm[2]; - } - elsif($op eq 'Pooling') - { - my @k = $node->{attrs}{kernel} =~ /(\d+)/g; - my @stride = ($node->{attrs}{stride}//'') =~ /(\d+)/g; - $stride[0] //= 1; - $label = "Pooling\n$node->{attrs}{pool_type}, ".join('x',@k).'/'.join('x',@stride); - $attr{fillcolor} = $cm[4]; - } - elsif($op eq 'Softmax') - { - $attr{fillcolor} = $cm[6]; - } - else - { - $attr{fillcolor} = $cm[7]; - if($op eq 'Custom') - { - $label = $node->{attrs}{op_type}; - } - } - $dot->graph->add_node($name, label => $label, %attr); - }; - - # add edges -
for my $node (@{ $nodes }) - { - my $op = $node->{op}; - my $name = $node->{name}; - if($op eq 'null') - { - next; - } - else - { - my $inputs = $node->{inputs}; - for my $item (@{ $inputs }) - { - my $input_node = $nodes->[$item->[0]]; - my $input_name = $input_node->{name}; - if(not exists $hidden_nodes{ $input_name }) - { - my %attr = qw/dir back arrowtail normal/; - # add shapes - if($draw_shape) - { - my $key = $input_name; - $key .= '_output' if $input_node->{op} ne 'null'; - if($input_node->{op} ne 'null' and exists $input_node->{attrs}) - { - if(ref $input_node->{attrs} eq 'HASH' and exists $input_node->{attrs}{num_outputs}) - { - $key .= ($input_node->{attrs}{num_outputs} - 1); - } - } - my $end = @{ $shape_dict{$key} }; - $attr{label} = join('x', @{ $shape_dict{$key} }[1..$end-1]); - } - $dot->graph->add_edge($name => $input_name, %attr); - } - } - } - } - return $dot; -} - -package AI::MXNet::Visualization::PythonGraphviz; -use Mouse; -use AI::MXNet::Types; -has 'format' => ( - is => 'ro', - isa => enum([qw/debug canon text ps hpgl pcl mif - pic gd gd2 gif jpeg png wbmp cmapx - imap vdx vrml vtx mp fig svg svgz - plain/] - ) -); -has 'graph' => (is => 'ro', isa => 'GraphViz'); - -method render($output=) -{ - my $method = 'as_' . $self->format; - return $self->graph->$method($output); -} - -1; diff --git a/perl-package/AI-MXNet/t/AI-MXNet.t b/perl-package/AI-MXNet/t/AI-MXNet.t deleted file mode 100644 index ad5a0266e06f..000000000000 --- a/perl-package/AI-MXNet/t/AI-MXNet.t +++ /dev/null @@ -1,24 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 3; -BEGIN { use_ok('AI::MXNet') }; - -isa_ok(AI::MXNet->Context(), 'AI::MXNet::Context'); -isa_ok(AI::MXNet::Context->new(), 'AI::MXNet::Context'); diff --git a/perl-package/AI-MXNet/t/test_attr.t b/perl-package/AI-MXNet/t/test_attr.t deleted file mode 100644 index 1a91ae4d0a9a..000000000000 --- a/perl-package/AI-MXNet/t/test_attr.t +++ /dev/null @@ -1,125 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
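The PythonGraphviz wrapper deleted above resolves its output method at runtime: render() builds the method name 'as_' . format and invokes it on the wrapped GraphViz object. A self-contained sketch of that dispatch pattern, with hypothetical stub classes standing in for GraphViz:

    package Render::Dispatch;
    use strict;
    use warnings;

    sub new { my ($class, %args) = @_; return bless { %args }, $class }

    # Build the GraphViz-style method name ('png' -> 'as_png')
    # and call it dynamically on the wrapped graph object.
    sub render {
        my ($self, $output) = @_;
        my $method = 'as_' . $self->{format};
        return $self->{graph}->$method($output);
    }

    package Stub::Graph;
    sub new { return bless {}, shift }
    sub as_png { my (undef, $file) = @_; return "would write PNG to $file" }

    package main;
    my $viz = Render::Dispatch->new(graph => Stub::Graph->new, format => 'png');
    print $viz->render('network.png'), "\n";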
- -use strict; -use warnings; -use Test::More tests => 14; -use AI::MXNet qw(mx); -use Storable; - -sub contains -{ - my ($x, $y) = @_; - while(my ($k, $v) = each %$x) - { - return 0 unless exists $y->{$k}; - if(ref $y->{$k} and ref $y->{$k} eq 'HASH') - { - return 0 unless (ref $v and ref $v eq 'HASH'); - return 0 unless contains($v, $y->{$k}); - } - elsif($y->{$k} ne $v) - { - return 0; - } - } - return 1; -} - -sub test_attr_basic -{ - my ($data, $gdata); - { - local($mx::AttrScope) = mx->AttrScope(group=>'4', data=>'great'); - $data = mx->symbol->Variable( - 'data', - attr => { - qw/ dtype data - group 1 - force_mirroring 1/ - }, - lr_mult => 1); - $gdata = mx->symbol->Variable('data2'); - } - ok($gdata->attr('group') == 4); - ok($data->attr('group') == 1); - ok($data->attr('lr_mult') == 1); - ok($data->attr('__lr_mult__') == 1); - ok($data->attr('force_mirroring') == 1); - ok($data->attr('__force_mirroring__') == 1); - my $data2 = Storable::thaw(Storable::freeze($data)); - ok($data->attr('dtype') eq $data2->attr('dtype')); -} - -sub test_operator -{ - my $data = mx->symbol->Variable('data'); - my ($fc1, $fc2); - { - local($mx::AttrScope) = mx->AttrScope(__group__=>'4', __data__=>'great'); - $fc1 = mx->symbol->Activation($data, act_type=>'relu'); - { - local($mx::AttrScope) = mx->AttrScope(__init_bias__ => 0, - __group__=>'4', __data__=>'great'); - $fc2 = mx->symbol->FullyConnected($fc1, num_hidden=>10, name=>'fc2'); - } - } - ok($fc1->attr('__data__') eq 'great'); - ok($fc2->attr('__data__') eq 'great'); - ok($fc2->attr('__init_bias__') == 0); - my $fc2copy = Storable::thaw(Storable::freeze($fc2)); - ok($fc2copy->tojson() eq $fc2->tojson()); - ok($fc2->get_internals()->slice('fc2_weight')); -} - -sub test_list_attr -{ - my $data = mx->sym->Variable('data', attr=>{'mood', 'angry'}); - my $op = mx->sym->Convolution( - data=>$data, name=>'conv', kernel=>[1, 1], - num_filter=>1, attr => {'__mood__'=> 'so so', 'wd_mult'=> 'x'} - ); - ok(contains({'__mood__'=> 'so so', 'wd_mult'=> 'x', '__wd_mult__'=> 'x'}, $op->list_attr())); -} - -sub test_attr_dict -{ - my $data = mx->sym->Variable('data', attr=>{'mood'=> 'angry'}); - my $op = mx->sym->Convolution( - data=>$data, name=>'conv', kernel=>[1, 1], - num_filter=>1, attr=>{'__mood__'=> 'so so'}, lr_mult=>1 - ); - ok( - contains( - { - 'data'=> {'mood'=> 'angry'}, - 'conv_weight'=> {'__mood__'=> 'so so'}, - 'conv'=> { - 'kernel'=> '(1, 1)', '__mood__'=> 'so so', - 'num_filter'=> '1', 'lr_mult'=> '1', '__lr_mult__'=> '1' - }, - 'conv_bias'=> {'__mood__'=> 'so so'} - }, - $op->attr_dict() - ) - ); -} - -test_attr_basic(); -test_operator(); -test_list_attr(); -test_attr_dict(); diff --git a/perl-package/AI-MXNet/t/test_autograd.t b/perl-package/AI-MXNet/t/test_autograd.t deleted file mode 100644 index 2ddad60df989..000000000000 --- a/perl-package/AI-MXNet/t/test_autograd.t +++ /dev/null @@ -1,426 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use AI::MXNet qw(mx); -use AI::MXNet::AutoGrad qw(autograd); -use AI::MXNet::TestUtils qw(same almost_equal rand_ndarray); -use AI::MXNet::Base qw(:DEFAULT pones); -use Test::More tests => 246; -$ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; -$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; - -sub autograd_assert -{ - my $kwargs = {}; - if(ref $_[-1] eq 'HASH') { $kwargs = pop(@_) }; - my @args = @_; - my $func = $kwargs->{func}; - my $grad_f = $kwargs->{grad_func}; - my $argnum = $kwargs->{argnum}; - my $grad_func = autograd->grad_and_loss($func, $argnum); - my ($grad_vals, $output) = $grad_func->(@args); - my $res = $func->(@args); - ok(same($output->aspdl, $res->aspdl)); - my $grad_res = $grad_f->(@args); - ok(@$grad_vals == @$grad_res); - for(zip($grad_vals, $grad_res)) { - my ($a, $b) = @$_; - ok(same($a->aspdl, $b->aspdl)); - } -} - -sub test_unary_func -{ - my $check_unary_func = sub { - my ($x) = @_; - my $f_exp = sub { $_[0]->exp }; - my $f_exp_grad = sub { [$_[0]->exp] }; - autograd_assert($x, { func => $f_exp, grad_func => $f_exp_grad }); - my $f_half = sub { $_[0]/2 }; - my $f_half_grad = sub { [mx->nd->ones($_[0]->shape) * 0.5] }; - autograd_assert($x, { func => $f_half, grad_func => $f_half_grad }); - my $f_square = sub { $_[0]**2 }; - my $f_square_grad = sub { [2*$_[0]] }; - autograd_assert($x, { func => $f_square, grad_func => $f_square_grad }); - }; - my $uniform = mx->nd->uniform(shape=>[4, 5]); - $check_unary_func->($uniform); - my $stypes = ['row_sparse', 'csr', 'default']; - for my $stype (@$stypes) - { - $check_unary_func->($uniform->tostype($stype)); - } -} - -test_unary_func(); - -sub test_binary_func -{ - my $check_binary_func = sub { - my ($x, $y) = @_; - my $f_add = sub { $_[0]+$_[1] }; - my $f_add_grad = sub { [map { mx->nd->ones($_->shape) } @_] }; - autograd_assert($x, $y, { func => $f_add, grad_func => $f_add_grad }); - my $f_mul = sub { $_[0]*$_[1] }; - my $f_mul_grad = sub { [reverse(@_)] }; - autograd_assert($x, $y, { func => $f_mul, grad_func => $f_mul_grad }); - my $f_compose = sub { $_[0]+$_[0]*$_[1] }; - my $f_compose_grad = sub { [mx->nd->ones($_[0]->shape) + $y, $x] }; - autograd_assert($x, $y, { func => $f_compose, grad_func => $f_compose_grad }); - }; - my $uniform_x = mx->nd->uniform(shape=>[4, 5]); - my $uniform_y = mx->nd->uniform(shape=>[4, 5]); - $check_binary_func->($uniform_x, $uniform_y); - my $stypes = ['row_sparse', 'csr', 'default']; - for my $stype_x (@$stypes) - { - for my $stype_y (@$stypes) - { - my $x = $uniform_x->tostype($stype_x); - my $y = $uniform_y->tostype($stype_y); - $check_binary_func->($x, $y); - } - } -} - -test_binary_func(); - -sub test_operator_with_state -{ - my $f_fc = sub { - my ($a, $b, $weight, $bias) = @_; - my $x = $a*$b; - my $fc = mx->nd->FullyConnected( - $x, $weight, $bias, num_hidden=>32); - return $fc; - }; - - my $a = mx->nd->uniform(shape=>[64, 50]); - my $b = mx->nd->uniform(shape=>[64, 50]); - my $weight = mx->nd->uniform(shape=>[32, 50]); - my $bias = mx->nd->uniform(shape=>[32]); - - my $grad_func = autograd->grad_and_loss($f_fc); - my ($grad_vals, $outputs) = $grad_func->($a, 
$b, $weight, $bias); -} - -test_operator_with_state(); - -sub test_argnum -{ - my $f_with_mode = sub { - my ($a, $b, $mode) = @_; - if($mode) - { - return $a+$b; - } - else - { - return $a*$b; - } - }; - my $a = mx->nd->uniform(shape=>[3, 2]); - my $b = mx->nd->uniform(shape=>[3, 2]); - my $f_add_grad = sub { [map { mx->nd->ones($_->shape) } @_[0,1]] }; - my $f_mul_grad = sub { [reverse(@_[0,1])] }; - autograd_assert($a, $b, 1, - { argnum=>[0, 1], func=>$f_with_mode, grad_func=>$f_add_grad }); - autograd_assert($a, $b, 0, - { argnum=>[0, 1], func=>$f_with_mode, grad_func=>$f_mul_grad }); -} - -test_argnum(); - -sub test_training -{ - my $x = mx->nd->ones([10, 10]); - autograd->record(sub { - my $y = mx->nd->Dropout($x, p=>0.5); - ok(not ($y->aspdl == $x->aspdl)->all); - autograd->pause(sub { - my $y = mx->nd->Dropout($x, p=>0.5); - ok(($y->aspdl == $x->aspdl)->all); - }); - }); -} - -test_training(); - -sub test_out_grads -{ - my $x = mx->nd->ones([3, 5]); - my $dx = mx->nd->zeros_like($x); - autograd->mark_variables([$x], [$dx]); - my $da; - my $db = mx->nd->array([1,2,3,4,5]); - my $dc = mx->nd->array([5,4,3,2,1]); - - autograd->record(sub { - my ($a, $b, $c) = @{ $x }; - autograd->backward([$a, $b, $c], head_grads => [$da, $db, $dc]); - }); - ok(($dx->aspdl == pdl( - [[1,1,1,1,1], - [1,2,3,4,5], - [5,4,3,2,1]]))->all); -} - -test_out_grads(); - -sub test_detach_updated_grad -{ - my $x = mx->nd->ones([2, 2]); - my $dx = mx->nd->zeros_like($x); - my $y = mx->nd->ones_like($x); - my $dy = mx->nd->zeros_like($x); - autograd->mark_variables([$x, $y], [$dx, $dy]); - ok($x->_fresh_grad == 0); - ok($y->_fresh_grad == 0); - - autograd->record(sub { - my $x2 = $x + 2; - my $y2 = $x2 + $y; - $y2->backward(); - }); - ok(($dx->aspdl == 1)->all); - ok($x->_fresh_grad == 1); - ok($y->_fresh_grad == 1); - - $dx .= 0; - $x->_fresh_grad(0); - $y->_fresh_grad(0); - ok($x->_fresh_grad == 0); - ok($y->_fresh_grad == 0); - - autograd->record(sub { - my $x2 = $x + 2; - $x2 = $x2->detach; - my $y2 = $x2 + $y; - $y2->backward(); - }); - ok(($dx->aspdl == 0)->all); - ok($x->_fresh_grad == 0); - ok($y->_fresh_grad == 1); -} - -test_detach_updated_grad(); - -sub test_retain_grad -{ - my $x = mx->nd->ones([2, 2]); - my $dx = mx->nd->zeros([2, 2]); - autograd->mark_variables([$x], [$dx], grad_reqs=>'add'); - autograd->record(sub { - my $y = $x + 1; - $y->backward(retain_graph=>0); - }); - ok(($dx->aspdl == 1)->all); - - $dx .= 0; - autograd->record(sub { - my $y = $x + 1; - $y->backward(retain_graph=>1); - $y->backward(retain_graph=>0); - }); - ok(($dx->aspdl == 2)->all); - no warnings; - open(CPERR, ">&STDERR"); - open(STDERR, ">/dev/null"); - eval { - autograd->record(sub { - my $y = $x + 1; - $y->backward(); - $y->backward(); - }); - }; - open(STDERR, ">&CPERR"); - ok($@); -} - -test_retain_grad(); - -sub test_attach_grad -{ - my $check_attach_grad = sub { - my ($x) = @_; - ok(not defined $x->grad); - $x->attach_grad(); - autograd->record(sub { - my $y = $x * 2; - ok(not defined $y->grad); - $y->backward; - }); - ok(($x->grad->aspdl == 2)->all); - }; - my $zeros = mx->nd->zeros([10, 10]); - $check_attach_grad->($zeros); - my @stypes = ('default', 'row_sparse', 'csr'); - for my $stype (@stypes) - { - my $x = $zeros->tostype($stype); - $check_attach_grad->($x); - } -} - -test_attach_grad(); - -sub test_is_train -{ - my $x = mx->nd->ones([10, 10]); - $x->attach_grad(); - autograd->record(sub { - ok(autograd->is_recording()); - ok(autograd->is_training()); - my $y = mx->nd->Dropout($x, p=>0.5); - ok($y->aspdl->max == 
2 and $y->aspdl->min == 0); - $y->backward(); - ok(($x->grad->aspdl == $y->aspdl)->all); - autograd->predict_mode(sub { - ok(autograd->is_recording()); - ok(not autograd->is_training()); - my $y = mx->nd->Dropout($x, p=>0.5); - ok(($y->aspdl == $x->aspdl)->all); - $y->backward(train_mode=>0); - ok(($x->grad->aspdl == $x->aspdl)->all); - }); - }, train_mode => 1); - - autograd->record(sub { - ok(autograd->is_recording()); - ok(not autograd->is_training()); - my $y = mx->nd->Dropout($x, p=>0.5); - ok(($y->aspdl == $x->aspdl)->all); - $y->backward(train_mode=>0); - ok(($x->grad->aspdl == $x->aspdl)->all); - - autograd->train_mode(sub { - ok(autograd->is_recording); - ok(autograd->is_training); - my $y = mx->nd->Dropout($x, p=>0.5); - ok($y->aspdl->max == 2 and $y->aspdl->min == 0); - $y->backward; - ok(($x->grad->aspdl == $y->aspdl)->all); - }); - }, train_mode => 0); - - ok(not autograd->is_recording); - ok(not autograd->is_training); - my $y = mx->nd->Dropout($x, p=>0.5); - ok(($y->aspdl == $x->aspdl)->all); - - autograd->train_mode(sub { - ok(not autograd->is_recording); - ok(autograd->is_training); - my $y = mx->nd->Dropout($x, p=>0.5); - ok($y->aspdl->max == 2 and $y->aspdl->min == 0); - }); -} - -test_is_train(); - -sub test_get_symbol -{ - my $x = mx->nd->ones([1]); - $x->attach_grad; - my $y; - autograd->record(sub { - $y = $x*$x + 2*$x - 1; - }); - ok(@{ autograd->get_symbol($y)->list_arguments } == 1); - - my $z = mx->nd->ones([1]); - $z->attach_grad; - autograd->record(sub { - $y = $x*$x + 2*$z - 1; - }); - ok(@{ autograd->get_symbol($y)->list_arguments } == 2); -} - -test_get_symbol(); - -sub test_gradient -{ - my $x = mx->nd->ones([1]); - $x->attach_grad; - my $z; - mx->autograd->record(sub { - $z = mx->nd->elemwise_add($x->exp, $x); - }); - my $dx = mx->autograd->grad($z, $x, create_graph=>1); - ok(abs($dx->asscalar - 3.71828175) < 1e-7); - $dx->backward; - ok(abs($x->grad->asscalar - 2.71828175) < 1e-7); -} - -test_gradient(); - -sub test_grad_with_stype -{ - my $check_grad_with_stype = sub { my ($array_stype, $grad_stype, $expected_stype) = @_; - my $x = mx->nd->zeros([1, 1], stype=>$array_stype); - $x->attach_grad(stype=>$grad_stype); - # check grad attached - ok($x->grad->stype eq $expected_stype); - my $y = $x->detach(); - # check array detached - ok($y->stype eq $array_stype); - }; - my @stypes = ('default', 'csr', 'row_sparse'); - for my $stype (@stypes) - { - # check the default stype of the gradient (same as the array stype) - $check_grad_with_stype->($stype, undef, $stype); - for my $grad_stype (@stypes) - { - # check the stype of the gradient when provided - $check_grad_with_stype->($stype, $grad_stype, $grad_stype); - } - } -} - -test_grad_with_stype(); - -sub test_sparse_dot_grad -{ - my $check_sparse_dot_grad = sub { my ($rhs) = @_; - my $lhs = rand_ndarray([2, 8], 'csr'); - my $y; - mx->autograd->record(sub { - $y = mx->nd->dot($lhs, $rhs); - }); - $y->backward(); - my $grad = $rhs->grad; - my $grad_pdl = $lhs->aspdl->transpose x pones($rhs->shape->[1], $lhs->shape->[0]); - ok($grad->stype eq 'row_sparse'); - ok(almost_equal($grad->aspdl, $grad_pdl)); - }; - - # check grad with row_sparse weight - my $shape = [8, 3]; - my $rsp = mx->nd->ones($shape)->tostype('row_sparse'); - $rsp->attach_grad(); - $check_sparse_dot_grad->($rsp); - - # check grad with dense weight - my $dns = mx->nd->ones($shape); - $dns->attach_grad(stype=>'row_sparse'); - $check_sparse_dot_grad->($dns); -} - -test_sparse_dot_grad(); diff --git a/perl-package/AI-MXNet/t/test_base.t 
b/perl-package/AI-MXNet/t/test_base.t deleted file mode 100644 index e21b99a38c66..000000000000 --- a/perl-package/AI-MXNet/t/test_base.t +++ /dev/null @@ -1,124 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More; -use AI::MXNet qw(mx); - -sub test_builtin_zip() -{ - is_deeply( - [ AI::MXNet::zip([ 0 .. 9 ], [ 10 .. 19 ]) ], - [ map { [ $_, 10 + $_ ] } 0 .. 9 ]); - is_deeply( - [ AI::MXNet::zip([ 0 .. 9 ], [ 10 .. 19 ], [ 20 .. 29 ]) ], - [ map { [ $_, 10 + $_, 20 + $_ ] } 0 .. 9 ]); - my $over = ListOverload->new(10 .. 19); - is_deeply( - [ AI::MXNet::zip([ 0 .. 9 ], \@$over) ], - [ map { [ $_, 10 + $_ ] } 0 .. 9 ]); - my $tied = ListTied->new(10 .. 19); - is_deeply( - [ AI::MXNet::zip([ 0 .. 9 ], \@$tied) ], - [ map { [ $_, 10 + $_ ] } 0 .. 9 ]); -} - - -test_builtin_zip(); -done_testing(); - -package ListTied { - sub new { - my($class, @list) = @_; - my @tied; - tie @tied, $class, @list; - return \@tied; - } - sub TIEARRAY { - my($class, @list) = @_; - return bless { list => \@list }, $class; - } - sub FETCH { - my($self, $index) = @_; - return $self->{list}[$index]; - } - sub STORE { - my($self, $index, $value) = @_; - return $self->{list}[$index] = $value; - } - sub FETCHSIZE { - my($self) = @_; - return scalar @{$self->{list}}; - } - sub STORESIZE { - my($self, $count) = @_; - return $self->{list}[$count - 1] //= undef; - } - sub EXTEND { - my($self, $count) = @_; - return $self->STORESIZE($count); - } - sub EXISTS { - my($self, $key) = @_; - return exists $self->{list}[$key]; - } - sub DELETE { - my($self, $key) = @_; - return delete $self->{list}[$key]; - } - sub CLEAR { - my($self) = @_; - return @{$self->{list}} = (); - } - sub PUSH { - my($self, @list) = @_; - return push @{$self->{list}}, @list; - } - sub POP { - my($self) = @_; - return pop @{$self->{list}}; - } - sub SHIFT { - my($self) = @_; - return shift @{$self->{list}}; - } - sub UNSHIFT { - my($self, @list) = @_; - return unshift @{$self->{list}}, @list; - } - sub SPLICE { - my($self, $offset, $length, @list) = @_; - return splice @{$self->{list}}, $offset, $length, @list; - } - sub UNTIE { - my($self) = @_; - } - sub DESTROY { - my($self) = @_; - } -} - -package ListOverload { - use overload '@{}' => \&as_list; - sub new { - my($class, @list) = @_; - return bless { list => \@list }, $class; - } - sub as_list { return $_[0]{list} } -} - diff --git a/perl-package/AI-MXNet/t/test_cuda_module.t b/perl-package/AI-MXNet/t/test_cuda_module.t deleted file mode 100644 index 4576e766a3eb..000000000000 --- a/perl-package/AI-MXNet/t/test_cuda_module.t +++ /dev/null @@ -1,57 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use AI::MXNet qw(mx); -use Test::More tests => 3; -my $gpu_present = mx->context->num_gpus; - -sub test_cuda_rtc -{ - my $source = ' - extern "C" __global__ void axpy(const float *x, float *y, float alpha) { - int i = threadIdx.x + blockIdx.x * blockDim.x; - y[i] += alpha * x[i]; - } - - extern "C" __global__ void saxpy(const float *x, float *y, float alpha) { - extern __shared__ float smem[]; - int i = threadIdx.x + blockIdx.x * blockDim.x; - smem[threadIdx.x] = x[i]; - y[i] += alpha * smem[threadIdx.x]; - } - '; - my $module = mx->rtc->CudaModule($source); - my $axpy = $module->get_kernel("axpy", "const float *x, float *y, float alpha"); - my $x = mx->nd->ones([10], ctx=>mx->gpu(0)); - my $y = mx->nd->zeros([10], ctx=>mx->gpu(0)); - $axpy->launch([$x, $y, 3], mx->gpu(0), [1, 1, 1], [10, 1, 1]); - ok(($y->aspdl == 3)->all); - - my $saxpy = $module->get_kernel("saxpy", "const float *x, float *y, float alpha"); - $saxpy->launch([$x, $y, 4], mx->gpu(0), [1, 1, 1], [10, 1, 1], 10); - ok(($y->aspdl == 7)->all); - - $saxpy->launch([$x, $y, 5], mx->gpu(0), [2, 1, 1], [5, 1, 1], 5); - ok(($y->aspdl == 12)->all); -} - -SKIP: { - skip("GPU is not available", 3) unless $gpu_present; - test_cuda_rtc(); -} \ No newline at end of file diff --git a/perl-package/AI-MXNet/t/test_engine.t b/perl-package/AI-MXNet/t/test_engine.t deleted file mode 100644 index 4cf5744f0051..000000000000 --- a/perl-package/AI-MXNet/t/test_engine.t +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License.
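For reference, the launch() call exercised by the deleted test above takes the kernel arguments, a context, grid dimensions, block dimensions, and an optional shared-memory size. A minimal sketch reusing only the calls shown in that test; it assumes a CUDA-enabled MXNet build and exits quietly otherwise:

    use strict;
    use warnings;
    use AI::MXNet qw(mx);

    # Same guard as the deleted test: do nothing without a GPU.
    exit 0 unless mx->context->num_gpus;

    my $module = mx->rtc->CudaModule('
        extern "C" __global__ void axpy(const float *x, float *y, float alpha) {
            int i = threadIdx.x + blockIdx.x * blockDim.x;
            y[i] += alpha * x[i];
        }
    ');
    my $axpy = $module->get_kernel("axpy", "const float *x, float *y, float alpha");
    my $x = mx->nd->ones([10],  ctx => mx->gpu(0));
    my $y = mx->nd->zeros([10], ctx => mx->gpu(0));
    # args, context, grid dims (blocks), block dims (threads per block)
    $axpy->launch([$x, $y, 3], mx->gpu(0), [1, 1, 1], [10, 1, 1]);
    print $y->aspdl, "\n";    # every element should now be 3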
- -use strict; -use warnings; -use AI::MXNet qw(mx); -use Test::More tests => 2; - -sub test_bulk -{ - my $x; - mx->engine->bulk(10, sub { - $x = mx->nd->ones([10]); - $x *= 2; - $x += 1; - $x->wait_to_read(); - $x += 1; - ok(($x->aspdl == 4)->all); - for my $i (1..100) - { - $x += 1; - } - }); - ok(($x->aspdl == 104)->all); -} - -test_bulk(); \ No newline at end of file diff --git a/perl-package/AI-MXNet/t/test_executor.t b/perl-package/AI-MXNet/t/test_executor.t deleted file mode 100644 index 0c01bd50dad2..000000000000 --- a/perl-package/AI-MXNet/t/test_executor.t +++ /dev/null @@ -1,195 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 2285; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(reldiff pdl_maximum pdl_minimum); -use PDL; - -sub check_bind_with_uniform -{ - my ($uf, $gf, $dim, $sf, $lshape, $rshape) = @_; - my $shape = (random($dim)*int(1000**(1.0/$dim))+1)->floor->unpdl; - my $lhs = mx->symbol->Variable('lhs'); - my $rhs = mx->symbol->Variable('rhs'); - my $ret; - if(defined $sf) - { - $ret = &{$sf}($lhs, $rhs); - } - else - { - $ret = &{$uf}($lhs, $rhs); - } - - is_deeply($ret->list_arguments(), ['lhs', 'rhs']); - $lshape //= $shape; - $rshape //= $shape; - - my $lhs_arr = mx->nd->array(random(reverse (@$lshape))); - my $rhs_arr = mx->nd->array(random(reverse (@$rshape))); - my $lhs_grad = mx->nd->empty($lshape); - my $rhs_grad = mx->nd->empty($rshape); - my $executor = $ret->bind( - ctx => mx->Context('cpu'), - args => [$lhs_arr, $rhs_arr], - args_grad => [$lhs_grad, $rhs_grad] - ); - - my $exec3 = $ret->bind( - ctx => mx->Context('cpu'), - args => [$lhs_arr, $rhs_arr] - ); - - my $exec4 = $ret->bind( - ctx => mx->Context('cpu'), - args => {'rhs' => $rhs_arr, 'lhs' => $lhs_arr}, - args_grad=>{'lhs' => $lhs_grad, 'rhs' => $rhs_grad} - ); - - $executor->forward(1); - $exec3->forward(1); - $exec4->forward(1); - my $out2 = $executor->outputs->[0]->aspdl; - my $out1 = &{$uf}($lhs_arr->aspdl, $rhs_arr->aspdl); - my $out3 = $exec3->outputs->[0]->aspdl; - my $out4 = $exec4->outputs->[0]->aspdl; - ok(reldiff($out1, $out2) < 1e-6); - ok(reldiff($out1, $out3) < 1e-6); - ok(reldiff($out1, $out4) < 1e-6); - # test gradient - - my $out_grad = mx->nd->ones([reverse @{$out2->shape->unpdl}]); - my ($lhs_grad2, $rhs_grad2) = &{$gf}( - $out_grad->aspdl, - $lhs_arr->aspdl, - $rhs_arr->aspdl - ); - $executor->backward([$out_grad]); - - ok(reldiff($lhs_grad->aspdl, $lhs_grad2) < 1e-6); - ok(reldiff($rhs_grad->aspdl, $rhs_grad2) < 1e-6); -} - -sub test_bind -{ - my ($disable_bulk_exec) = @_; - my ($prev_fwd_var, $prev_bwd_var); - if($disable_bulk_exec) - { - $prev_fwd_var = $ENV{MXNET_EXEC_BULK_FWD_THRESHOLD_TRAIN}//1; - $prev_bwd_var = $ENV{MXNET_EXEC_BULK_BWD_TRAIN}//1; - 
$ENV{MXNET_EXEC_BULK_FWD_THRESHOLD_TRAIN} = 0; - $ENV{MXNET_EXEC_BULK_BWD_TRAIN} = 0; - } - srand(0); - my $nrepeat = 9; - my $maxdim = 3; - for my $repeat (0..$nrepeat) - { - for my $dim (1..$maxdim) - { - check_bind_with_uniform(sub { my ($x, $y) = @_; $x + $y }, - sub { my ($g) = @_; ($g, $g) }, - $dim); - check_bind_with_uniform(sub { my ($x, $y) = @_; $x - $y }, - sub { my ($g) = @_; ($g, -$g) }, - $dim); - check_bind_with_uniform(sub { my ($x, $y) = @_; $x * $y }, - sub { my ($g, $x, $y) = @_; ($g*$y, $g*$x) }, - $dim); - check_bind_with_uniform(sub { my ($x, $y) = @_; $x / $y }, - sub { my ($g, $x, $y) = @_; ($g / $y, -$x * $g/ ($y**2)) }, - $dim); - check_bind_with_uniform(sub { my ($x, $y) = @_; pdl_maximum($x, $y) }, - sub { my ($g, $x, $y) = @_; ($g * ($x>$y), $g * ($y>$x)) }, - $dim, - sub { $_[0]->maximum($_[1]) }); - check_bind_with_uniform(sub { my ($x, $y) = @_; pdl_minimum($x, $y) }, - sub { my ($g, $x, $y) = @_; ($g * ($x<$y), $g * ($y<$x)) }, - $dim, - sub { $_[0]->minimum($_[1]) }); - } - } - if($disable_bulk_exec) - { - $ENV{MXNET_EXEC_BULK_FWD_THRESHOLD_TRAIN} = $prev_fwd_var; - $ENV{MXNET_EXEC_BULK_BWD_TRAIN} = $prev_bwd_var; - } -} - - -sub test_dot -{ - srand(0); - my $nrepeat = 9; - my $maxdim = 4; - for my $repeat (0..$nrepeat) - { - my $shape = (random(3)*500+1)->floor->unpdl; - check_bind_with_uniform(sub { my ($x, $y) = @_; $x x $y }, - sub { my ($g, $x, $y) = @_; ($g x $y->transpose, $x->transpose x $g) }, - 2, - sub { mx->symbol->dot(@_) }, - [@{$shape}[0, 1]], - [@{$shape}[1, 2]], - ); - } - for my $repeat (0..$nrepeat) - { - my $shape = (random(1)*500+1)->floor->unpdl; - check_bind_with_uniform(sub { my ($x, $y) = @_; $x x $y->transpose }, - sub { my ($g, $x, $y) = @_; ($g * $y, $g * $x) }, - 2, - sub { mx->symbol->dot(@_) }, - [@{$shape}[0]], - [@{$shape}[0]], - ); - } -} - -sub test_reshape -{ - my $x = mx->sym->Variable('x'); - my $y = mx->sym->FullyConnected($x, num_hidden=>4); - my $exe = $y->simple_bind(ctx => mx->cpu(), shapes => { x=>[5,4] }, grad_req=>'null'); - $exe->arg_arrays->[0] .= 1; - $exe->arg_arrays->[1] .= mx->nd->ones([4,4]); - $exe->arg_arrays->[2] .= 0; - my $new_exe = $exe->reshape({ x=>[3,4] }); - $new_exe->forward(0); - # test sub exec forward - ok(($new_exe->outputs->[0]->aspdl == 4)->all); - # test shared memory - ok(($exe->outputs->[0]->aspdl->slice('X', [0,2]) == 4)->all); - # test base exec forward - $exe->forward(0); - ok(($new_exe->outputs->[0]->aspdl == 4)->all); - $new_exe = $exe->reshape({ x=>[6,4] }, allow_up_sizing=>1); - # data ndarray is not shared between exe and new_exe - $new_exe->arg_arrays->[0] .= 0; - ok(($exe->arg_arrays->[0]->aspdl == 1)->all); - # weight ndarray is shared between exe and new_exe - ok(($new_exe->arg_arrays->[1]->aspdl == 1)->all); -} - -test_bind(0); -test_bind(1); -test_dot(); -test_reshape(); diff --git a/perl-package/AI-MXNet/t/test_gluon.t b/perl-package/AI-MXNet/t/test_gluon.t deleted file mode 100644 index 545cb7b3f882..000000000000 --- a/perl-package/AI-MXNet/t/test_gluon.t +++ /dev/null @@ -1,1320 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 232; -use AI::MXNet qw(mx); -use AI::MXNet::Gluon qw(gluon); -use AI::MXNet::Gluon::NN qw(nn); -use AI::MXNet::TestUtils qw(almost_equal dies_ok); -use Scalar::Util qw(refaddr); -use AI::MXNet::Base; - -sub test_parameter -{ - my $p = gluon->Parameter('weight', shape=>[10, 10]); - $p->initialize(init=>'xavier', ctx=>[mx->cpu(0), mx->cpu(1)]); - ok(@{$p->list_data} == 2); - ok(@{$p->list_grad} == 2); - ok($p->data(mx->cpu(1))->context eq mx->cpu(1)); - is_deeply($p->data(mx->cpu(0))->shape, [10, 10]); - ok($p->var->name eq 'weight'); - ok($p->grad(mx->cpu(0))->stype eq 'default'); - ok($p->data(mx->cpu(0))->stype eq 'default'); - - $p->reset_ctx(ctx=>[mx->cpu(1), mx->cpu(2)]); - is_deeply($p->list_ctx, [mx->cpu(1), mx->cpu(2)]); -} - -test_parameter(); - -sub test_invalid_parameter_stype -{ - dies_ok(sub { gluon->Parameter('weight', shape=>[10, 10], stype=>'invalid') }); -} - -test_invalid_parameter_stype(); - -sub test_invalid_parameter_grad_stype -{ - dies_ok(sub { gluon->Parameter('weight', shape=>[10, 10], grad_stype=>'invalid') }); -} - -test_invalid_parameter_grad_stype(); - -sub test_sparse_parameter -{ - my $p = gluon->Parameter('weight', shape=>[10, 10], stype=>'row_sparse', grad_stype=>'row_sparse'); - $p->initialize(init=>'xavier', ctx=>[mx->cpu(0), mx->cpu(1)]); - my $row_id = mx->nd->arange(start => 0, stop => 10, ctx=>mx->cpu(1)); - ok(@{ $p->list_grad } == 2); - # getting row_sparse data without trainer throws an exception - dies_ok(sub { $p->list_row_sparse_data($row_id) }); - my $trainer = gluon->Trainer([$p], 'sgd'); - ok(@{ $p->list_row_sparse_data($row_id) } == 2); - my $weight = $p->row_sparse_data($row_id); - ok($weight->context eq mx->cpu(1)); - is_deeply($weight->shape, [10, 10]); - ok($weight->stype eq 'row_sparse'); - ok($p->var->name eq 'weight'); - ok($p->var->attr('__storage_type__') eq STORAGE_TYPE_STR_TO_ID->{row_sparse}); - ok($p->grad(mx->cpu(0))->stype eq 'row_sparse'); - - $p->reset_ctx(ctx=>[mx->cpu(1), mx->cpu(2)]); - is_deeply($p->list_ctx, [mx->cpu(1), mx->cpu(2)]); -} - -test_sparse_parameter(); - -sub test_parameter_invalid_access -{ - # cannot call data on row_sparse parameters - my $p0 = gluon->Parameter('weight', shape=>[10, 10], stype=>'row_sparse', grad_stype=>'row_sparse'); - $p0->initialize(init=>'xavier', ctx=>[mx->cpu(0), mx->cpu(1)]); - dies_ok(sub { $p0->data }); - dies_ok(sub { $p0->list_data }); - my $row_id = mx->nd->arange(start => 0, stop => 10); - # cannot call row_sparse_data on dense parameters - my $p1 = gluon->Parameter('weight', shape=>[10, 10]); - $p1->initialize(init=>'xavier', ctx=>[mx->cpu(0), mx->cpu(1)]); - dies_ok(sub { $p1->row_sparse_data($row_id->copyto(mx->cpu(0))) }); - dies_ok(sub { $p1->list_row_sparse_data($row_id) }); -} - -test_parameter_invalid_access(); - -sub test_paramdict -{ - my $ctx = mx->cpu(1); - my $params0 = gluon->ParameterDict('net_'); - $params0->get('w0', shape=>[10, 10]); - $params0->get('w1', shape=>[10, 10], stype=>'row_sparse'); - my $all_row_ids = mx->nd->arange(start => 0, stop => 10, ctx=>$ctx); - # check param names - 
is_deeply([$params0->keys()], ['net_w0', 'net_w1']); - $params0->initialize(ctx=>$ctx); - my $trainer0 = gluon->Trainer($params0, 'sgd'); - my $prev_w0 = $params0->get('w0')->data($ctx); - my $prev_w1 = $params0->get('w1')->row_sparse_data($all_row_ids); - # save params - $params0->save('test_paramdict.params'); - - # load params - my $params1 = gluon->ParameterDict('net_'); - $params1->get('w0', shape=>[10, 10]); - $params1->get('w1', shape=>[10, 10], stype=>'row_sparse'); - $params1->load('test_paramdict.params', ctx=>$ctx); - my $trainer1 = gluon->Trainer($params1, 'sgd'); - - # compare the values before and after save/load - my $cur_w0 = $params1->get('w0')->data($ctx); - my $cur_w1 = $params1->get('w1')->row_sparse_data($all_row_ids); - ok(almost_equal($prev_w0->aspdl, $cur_w0->aspdl)); - ok(almost_equal($prev_w1->aspdl, $cur_w1->aspdl)); - - # create a new param dict with dense params, and load from the checkpoint - # of sparse & dense params - my $params2 = gluon->ParameterDict('net_'); - $params2->get('w0', shape=>[10, 10]); - $params2->get('w1', shape=>[10, 10]); - $params2->load('test_paramdict.params', ctx=>$ctx); - - # compare the values before and after save/load - $cur_w0 = $params2->get('w0')->data($ctx); - $cur_w1 = $params2->get('w1')->data($ctx); - ok(almost_equal($prev_w0->aspdl, $cur_w0->aspdl)); - ok(almost_equal($prev_w1->aspdl, $cur_w1->aspdl)); -} - -test_paramdict(); - -sub test_parameter_row_sparse_data -{ - my $ctx0 = mx->cpu(1); - my $ctx1 = mx->cpu(2); - my $dim0 = 4; - my $x = gluon->Parameter('x', shape=>[$dim0, 2], stype=>'row_sparse'); - $x->initialize(init=>'xavier', ctx=>[$ctx0, $ctx1]); - my $trainer = gluon->Trainer([$x], 'sgd'); - my $x_param = $x->_data->[0]->copy(); - is($x_param->stype, 'row_sparse'); - my $row_id_0 = mx->nd->array([0,1], ctx=>$ctx0); - my $retained_0 = $x->row_sparse_data($row_id_0); - my $retained_target_0 = mx->nd->sparse->retain($x_param, $row_id_0->as_in_context($ctx0)); - ok(almost_equal($retained_0->aspdl, $retained_target_0->aspdl)); - is($retained_0->context, $ctx0); - my $row_id_1 = mx->nd->arange(start => 0, stop => $dim0, ctx=>$ctx1); - my $retained_1 = $x->row_sparse_data($row_id_1); - my $retained_target_1 = $x_param; - ok(almost_equal($retained_1->aspdl, $retained_target_1->aspdl)); - is($retained_1->context, $ctx1); - my $row_id_2 = mx->nd->array([0,1,2]); - my $retained_2 = $x->list_row_sparse_data($row_id_2); - my $retained_target_2 = mx->nd->sparse->retain($x_param, $row_id_2->as_in_context($ctx0)); - ok(almost_equal($retained_2->[0]->aspdl, $retained_target_2->aspdl)); -} - -test_parameter_row_sparse_data(); - -sub test_constant -{ - package Test { - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::HybridBlock'; - sub BUILD - { - my $self = shift; - $self->value(mx->nd->array([[1,2], [3,4]])->aspdl); - $self->const($self->params->get_constant('const', $self->value)); - } - sub hybrid_forward - { - my ($self, $F, $x, $name, $const) = @_; - return $x + $const; - } - }; - - my $test = Test->new(); - $test->initialize(); - my $trainer = gluon->Trainer( - $test->collect_params(), 'sgd', - {learning_rate => 1.0, momentum => 0.5} - ); - - my ($x, $y); - mx->autograd->record(sub { - $x = mx->nd->ones([2,2]); - $x->attach_grad(); - $y = $test->($x); - $y->backward(); - }); - - $trainer->step(1); - - ok(($test->const->data->aspdl == $test->value)->all); - ok(($x->grad->aspdl == 1)->all); -} - -test_constant(); - -package Net; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Function::Parameters; -extends 
'AI::MXNet::Gluon::Block'; -has 'in_units' => (is => 'rw', default => 0); - -sub BUILD -{ - my $self = shift; - $self->name_scope(sub { - $self->dense0(nn->Dense(5, in_units=>$self->in_units)); - $self->dense1(nn->Dense(5, in_units=>$self->in_units)); - }); -} - -method forward($x) -{ - return $self->dense1->($self->dense0->($x)); -} - -package main; - -sub test_parameter_sharing -{ - my $net1 = Net->new(prefix=>'net1_', in_units => 5); - my $net2 = Net->new(prefix=>'net2_', params=>$net1->collect_params()); - $net1->collect_params()->initialize(); - $net2->(mx->nd->zeros([3, 5])); - $net1->save_parameters('net1.params'); - my $net3 = Net->new(prefix=>'net3_'); - $net3->load_parameters('net1.params', ctx => mx->cpu()); - my $net4 = Net->new(prefix=>'net4_'); - my $net5 = Net->new(prefix=>'net5_', in_units=>5, params=>$net4->collect_params()); - $net4->collect_params()->initialize(); - $net5->(mx->nd->zeros([3, 5])); - $net4->save_parameters('net4.params'); - my $net6 = Net->new(prefix=>'net6_'); - $net6->load_parameters('net4.params', ctx => mx->cpu()); -} - -test_parameter_sharing(); - -sub test_parameter_str -{ - package Net1 { - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - sub BUILD - { - my $self = shift; - $self->name_scope(sub { - $self->dense0(nn->Dense(10, in_units=>5, use_bias=>0)); - }); - } - }; - my $net = Net1->new(prefix=>'net1_'); - my @lines = split(/\n/, $net->collect_params()); - ok($lines[0] eq 'net1_ ('); - ok($lines[1] =~ /net1_dense0_weight/); - ok($lines[1] =~ /\(10, 5\)/); - ok($lines[1] =~ /float32/); - ok($lines[2] eq ')'); -} - -test_parameter_str(); - -sub test_collect_parameters -{ - my $net = nn->HybridSequential(prefix=>"test_"); - $net->name_scope(sub { - $net->add(nn->Conv2D(10, 3)); - $net->add(nn->Dense(10, activation=>'relu')); - }); - is_deeply( - [$net->collect_params->keys], - ['test_conv0_weight', 'test_conv0_bias','test_dense0_weight','test_dense0_bias'] - ); - is_deeply( - [$net->collect_params('.*weight')->keys], - ['test_conv0_weight', 'test_dense0_weight'] - ); - is_deeply( - [$net->collect_params('test_conv0_bias|test_dense0_bias')->keys], - ['test_conv0_bias', 'test_dense0_bias'] - ) -}; - -test_collect_parameters(); - -sub test_basic -{ - my $model = nn->Sequential(); - $model->add(nn->Dense(128, activation=>'tanh', in_units=>10, flatten=>0)); - $model->add(nn->Dropout(0.5)); - $model->add(nn->Dense(64, activation=>'tanh', in_units=>256)); - $model->add(nn->Dense(32, in_units=>64)); - $model->add(nn->Activation('relu')); - - # symbol - my $x = mx->sym->var('data'); - my $y = $model->($x); - ok(@{ $y->list_arguments } == 7); - - # ndarray - $model->collect_params()->initialize(init => mx->init->Xavier(magnitude=>2.24)); - $x = $model->(mx->nd->zeros([32, 2, 10])); - is_deeply($x->shape, [32, 32]); - $x->wait_to_read; - - $model->collect_params()->setattr(grad_req => 'null'); - ok(not defined( ($model->collect_params()->values())[0]->_grad)); - $model->collect_params()->setattr(grad_req => 'write'); - ok(defined (($model->collect_params()->values())[0]->_grad)); -} - -test_basic(); - -sub test_dense -{ - my $model = nn->Dense(128, activation=>'tanh', in_units=>10, flatten=>0, prefix=>'test_'); - my $inputs = mx->sym->Variable('data'); - my $outputs = $model->($inputs); - is_deeply({map { $_ => 1 } $model->collect_params()->keys()}, {'test_weight', 1, 'test_bias', 1}); - is_deeply($outputs->list_outputs(), ['test_tanh_fwd_output']); - my ($args, $outs, $auxs) = $outputs->infer_shape(data=>[2, 3, 10]); - is_deeply($outs, 
[[2, 3, 128]]); - - $model = nn->Dense(128, activation=>'relu', in_units=>30, flatten=>1, prefix=>'test2_'); - $inputs = mx->sym->Variable('data'); - $outputs = $model->($inputs); - is_deeply({map { $_ => 1 } $model->collect_params()->keys()}, {'test2_weight', 1, 'test2_bias', 1}); - is_deeply($outputs->list_outputs(), ['test2_relu_fwd_output']); - ($args, $outs, $auxs) = $outputs->infer_shape(data=>[17, 2, 5, 3]); - is_deeply($outs, [[17, 128]]); -} - -test_dense(); - -package Net2; -use AI::MXNet::Gluon::Mouse; -use AI::MXNet::Function::Parameters; -extends 'AI::MXNet::Gluon::HybridBlock'; -has 'model' => (is => 'rw'); - -method hybrid_forward($F, $x) -{ - my $out = $self->model->($x); - return $F->add_n(map { $_->sum } @{ $out }); -} - -package main; - -sub test_symbol_block -{ - my $model = nn->HybridSequential(); - $model->add(nn->Dense(128, activation=>'tanh')); - $model->add(nn->Dropout(0.5)); - $model->add(nn->Dense(64, activation=>'tanh')); - $model->add(nn->Dense(32, in_units=>64)); - $model->add(nn->Activation('relu')); - - $model->initialize(); - - my $inputs = mx->sym->var('data'); - my $outputs = $model->($inputs)->get_internals(); - my $smodel = gluon->SymbolBlock($outputs, $inputs, params=>$model->collect_params); - - ok($smodel->(mx->nd->zeros([16, 10])) == 14); - my $out = $smodel->(mx->sym->var('in')); - ok(@{ $out } == @{ $outputs->list_outputs() }); - - my $net = Net2->new(model => $smodel); - $net->hybridize(); - ok(ref $net->(mx->nd->zeros([16, 10])) eq 'AI::MXNet::NDArray'); - - $inputs = mx->sym->var('data'); - $outputs = $model->($inputs); - $smodel = gluon->SymbolBlock($outputs, $inputs, params=>$model->collect_params); - $net = Net2->new(model => $smodel); - $net->hybridize(); - ok(ref $net->(mx->nd->zeros([16, 10])) eq 'AI::MXNet::NDArray'); -} - -test_symbol_block(); - -sub test_sparse_symbol_block -{ - my $data = mx->sym->var('data'); - my $weight = mx->sym->var('weight', stype=>'row_sparse'); - my $bias = mx->sym->var('bias'); - my $out = mx->sym->broadcast_add(mx->sym->dot($data, $weight), $bias); - # an exception is expected when creating a SparseBlock w/ sparse param - dies_ok(sub { gluon->SymbolBlock($out, $data) }); -} - -test_sparse_symbol_block(); - -sub test_sparse_hybrid_block0 -{ - my $params = gluon->ParameterDict('net_'); - $params->get('weight', shape=>[5,5], stype=>'row_sparse', dtype=>'float32', allow_deferred_init => 1); - $params->get('bias', shape=>[5], dtype=>'float32', allow_deferred_init => 1); - my $net = nn->Dense(5, params=>$params); - $net->initialize(); - my $x = mx->nd->ones([2,5]); - # an exception is expected when forwarding a HybridBlock w/ sparse param - dies_ok(sub { $net->($x) }); -} - -test_sparse_hybrid_block0(); - -sub check_layer_forward -{ - my ($layer, $dshape) = @_; - $layer->collect_params()->initialize(); - my $x = mx->nd->ones($dshape); - $x->attach_grad(); - my $out; - mx->autograd->record(sub { - $out = $layer->($x); - }); - $out->backward(); - my $pdl_out = $out->aspdl; - my $pdl_dx = $x->grad->aspdl; - - $layer->hybridize(); - - $x = mx->nd->ones($dshape); - $x->attach_grad(); - mx->autograd->record(sub { - $out = $layer->($x); - }); - $out->backward(); - - ok(almost_equal($pdl_out, $out->aspdl, 1e-5)); - ok(almost_equal($pdl_dx, $x->grad->aspdl, 1e-5)); -} - -sub test_conv -{ - my @layers1d = ( - nn->Conv1D(16, 3, in_channels=>4), - nn->Conv1D(16, 3, groups=>2, in_channels=>4), - nn->Conv1D(16, 3, strides=>3, groups=>2, in_channels=>4), - ); - for my $layer (@layers1d) - { - check_layer_forward($layer, [1, 4, 
10]); - } - - my @layers2d = ( - nn->Conv2D(16, [3, 4], in_channels=>4), - nn->Conv2D(16, [5, 4], in_channels=>4), - nn->Conv2D(16, [3, 4], groups=>2, in_channels=>4), - nn->Conv2D(16, [3, 4], strides=>4, in_channels=>4), - nn->Conv2D(16, [3, 4], dilation=>4, in_channels=>4), - nn->Conv2D(16, [3, 4], padding=>4, in_channels=>4), - ); - for my $layer (@layers2d) - { - check_layer_forward($layer, [1, 4, 20, 20]); - } - - my @layers3d = ( - nn->Conv3D(16, [1, 8, 4], in_channels=>4, activation=>'relu'), - nn->Conv3D(16, [5, 4, 3], in_channels=>4), - nn->Conv3D(16, [3, 3, 3], groups=>2, in_channels=>4), - nn->Conv3D(16, 4, strides=>4, in_channels=>4), - nn->Conv3D(16, [3, 3, 3], padding=>4, in_channels=>4), - ); - for my $layer (@layers3d) - { - check_layer_forward($layer, [1, 4, 10, 10, 10]); - } - - # These layouts only supported on GPU for now - my $layer = nn->Conv2D(16, [3, 3], layout=>'NHWC', in_channels=>4); - #check_layer_forward($layer, [1, 10, 10, 4]); - - $layer = nn->Conv3D(16, [3, 3, 3], layout=>'NDHWC', in_channels=>4); - # check_layer_forward(layer, (1, 10, 10, 10, 4)) -} - -test_conv(); - - -sub test_deconv -{ - # commented out code is only supported on GPU for now - # my @layers1d = ( - # nn->Conv1DTranspose(16, 3, in_channels=>4), - # nn->Conv1DTranspose(16, 3, groups=>2, in_channels=>4), - # nn->Conv1DTranspose(16, 3, strides=>3, groups=>2, in_channels=>4), - # ); - # for my $layer (@layers1d) - # { - # check_layer_forward($layer, [1, 4, 10]); - # } - - - my @layers2d = ( - nn->Conv2DTranspose(16, [3, 4], in_channels=>4), - nn->Conv2DTranspose(16, [5, 4], in_channels=>4), - nn->Conv2DTranspose(16, [3, 4], groups=>2, in_channels=>4), - nn->Conv2DTranspose(16, [3, 4], strides=>4, in_channels=>4), - nn->Conv2DTranspose(16, [3, 4], dilation=>4, in_channels=>4), - nn->Conv2DTranspose(16, [3, 4], padding=>4, in_channels=>4), - nn->Conv2DTranspose(16, [3, 4], strides=>4, output_padding=>3, in_channels=>4), - ); - for my $layer (@layers2d) - { - check_layer_forward($layer, [1, 4, 20, 20]); - } - - # @layers3d = ( - # nn->Conv3DTranspose(16, [1, 8, 4], in_channels=>4), - # nn->Conv3DTranspose(16, [5, 4, 3], in_channels=>4), - # nn->Conv3DTranspose(16, [3, 3, 3], groups=>2, in_channels=>4), - # nn->Conv3DTranspose(16, 4, strides=>4, in_channels=>4), - # nn->Conv3DTranspose(16, [3, 3, 3], padding=>4, in_channels=>4), - # ); - # for my $layer (@layers3d) - # { - # check_layer_forward($layer, [1, 4, 10, 10, 10]); - # } - # - my $layer = nn->Conv2DTranspose(16, [3, 3], layout=>'NHWC', in_channels=>4); - # check_layer_forward($layer, [1, 10, 10, 4]); - # - # $layer = nn->Conv3DTranspose(16, [3, 3, 3], layout=>'NDHWC', in_channels=>4); - # check_layer_forward(layer, [1, 10, 10, 10, 4]); -} - -test_deconv(); - -sub test_pool -{ - my @layers1d = ( - nn->MaxPool1D(), - nn->MaxPool1D(3), - nn->MaxPool1D(3, 2), - nn->AvgPool1D(), - nn->AvgPool1D(count_include_pad=>0), - nn->GlobalAvgPool1D(), - ); - for my $layer (@layers1d) - { - check_layer_forward($layer, [1, 2, 10]); - } - - my @layers2d = ( - nn->MaxPool2D(), - nn->MaxPool2D([3, 3]), - nn->MaxPool2D(3, 2), - nn->AvgPool2D(), - nn->AvgPool2D(count_include_pad=>0), - nn->GlobalAvgPool2D(), - ); - for my $layer (@layers2d) - { - check_layer_forward($layer, [1, 2, 10, 10]); - } - - my @layers3d = ( - nn->MaxPool3D(), - nn->MaxPool3D([3, 3, 3]), - nn->MaxPool3D(3, 2), - nn->AvgPool3D(), - nn->AvgPool3D(count_include_pad=>0), - nn->GlobalAvgPool3D(), - ); - for my $layer (@layers3d) - { - check_layer_forward($layer, [1, 2, 10, 10, 10]); - } - - # 
test ceil_mode - my $x = mx->nd->zeros([2, 2, 10, 10]); - - my $layer = nn->MaxPool2D(3, ceil_mode=>0); - $layer->collect_params()->initialize(); - is_deeply($layer->($x)->shape, [2, 2, 3, 3]); - - $layer = nn->MaxPool2D(3, ceil_mode=>1); - $layer->collect_params()->initialize(); - is_deeply($layer->($x)->shape, [2, 2, 4, 4]); -} - -test_pool(); - -sub test_batchnorm -{ - my $layer = nn->BatchNorm(in_channels=>10); - check_layer_forward($layer, [2, 10, 10, 10]); -} - -test_batchnorm(); - -sub test_instancenorm -{ - my $layer = nn->InstanceNorm(in_channels=>10); - check_layer_forward($layer, [2, 10, 10, 10]); -} - -test_instancenorm(); - -sub test_layernorm -{ - my $layer = nn->LayerNorm(in_channels=>10); - check_layer_forward($layer, [2, 10, 10, 10]); -} - -test_layernorm(); - -sub test_reflectionpad -{ - my $layer = nn->ReflectionPad2D(3); - check_layer_forward($layer, [2, 3, 24, 24]); -} - -test_reflectionpad(); - -sub test_reshape -{ - my $x = mx->nd->ones([2, 4, 10, 10]); - my $layer = nn->Conv2D(10, 2, in_channels=>4); - $layer->collect_params()->initialize(); - mx->autograd->record(sub { - $x = $layer->($x); - $x = $x->reshape([-1]); - $x = $x + 10; - }); - $x->backward(); -} - -test_reshape(); - -sub test_slice -{ - my $x = mx->nd->ones([5, 4, 10, 10]); - my $layer = nn->Conv2D(10, 2, in_channels=>4); - $layer->collect_params()->initialize(); - mx->autograd->record(sub { - $x = $layer->($x); - $x = $x->slice([1,3]); - $x = $x + 10; - }); - $x->backward(); -} - -test_slice(); - -sub test_at -{ - my $x = mx->nd->ones([5, 4, 10, 10]); - my $layer = nn->Conv2D(10, 2, in_channels=>4); - $layer->collect_params()->initialize(); - mx->autograd->record(sub { - $x = $layer->($x); - $x = $x->at(1); - $x = $x + 10; - }); - $x->backward(); -} - -test_at(); - -sub test_deferred_init -{ - my $x = mx->nd->ones([5, 4, 10, 10]); - my $layer = nn->Conv2D(10, 2); - $layer->collect_params()->initialize(); - $layer->($x); -} - -test_deferred_init(); - - -sub check_split_data -{ - my ($x, $num_slice, $batch_axis, %kwargs) = @_; - my $res = gluon->utils->split_data($x, $num_slice, $batch_axis, %kwargs); - ok(@{ $res } == $num_slice); - ok(almost_equal(mx->nd->concat(@$res, dim=>$batch_axis)->aspdl(), $x->aspdl())); -} - -sub test_split_data -{ - my $x = mx->nd->random->uniform(shape=>[128, 33, 64]); - - check_split_data($x, 8, 0); - check_split_data($x, 3, 1); - check_split_data($x, 4, 1, even_split=>0); - check_split_data($x, 15, 1, even_split=>0); - eval { - check_split_data($x, 4, 1); - }; - ok($@); -} - -test_split_data(); - -sub test_flatten -{ - my $flatten = nn->Flatten(); - my $x = mx->nd->zeros([3,4,5,6]); - is_deeply($flatten->($x)->shape, [3, 4*5*6]); - $x = mx->nd->zeros([3,6]); - is_deeply($flatten->($x)->shape, [3, 6]); - $x = mx->nd->zeros([3]); - is_deeply($flatten->($x)->shape, [3, 1]); -} - -test_flatten(); - -sub test_block_attr_hidden -{ - my $b = gluon->Block(); - # regular attributes can change types - $b->a(undef); - $b->a(1); -} - -test_block_attr_hidden(); - -sub test_block_attr_block -{ - my $b = gluon->Block(); - # regular variables can't change types - $b->b(gluon->Block()); - eval { $b->b([2]); }; - ok($@ =~ /not allowed/i); -} - -test_block_attr_block(); - -sub test_block_attr_param -{ - my $b = gluon->Block(); - # regular variables can't change types - $b->b(gluon->Parameter(name => 'test')); - eval { $b->b([2]); }; - ok($@ =~ /not allowed/i); -} - -test_block_attr_param(); - -sub test_block_attr_regular -{ - my $b = gluon->Block(); - - # set block attribute also sets 
_children - $b->c(gluon->Block()); - my $c2 = gluon->Block(); - $b->c($c2); - ok(refaddr($b->c) == refaddr($c2) and refaddr(($b->_children->values)[0]) == refaddr($c2)); -} - -test_block_attr_regular(); - -sub test_block_attr_list_of_block -{ - package Model1 { - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - sub BUILD - { - my $self = shift; - $self->name_scope(sub { - $self->layers([map { nn->Dense($_ * 10) } 0..5]); - }); - } - }; - package Model2 { - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - sub BUILD - { - my $self = shift; - $self->name_scope(sub { - $self->layers({}); - $self->layers->{a} = [map { nn->Dense($_ * 10) } 0..5]; - }); - } - }; - package Model3 { - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - sub BUILD - { - my $self = shift; - $self->name_scope(sub { - $self->layers(nn->Sequential()); - $self->layers->add(map { nn->Dense($_ * 10) } 0..5); - }); - } - }; - package Model4 { - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - sub BUILD - { - my $self = shift; - $self->name_scope(sub { - $self->data({a => '4', b => 123}); - }); - } - }; - my $w = 0; - local($SIG{__WARN__}) = sub { - $w++; - }; - Model1->new->collect_params; - ok($w > 0); $w = 0; - Model2->new->collect_params; - ok($w > 0); $w = 0; - Model3->new->collect_params; - ok($w == 0); $w = 0; - Model4->new->collect_params; - ok($w == 0); -} - -test_block_attr_list_of_block(); - -sub check_sequential -{ - my ($net) = @_; - my $dense1 = nn->Dense(10); - $net->add($dense1); - my $dense2 = nn->Dense(10); - $net->add($dense2); - my $dense3 = nn->Dense(10); - $net->add($dense3); - - ok(refaddr($net->[1]) == refaddr($dense2)); - ok(refaddr($net->[-1]) == refaddr($dense3)); - my $slc = $net->slice([1,2]); - ok(@$slc == 2 and refaddr($slc->[0]) == refaddr($dense2) and refaddr($slc->[1]) == refaddr($dense3)); - ok(ref $slc eq ref $net); -} - -sub test_sequential -{ - check_sequential(nn->Sequential()); - check_sequential(nn->HybridSequential()); -} - -test_sequential(); - -sub test_global_norm_clip -{ - my @stypes = ('default', 'row_sparse'); - my $check_global_norm_clip = sub { my ($stype) = @_; - my $x1 = mx->nd->ones([3,3])->tostype($stype); - my $x2 = mx->nd->ones([4,4])->tostype($stype); - my $norm = gluon->utils->clip_global_norm([$x1, $x2], 1.0); - ok($norm == 5); - ok(almost_equal($x1->aspdl, mx->nd->ones([3,3])->aspdl/5)); - ok(almost_equal($x2->aspdl, mx->nd->ones([4,4])->aspdl/5)); - - my $x3 = mx->nd->array([1.0, 2.0, 'nan'])->tostype($stype); - my $w = 0; - local($SIG{__WARN__}) = sub { - $w++; - }; - gluon->utils->clip_global_norm([$x1, $x3], 2.0); - ok($w == 1); - }; - for my $stype (@stypes) - { - $check_global_norm_clip->($stype); - } -} - -test_global_norm_clip(); - -sub test_embedding -{ - local($ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE}) = 0; - my $check_embedding = sub { my ($sparse_grad) = @_; - my $layer = nn->Embedding(10, 100, sparse_grad=>$sparse_grad); - $layer->initialize(); - my $x = mx->nd->array([3,4,2,0,1]); my $y; - mx->autograd->record(sub { - $y = $layer->($x); - $y->backward(); - }); - ok(($layer->weight->grad->aspdl->slice('X', [0, 4]) == 1)->all); - ok(($layer->weight->grad->aspdl->slice('X', [5, -1]) == 0)->all); - }; - my $check_embedding_large_input = sub { my ($sparse_grad) = @_; - my $embedding = nn->Embedding(10, 1, sparse_grad=>$sparse_grad); - $embedding->initialize(); - $embedding->hybridize(); - my $shape = [20481]; - my ($emb_in, $loss); - mx->autograd->record(sub { - $emb_in = 
$embedding->(mx->nd->ones($shape)); - $loss = $emb_in->sum; - }); - $loss->backward; - ok($embedding->weight->grad->sum->asscalar == 20481); - }; - $check_embedding->(1); - $check_embedding->(0); - $check_embedding_large_input->(1); - $check_embedding_large_input->(0); -} - -test_embedding(); - -sub test_hybrid_stale_cache -{ - my $net = nn->HybridSequential(); - $net->name_scope(sub { - $net->add(nn->Dense(10, weight_initializer=>'zeros', bias_initializer=>'ones', flatten=>0)); - }); - - $net->hybridize(); - $net->initialize(); - $net->(mx->nd->ones([2,3,5])); - - $net->add(nn->Flatten()); - is_deeply($net->(mx->nd->ones([2,3,5]))->shape, [2, 30]); - - $net = nn->HybridSequential(); - $net->name_scope(sub { - $net->fc1(nn->Dense(10, weight_initializer=>'zeros', - bias_initializer=>'ones', flatten=>0)); - $net->fc2(nn->Dense(10, weight_initializer=>'zeros', - bias_initializer=>'ones', flatten=>0)); - }); - $net->hybridize(); - $net->initialize(); - $net->(mx->nd->ones([2,3,5])); - - $net->fc2(nn->Dense(10, weight_initializer=>'zeros', - bias_initializer=>'ones', flatten=>1)); - $net->initialize(); - is_deeply($net->(mx->nd->ones([2,3,5]))->shape, [2, 10]); -} - -test_hybrid_stale_cache(); - -sub test_lambda -{ - my $net1 = nn->HybridSequential(); - $net1->add(nn->Activation('tanh'), - nn->LeakyReLU(0.1)); - - my $net2 = nn->HybridSequential(); - my $op3 = sub { my ($F, $x, @args) = @_; $F->LeakyReLU($x, @args, slope=>0.1); }; - $net2->add(nn->HybridLambda('tanh'), - nn->HybridLambda($op3)); - - my $op4 = sub { mx->nd->LeakyReLU($_[0], slope=>0.1); }; - my $net3 = nn->Sequential(); - $net3->add(nn->Lambda('tanh'), - nn->Lambda($op4)); - - my $input_data = mx->nd->random->uniform(shape=>[2, 3, 5, 7]); - my ($out1, $out2, $out3) = ($net1->($input_data), $net2->($input_data), $net3->($input_data)); - ok(almost_equal($out1->aspdl, $out2->aspdl, 1e-3)); - ok(almost_equal($out1->aspdl, $out3->aspdl, 1e-3)); -} - -test_lambda(); - -sub test_fill_shape_deferred -{ - my $net = nn->HybridSequential(); - $net->name_scope(sub { - $net->add(nn->Conv2D(64, kernel_size=>2, padding=>1), - nn->BatchNorm(), - nn->Dense(10)); - }); - $net->hybridize(); - $net->initialize(); - $net->(mx->nd->ones([2,3,5,7])); - ok($net->[0]->weight->shape->[1] == 3); - ok($net->[1]->gamma->shape->[0] == 64); - ok($net->[2]->weight->shape->[1] == 3072); -} - -test_fill_shape_deferred(); - -sub test_fill_shape_load -{ - my $ctx = mx->context->current_context(); - my $net1 = nn->HybridSequential(); - $net1->name_scope(sub { - $net1->add(nn->Conv2D(64, kernel_size=>2, padding=>1), - nn->BatchNorm(), - nn->Dense(10)) - }); - $net1->hybridize(); - $net1->initialize(mx->init->Uniform, ctx => $ctx); - $net1->(mx->nd->ones([2,3,5,7], ctx => $ctx)); - $net1->save_parameters('net_fill.params'); - - my $net2 = nn->HybridSequential(); - $net2->name_scope(sub { - $net2->add(nn->Conv2D(64, kernel_size=>2, padding=>1), - nn->BatchNorm(), - nn->Dense(10)) - }); - $net2->hybridize(); - $net2->initialize(); - $net2->load_parameters('net_fill.params', ctx=>$ctx); - ok($net2->[0]->weight->shape->[1] == 3); - ok($net2->[1]->gamma->shape->[0] == 64); - ok($net2->[2]->weight->shape->[1] == 3072); -} - -test_fill_shape_load(); - -use JSON::PP qw(decode_json); - -sub test_inline -{ - my $y; - - my $net = nn->HybridSequential(); - $net->name_scope(sub { - $net->add(nn->Dense(10)); - $net->add(nn->Dense(10)); - $net->add(nn->Dense(10)); - }); - $net->initialize(); - - $net->hybridize(inline_limit=>3); - mx->autograd->record(sub { - $y = 
$net->(mx->nd->zeros([1,10]));
-    });
-    my $len_1 = @{ decode_json(mx->autograd->get_symbol($y)->tojson())->{nodes} };
-    $y->backward();
-
-    $net->hybridize(inline_limit=>0);
-    mx->autograd->record(sub {
-        $y = $net->(mx->nd->zeros([1,10]));
-    });
-    my $len_2 = @{ decode_json(mx->autograd->get_symbol($y)->tojson())->{nodes} };
-    $y->backward();
-
-    is($len_1, $len_2 + 2);
-}
-
-test_inline();
-
-sub test_activations
-{
-    my $point_to_validate = mx->nd->array([(-0.1, 0.1) x 3]);
-
-    my $swish = nn->Swish();
-    my $swish_test = sub { my ($x) = @_;
-        return $x * mx->nd->sigmoid($x)
-    };
-
-    for(zip($swish_test->($point_to_validate), $swish->($point_to_validate)))
-    {
-        my ($test_point, $ref_point) = @$_;
-        ok($test_point == $ref_point);
-    }
-
-    my $elu = nn->ELU();
-    my $elu_test = sub { my ($x) = @_;
-        my $elu = sub { my ($x) = @_;
-            return $x < 0 ? 1.0 * (mx->nd->exp($x) - 1) : $x;
-        };
-        return [map { $elu->($_) } @{ $x }];
-    };
-
-    for(zip($elu_test->($point_to_validate), $elu->($point_to_validate)))
-    {
-        my ($test_point, $ref_point) = @$_;
-        ok($test_point == $ref_point);
-    }
-
-    my $selu = nn->SELU();
-    my $selu_test = sub { my ($x) = @_;
-        my $selu = sub { my ($x) = @_;
-            my ($scale, $alpha) = (1.0507009873554804934193349852946, 1.6732632423543772848170429916717);
-            # SELU is scale*x for x >= 0 and scale*alpha*(exp(x) - 1) otherwise
-            return $x >= 0 ? $scale * $x : $scale * $alpha * (mx->nd->exp($x) - 1);
-        };
-        return [map { $selu->($_) } @{ $x }];
-    };
-
-    for(zip($selu_test->($point_to_validate), $selu->($point_to_validate)))
-    {
-        my ($test_point, $ref_point) = @$_;
-        ok($test_point == $ref_point);
-    }
-
-    my $prelu = nn->PReLU();
-    $prelu->initialize();
-    my $x = $point_to_validate->reshape([1, 3, 2]);
-    ok(almost_equal($prelu->($x)->aspdl, mx->nd->where($x >= 0, $x, 0.25 * $x)->aspdl));
-}
-
-test_activations();
-
-sub test_req
-{
-    my $data = mx->nd->random->uniform(shape=>[1,3,224,224]);
-    my $label = mx->nd->array([1]);
-    my $loss = gluon->loss->SoftmaxCrossEntropyLoss();
-
-    my $net = nn->HybridSequential();
-    my $net1 = nn->HybridSequential();
-    $net1->add(nn->Dense(4));
-    my $net2 = nn->HybridSequential();
-    $net2->add(nn->Dense(3));
-    $net2->add(nn->Dense(2));
-    $net->add($net1);
-    $net->add($net2);
-    $net->initialize();
-
-    $net->hybridize();
-
-    for my $v ($net->collect_params->values)
-    {
-        $v->grad_req('add');
-    }
-
-    $net->collect_params->zero_grad();
-    my $grad;
-    mx->autograd->record(sub {
-        my $pred = $net->($data);
-        my $l = $loss->($pred, $label);
-        $l->backward();
-        $grad = $net->[0][0]->weight->grad->mean->aspdl;
-        # run twice to check req = add
-        $pred = $net->($data);
-        $l = $loss->($pred, $label);
-        $l->backward;
-    });
-
-    my $grad_double = $net->[0][0]->weight->grad->mean->aspdl;
-    ok(almost_equal($grad * 2, $grad_double));
-}
-
-test_req();
-
-sub test_zero_grad
-{
-    my $data = mx->nd->random->uniform(shape=>[3,3]);
-    my $net = nn->Embedding(3, 4, sparse_grad=>1, prefix=>'test_zero_grad_');
-    $net->initialize();
-    mx->autograd->record(sub {
-        $net->($data)->backward;
-    });
-    $net->collect_params->zero_grad;
-    my $grad = $net->collect_params->params->get('test_zero_grad_weight')->grad;
-    ok(almost_equal($grad->aspdl, $grad->aspdl * 0));
-}
-
-test_zero_grad();
-
-sub test_hook
-{
-    my $hook_call_count = 0;
-    my $pre_hook_call_count = 0;
-
-    my $call_hook = sub { my ($block, $x, $y) = @_;
-        $hook_call_count += 1;
-    };
-
-    my $call_pre_hook = sub { my ($block, $x) = @_;
-        $pre_hook_call_count += 1;
-    };
-
-    my $block = nn->Dense(10);
-    $block->initialize();
-    my $handle = $block->register_forward_hook($call_hook);
-    my $pre_handle =
$block->register_forward_pre_hook($call_pre_hook); - $block->(mx->nd->ones([3, 5])); - - ok($hook_call_count == 1); - ok($pre_hook_call_count == 1); - - $handle->detach(); - $block->(mx->nd->ones([3, 5])); - - ok($hook_call_count == 1); - ok($pre_hook_call_count == 2); - - $pre_handle->detach(); - $block->(mx->nd->ones([3, 5])); - - ok($hook_call_count == 1); - ok($pre_hook_call_count == 2); -} - -test_hook(); - -sub test_apply -{ - my @called_blocks; - - my $record_name = sub { my ($block) = @_; - push @called_blocks, $block->name; - }; - my $block = nn->HybridSequential(prefix=>'test_'); - $block->name_scope(sub { - $block->add(nn->Dense(10)); - $block->add(nn->Dropout(0.5)); - }); - $block->apply($record_name); - - is_deeply(\@called_blocks, ['test_dense0', 'test_dropout0', 'test']); -} - -test_apply(); - -sub test_sparse_hybrid_block_grad -{ - package Embedding { - use AI::MXNet::Gluon::Mouse; - use AI::MXNet::Function::Parameters; - extends 'AI::MXNet::Gluon::HybridBlock'; - has ['num_tokens', 'embedding_size'] => (is => 'rw'); - method python_constructor_arguments() { ['num_tokens', 'embedding_size'] } - sub BUILD { - my $self = shift; - $self->name_scope(sub { - $self->embedding(nn->Embedding( - $self->num_tokens, $self->embedding_size, sparse_grad=>1 - )); - }); - } - - method hybrid_forward($F, $words) - { - my $emb = $self->embedding->($words); - return $emb + $F->ones_like($emb); - } - }; - my $embedding = Embedding->new(20, 3); - $embedding->initialize(); - $embedding->hybridize(); - - my $loss; - mx->autograd->record(sub { - my $emb0 = $embedding->(mx->nd->arange(stop => 10))->sum; - my $emb1 = $embedding->(mx->nd->arange(stop => 10))->sum; - $loss = $emb0 + $emb1; - }); - $loss->backward(); - my $grad = $embedding->embedding->weight->grad->aspdl; - ok(($grad->slice('X', ':9') == 2)->all); - ok(($grad->slice('X', '10:') == 0)->all); -} - -test_sparse_hybrid_block_grad(); - -sub test_sparse_hybrid_block -{ - package Linear { - use AI::MXNet::Gluon::Mouse; - use AI::MXNet::Function::Parameters; - extends 'AI::MXNet::Gluon::HybridBlock'; - has ['units'] => (is => 'rw'); - method python_constructor_arguments() { ['units'] } - sub BUILD { - my $self = shift; - $self->name_scope(sub { - $self->w($self->params->get( - 'w', shape => [$self->units, $self->units] - )); - }); - } - method hybrid_forward($F, $x, :$w) - { - return $F->dot($x, $w); - } - }; - package SparseBlock { - use AI::MXNet::Gluon::Mouse; - use AI::MXNet::Function::Parameters; - extends 'AI::MXNet::Gluon::HybridBlock'; - has ['units'] => (is => 'rw'); - method python_constructor_arguments() { ['units'] } - sub BUILD { - my $self = shift; - $self->name_scope(sub { - $self->net(Linear->new($self->units)); - }); - } - method hybrid_forward($F, $x) - { - return $self->net->($x) * $x; - } - }; - my $block = SparseBlock->new(2); - $block->initialize(); - $block->hybridize(); - my $x = mx->nd->ones([2,2])->tostype('csr'); - my $z; - mx->autograd->record(sub { - $z = $block->($x) + $block->($x); - }); - $z->backward; - ok(($block->net->w->grad->aspdl == 4)->all); -} - -test_sparse_hybrid_block(); diff --git a/perl-package/AI-MXNet/t/test_gluon_data.t b/perl-package/AI-MXNet/t/test_gluon_data.t deleted file mode 100644 index 92e83b968d23..000000000000 --- a/perl-package/AI-MXNet/t/test_gluon_data.t +++ /dev/null @@ -1,128 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use AI::MXNet qw(mx); -use AI::MXNet::Gluon qw(gluon); -use AI::MXNet::Gluon::Utils qw(download); -use Archive::Tar; -use AI::MXNet::TestUtils qw(almost_equal); -use AI::MXNet::Base; -use File::Path qw(make_path); -use IO::File; -use Test::More tests => 52; - -sub test_array_dataset -{ - my $X = mx->nd->random->uniform(shape=>[10, 20]); - my $Y = mx->nd->random->uniform(shape=>[10]); - my $dataset = gluon->data->ArrayDataset($X, $Y); - my $loader = gluon->data->DataLoader($dataset, 2); - enumerate(sub { - my ($i, $d) = @_; - my ($x, $y) = @$d; - ok(almost_equal($x->aspdl, $X->slice([$i*2,($i+1)*2-1])->aspdl)); - ok(almost_equal($y->aspdl, $Y->slice([$i*2,($i+1)*2-1])->aspdl)); - }, \@{ $loader }); -} - -test_array_dataset(); - -sub prepare_record -{ - my ($copy) = @_; - if(not -d "data/test_images") - { - make_path('data/test_images'); - } - if(not -d "data/test_images/test_images") - { - download("http://data.mxnet.io/data/test_images.tar.gz", path => "data/test_images.tar.gz"); - my $f = Archive::Tar->new('data/test_images.tar.gz'); - chdir('data'); - $f->extract; - chdir('..'); - } - if(not -f 'data/test.rec') - { - my @imgs = glob('data/test_images/*'); - my $record = mx->recordio->MXIndexedRecordIO('data/test.idx', 'data/test.rec', 'w'); - enumerate(sub { - my ($i, $img) = @_; - my $str_img = join('',IO::File->new("./$img")->getlines); - my $s = mx->recordio->pack([0, $i, $i, 0], $str_img); - $record->write_idx($i, $s); - }, \@imgs); - } - if($copy) - { - make_path('data/images/test_images'); - `cp data/test_images/* data/images/test_images`; - } - return 'data/test.rec'; -} - -sub test_recordimage_dataset -{ - my $recfile = prepare_record(); - my $dataset = gluon->data->vision->ImageRecordDataset($recfile); - my $loader = gluon->data->DataLoader($dataset, 1); - enumerate(sub { - my ($i, $d) = @_; - my ($x, $y) = @$d; - ok($x->shape->[0] == 1 and $x->shape->[3] == 3); - ok($y->asscalar == $i); - }, \@{ $loader }); -} - -test_recordimage_dataset(); - -sub test_sampler -{ - my $seq_sampler = gluon->data->SequentialSampler(10); - is_deeply(\@{ $seq_sampler }, [0..9]); - my $rand_sampler = gluon->data->RandomSampler(10); - is_deeply([sort { $a <=> $b } @{ $rand_sampler }], [0..9]); - my $seq_batch_keep = gluon->data->BatchSampler($seq_sampler, 3, 'keep'); - is_deeply([map { @$_ } @{ $seq_batch_keep }], [0..9]); - my $seq_batch_discard = gluon->data->BatchSampler($seq_sampler, 3, 'discard'); - is_deeply([map { @$_ } @{ $seq_batch_discard }], [0..8]); - my $rand_batch_keep = gluon->data->BatchSampler($rand_sampler, 3, 'keep'); - is_deeply([sort { $a <=> $b } map { @$_ } @{ $rand_batch_keep }], [0..9]); -} - -test_sampler(); - -sub test_datasets -{ - ok(gluon->data->vision->MNIST(root=>'data/mnist')->len == 60000); - 
ok(gluon->data->vision->FashionMNIST(root=>'data/fashion-mnist')->len == 60000); - ok(gluon->data->vision->CIFAR10(root=>'data/cifar10', train=>0)->len == 10000); -} - -test_datasets(); - -sub test_image_folder_dataset -{ - prepare_record(1); - my $dataset = gluon->data->vision->ImageFolderDataset('data/images'); - is_deeply($dataset->synsets, ['test_images']); - ok(@{ $dataset->items } == 16); -} - -test_image_folder_dataset(); diff --git a/perl-package/AI-MXNet/t/test_gluon_rnn.t b/perl-package/AI-MXNet/t/test_gluon_rnn.t deleted file mode 100644 index 51e6ad53e171..000000000000 --- a/perl-package/AI-MXNet/t/test_gluon_rnn.t +++ /dev/null @@ -1,325 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 77; -use AI::MXNet 'mx'; -use AI::MXNet::Gluon 'gluon'; -use AI::MXNet::TestUtils qw/allclose almost_equal/; -use AI::MXNet::Base; -use Scalar::Util 'blessed'; - -sub test_rnn -{ - my $cell = gluon->rnn->RNNCell(100, prefix=>'rnn_'); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, $inputs); - $outputs = mx->sym->Group($outputs); - is_deeply([sort $cell->collect_params()->keys()], ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight']); - is_deeply($outputs->list_outputs(), ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']); - - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -test_rnn(); - -sub test_lstm -{ - my $cell = gluon->rnn->LSTMCell(100, prefix=>'rnn_'); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, $inputs); - $outputs = mx->sym->Group($outputs); - is_deeply([sort $cell->collect_params()->keys()], ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight']); - is_deeply($outputs->list_outputs(), ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']); - - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -test_lstm(); - -sub test_lstm_forget_bias -{ - my $forget_bias = 2; - my $stack = gluon->rnn->SequentialRNNCell(); - $stack->add(gluon->rnn->LSTMCell(100, i2h_bias_initializer=>mx->init->LSTMBias($forget_bias), prefix=>'l0_')); - $stack->add(gluon->rnn->LSTMCell(100, i2h_bias_initializer=>mx->init->LSTMBias($forget_bias), prefix=>'l1_')); - - my $dshape = [32, 1, 200]; - my $data = mx->sym->Variable('data'); - - my ($sym) = $stack->unroll(1, $data, merge_outputs=>1); - my $mod = mx->mod->Module($sym, context=>mx->cpu(0)); - $mod->bind(data_shapes=>[['data', $dshape]]); - - $mod->init_params(); - - my 
($bias_argument) = grep { /i2h_bias$/ } @{ $sym->list_arguments() }; - my $expected_bias = pdl((0)x100, ($forget_bias)x100, (0)x200); - ok(allclose(($mod->get_params())[0]->{$bias_argument}->aspdl, $expected_bias)); -} - -test_lstm_forget_bias(); - -sub test_gru -{ - my $cell = gluon->rnn->GRUCell(100, prefix=>'rnn_'); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, $inputs); - $outputs = mx->sym->Group($outputs); - is_deeply([sort $cell->collect_params()->keys()], ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight']); - is_deeply($outputs->list_outputs(), ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']); - - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -test_gru(); - -sub test_residual -{ - my $cell = gluon->rnn->ResidualCell(gluon->rnn->GRUCell(50, prefix=>'rnn_')); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..1]; - my ($outputs) = $cell->unroll(2, $inputs); - $outputs = mx->sym->Group($outputs); - is_deeply([sort $cell->collect_params()->keys()], ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight']); - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50]); - is_deeply($outs, [[10, 50], [10, 50]]); - $outputs = $outputs->eval(args => { rnn_t0_data=>mx->nd->ones([10, 50]), - rnn_t1_data=>mx->nd->ones([10, 50]), - rnn_i2h_weight=>mx->nd->zeros([150, 50]), - rnn_i2h_bias=>mx->nd->zeros([150]), - rnn_h2h_weight=>mx->nd->zeros([150, 50]), - rnn_h2h_bias=>mx->nd->zeros([150]) }); - my $expected_outputs = mx->nd->ones([10, 50]); - ok(($outputs->[0] == $expected_outputs)->aspdl->all); - ok(($outputs->[1] == $expected_outputs)->aspdl->all); -} - -test_residual(); - -sub test_residual_bidirectional -{ - my $cell = gluon->rnn->ResidualCell( - gluon->rnn->BidirectionalCell( - gluon->rnn->GRUCell(25, prefix=>'rnn_l_'), - gluon->rnn->GRUCell(25, prefix=>'rnn_r_') - ) - ); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..1]; - my ($outputs) = $cell->unroll(2, $inputs, merge_outputs => 0); - $outputs = mx->sym->Group($outputs); - is_deeply([sort $cell->collect_params()->keys()], - ['rnn_l_h2h_bias', 'rnn_l_h2h_weight', 'rnn_l_i2h_bias', 'rnn_l_i2h_weight', - 'rnn_r_h2h_bias', 'rnn_r_h2h_weight', 'rnn_r_i2h_bias', 'rnn_r_i2h_weight']); - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50]); - is_deeply($outs, [[10, 50], [10, 50]]); - $outputs = $outputs->eval(args => { rnn_t0_data=>mx->nd->ones([10, 50])+5, - rnn_t1_data=>mx->nd->ones([10, 50])+5, - rnn_l_i2h_weight=>mx->nd->zeros([75, 50]), - rnn_l_i2h_bias=>mx->nd->zeros([75]), - rnn_l_h2h_weight=>mx->nd->zeros([75, 25]), - rnn_l_h2h_bias=>mx->nd->zeros([75]), - rnn_r_i2h_weight=>mx->nd->zeros([75, 50]), - rnn_r_i2h_bias=>mx->nd->zeros([75]), - rnn_r_h2h_weight=>mx->nd->zeros([75, 25]), - rnn_r_h2h_bias=>mx->nd->zeros([75]), - }); - my $expected_outputs = mx->nd->ones([10, 50])+5; - ok(($outputs->[0] == $expected_outputs)->aspdl->all); - ok(($outputs->[1] == $expected_outputs)->aspdl->all); -} - -test_residual_bidirectional(); - -sub test_stack -{ - my $cell = gluon->rnn->SequentialRNNCell(); - for my $i (0..4) - { - if($i == 1) - { - $cell->add(gluon->rnn->ResidualCell(gluon->rnn->LSTMCell(100, prefix=>"rnn_stack${i}_"))); - } - else - { - $cell->add(gluon->rnn->LSTMCell(100, prefix=>"rnn_stack${i}_")); - } - } - my $inputs = [map { 
mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, $inputs); - $outputs = mx->sym->Group($outputs); - my %keys = map { $_ => 1 } $cell->collect_params()->keys(); - for my $i (0..4) - { - ok($keys{"rnn_stack${i}_h2h_weight"}); - ok($keys{"rnn_stack${i}_h2h_bias"}); - ok($keys{"rnn_stack${i}_i2h_weight"}); - ok($keys{"rnn_stack${i}_i2h_bias"}); - } - is_deeply($outputs->list_outputs(), ['rnn_stack4_t0_out_output', 'rnn_stack4_t1_out_output', 'rnn_stack4_t2_out_output']); - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -test_stack(); - -sub test_bidirectional -{ - my $cell = gluon->rnn->BidirectionalCell( - gluon->rnn->LSTMCell(100, prefix=>'rnn_l0_'), - gluon->rnn->LSTMCell(100, prefix=>'rnn_r0_'), - output_prefix=>'rnn_bi_'); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, $inputs); - $outputs = mx->sym->Group($outputs); - is_deeply($outputs->list_outputs(), ['rnn_bi_t0_output', 'rnn_bi_t1_output', 'rnn_bi_t2_output']); - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 200], [10, 200], [10, 200]]); -} - -test_bidirectional(); - -sub test_zoneout -{ - my $cell = gluon->rnn->ZoneoutCell(gluon->rnn->RNNCell(100, prefix=>'rnn_'), zoneout_outputs=>0.5, - zoneout_states=>0.5); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, $inputs); - $outputs = mx->sym->Group($outputs); - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -test_zoneout(); - -sub check_rnn_forward -{ - my ($layer, $inputs, $deterministic) = @_; - $deterministic //= 1; - $inputs->attach_grad(); - $layer->collect_params()->initialize(); - my $out; - mx->autograd->record(sub { - $out = ($layer->unroll(3, $inputs, merge_outputs=>0))[0]; - mx->autograd->backward($out); - $out = ($layer->unroll(3, $inputs, merge_outputs=>1))[0]; - $out->backward; - }); - - my $pdl_out = $out->aspdl; - my $pdl_dx = $inputs->grad->aspdl; - - $layer->hybridize; - - mx->autograd->record(sub { - $out = ($layer->unroll(3, $inputs, merge_outputs=>0))[0]; - mx->autograd->backward($out); - $out = ($layer->unroll(3, $inputs, merge_outputs=>1))[0]; - $out->backward; - }); - - if($deterministic) - { - ok(almost_equal($pdl_out, $out->aspdl, 1e-3)); - ok(almost_equal($pdl_dx, $inputs->grad->aspdl, 1e-3)); - } -} - -sub test_rnn_cells -{ - check_rnn_forward(gluon->rnn->LSTMCell(100, input_size=>200), mx->nd->ones([8, 3, 200])); - check_rnn_forward(gluon->rnn->RNNCell(100, input_size=>200), mx->nd->ones([8, 3, 200])); - check_rnn_forward(gluon->rnn->GRUCell(100, input_size=>200), mx->nd->ones([8, 3, 200])); - my $bilayer = gluon->rnn->BidirectionalCell( - gluon->rnn->LSTMCell(100, input_size=>200), - gluon->rnn->LSTMCell(100, input_size=>200) - ); - check_rnn_forward($bilayer, mx->nd->ones([8, 3, 200])); - check_rnn_forward(gluon->rnn->DropoutCell(0.5), mx->nd->ones([8, 3, 200]), 0); - check_rnn_forward( - gluon->rnn->ZoneoutCell( - gluon->rnn->LSTMCell(100, input_size=>200), - 0.5, 0.2 - ), - mx->nd->ones([8, 3, 200]), - 0 - ); - my $net = gluon->rnn->SequentialRNNCell(); - $net->add(gluon->rnn->LSTMCell(100, input_size=>200)); - $net->add(gluon->rnn->RNNCell(100, input_size=>100)); - 
$net->add(gluon->rnn->GRUCell(100, input_size=>100));
-    check_rnn_forward($net, mx->nd->ones([8, 3, 200]));
-}
-
-test_rnn_cells();
-
-sub check_rnn_layer_forward
-{
-    my ($layer, $inputs, $states) = @_;
-    $layer->collect_params()->initialize();
-    $inputs->attach_grad;
-    my $out;
-    mx->autograd->record(sub {
-        if(defined $states)
-        {
-            $out = $layer->($inputs, $states);
-            ok(@$out == 2);
-            $out = $out->[0];
-        }
-        else
-        {
-            $out = $layer->($inputs);
-            ok(blessed $out and $out->isa('AI::MXNet::NDArray'));
-        }
-        $out->backward();
-    });
-
-    my $pdl_out = $out->aspdl;
-    my $pdl_dx = $inputs->grad->aspdl;
-    $layer->hybridize;
-    mx->autograd->record(sub {
-        if(defined $states)
-        {
-            ($out, $states) = $layer->($inputs, $states);
-            ok(blessed $out and $out->isa('AI::MXNet::NDArray'));
-        }
-        else
-        {
-            $out = $layer->($inputs);
-            ok(blessed $out and $out->isa('AI::MXNet::NDArray'));
-        }
-        $out->backward();
-    });
-    ok(almost_equal($pdl_out, $out->aspdl, 1e-3));
-    ok(almost_equal($pdl_dx, $inputs->grad->aspdl, 1e-3));
-}
-
-sub test_rnn_layers
-{
-    check_rnn_layer_forward(gluon->rnn->RNN(10, 2), mx->nd->ones([8, 3, 20]));
-    check_rnn_layer_forward(gluon->rnn->RNN(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), mx->nd->ones([4, 3, 10]));
-    check_rnn_layer_forward(gluon->rnn->LSTM(10, 2), mx->nd->ones([8, 3, 20]));
-    check_rnn_layer_forward(gluon->rnn->LSTM(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), [mx->nd->ones([4, 3, 10]), mx->nd->ones([4, 3, 10])]);
-    check_rnn_layer_forward(gluon->rnn->GRU(10, 2), mx->nd->ones([8, 3, 20]));
-    check_rnn_layer_forward(gluon->rnn->GRU(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), mx->nd->ones([4, 3, 10]));
-}
-
-test_rnn_layers();
-
diff --git a/perl-package/AI-MXNet/t/test_gluon_trainer.t b/perl-package/AI-MXNet/t/test_gluon_trainer.t
deleted file mode 100644
index 3b1130af4ecf..000000000000
--- a/perl-package/AI-MXNet/t/test_gluon_trainer.t
+++ /dev/null
@@ -1,255 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
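-
-# Tests for AI::MXNet::Gluon::Trainer: parameter updates across multiple
-# contexts, optimizer state save/load, sparse (row_sparse) parameters,
-# and kvstore reset behaviour.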
- -use strict; -use warnings; -use Test::More tests => 30; -use AI::MXNet qw(mx); -use AI::MXNet::Gluon qw(gluon); -use AI::MXNet::Gluon::NN qw(nn); -use AI::MXNet::TestUtils qw(almost_equal dies_ok); -use Scalar::Util qw(refaddr); -use AI::MXNet::Base; -$ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; -$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; - -sub test_multi_trainer -{ - my $x = gluon->Parameter('x', shape=>[10], stype=>'row_sparse'); - $x->initialize(); - # test set trainer - my $trainer0 = gluon->Trainer([$x], 'sgd'); - ok(refaddr($x->_trainer) == refaddr($trainer0)); - # test unset trainer - $x->_set_trainer(undef); - ok(not defined $x->_trainer); - $x->_set_trainer($trainer0); - # multiple trainers for a sparse Parameter are not allowed - dies_ok(sub { gluon->Trainer([$x], 'sgd') }); -} - -sub test_trainer -{ - my $dict_equ = sub { my ($a, $b) = @_; - is_deeply({ map { $_ => 1 } keys %$a }, { map { $_ => 1 } keys %$b }); - for my $k (keys %$a) - { - ok(($a->{$k}->aspdl == $b->{$k}->aspdl)->all); - } - }; - my $x = gluon->Parameter('x', shape=>[10]); - $x->initialize(ctx=>[mx->cpu(0), mx->cpu(1)], init=>'zeros'); - my $trainer = gluon->Trainer([$x], 'sgd', {'learning_rate'=> 1.0, 'momentum'=> 0.5}); - my $y; - mx->autograd->record(sub { - for my $w (@{ $x->list_data() }) - { - $y = $w + 1; - $y->backward(); - } - }); - $trainer->step(1); - - ok(($x->data(mx->cpu(1))->aspdl == -2)->all); - - $x->lr_mult(0.5); - - mx->autograd->record(sub { - for my $w (@{ $x->list_data() }) - { - $y = $w + 1; - $y->backward(); - } - }); - $trainer->step(1); - - ok(($x->data(mx->cpu(1))->aspdl == -4)->all); - - $trainer->save_states('test_trainer.states'); - my $states; - if($trainer->update_on_kvstore) - { - $states = { %{ $trainer->kvstore->_updater->states } }; - } - else - { - $states = { %{ $trainer->_updaters->[0]->states } }; - } - $trainer->load_states('test_trainer.states'); - if($trainer->update_on_kvstore) - { - $dict_equ->($trainer->kvstore->_updater->states, $states); - ok($trainer->_optimizer eq $trainer->kvstore->_updater->optimizer); - } - else - { - for my $updater (@{ $trainer->_updaters }) - { - $dict_equ->($updater->states, $states); - } - ok($trainer->_optimizer eq $trainer->_updaters->[0]->optimizer); - } - - dies_ok(sub { $trainer->update(1 ) }); - dies_ok(sub { $trainer->allreduce_grads() }); - - $x = gluon->Parameter('x', shape=>[10]); - $x->initialize(ctx=>[mx->cpu(0), mx->cpu(1)], init=>'zeros'); - my $trainer2 = gluon->Trainer([$x], 'sgd', {learning_rate => 1.0, momentum => 0.5}, - update_on_kvstore=>0); - mx->autograd->record(sub { - for(enumerate($x->list_data)) - { - my ($i, $w) = @$_; - my $y = $i*$w; - $y->backward; - } - }); - ok(($x->grad(mx->cpu(0))->aspdl != $x->grad(mx->cpu(1))->aspdl)->all); - $trainer2->allreduce_grads; - ok(($x->grad(mx->cpu(0))->aspdl == $x->grad(mx->cpu(1))->aspdl)->all); - $trainer2->update(1); - ok(($x->data(mx->cpu(1))->aspdl == -1)->all); - -} - -test_trainer(); - -sub test_trainer_sparse_save_load -{ - my $x = gluon->Parameter('x', shape=>[10, 1], lr_mult=>1.0, stype=>'row_sparse'); - $x->initialize(ctx=>[mx->cpu(0)], init=>'zeros'); - my $trainer = gluon->Trainer([$x], 'sgd', {learning_rate => 0.1}); - my $all_rows = mx->nd->arange(start => 0, stop => 10, ctx => mx->cpu(0)); - mx->autograd->record(sub { - for my $w (@{ $x->list_row_sparse_data($all_rows) }) - { - my $y = $w * 1; - $y->backward(); - } - }); - $trainer->step(1); - ok($trainer->kvstore->_updater->optimizer->_get_lr(0) == 0.1); - $trainer->save_states('test_trainer_save_load.states'); 
- $trainer->load_states('test_trainer_save_load.states'); - $x->lr_mult(2.0); - # check if parameter dict is correctly associated with optimizer after load_state - ok($trainer->kvstore->_updater->optimizer->_get_lr(0) == 0.2); -} - -test_trainer_sparse_save_load(); - -sub test_trainer_multi_layer_init -{ - local($ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE}) = 0; - package Net { - use AI::MXNet::Gluon::Mouse; - extends 'AI::MXNet::Gluon::Block'; - use AI::MXNet::Function::Parameters; - sub BUILD { - my $self = shift; - $self->name_scope(sub { - # sparse param - $self->embed_weight($self->params->get('embed_weight', stype=>'row_sparse', - shape=>[4,3], grad_stype=>'row_sparse')); - # dense param from a hybrid block - $self->dense0(nn->Dense(2)); - }); - } - method forward($x) - { - my $embed_weight = $self->embed_weight->row_sparse_data($x); - my $embed = mx->nd->Embedding(data=>$x, weight=>$embed_weight, - input_dim=>4, output_dim=>3, sparse_grad=>1); - return $self->dense0->($embed); - } - }; - my $check_init = sub { my ($ctxes) = @_; - my $net = Net->new(prefix=>'net_'); - $net->initialize(mx->init->One(), ctx=>$ctxes); - my $trainer = gluon->Trainer($net->collect_params(), 'sgd', {learning_rate => 1}); - my $data = mx->nd->array([[0,2], [1,2]]); - my $xs = gluon->utils->split_and_load($data, ctx_list => $ctxes); - my @ys; - mx->autograd->record(sub { - for my $x (@{ $xs }) - { - my $y = $net->($x); - push @ys, $y; - } - }); - for my $y (@ys) - { - $y->backward; - } - $trainer->step(1); - # all parameters should be initialized - ok(not @{ $trainer->_params_to_init }); - my $all_rows = mx->nd->arange(start => 0, stop => 4, ctx=>mx->cpu(1)); - # check the updated weights - my $weight = $net->embed_weight->row_sparse_data($all_rows)->aspdl; - ok(($weight->at(0) == -1)->all); - ok(($weight->at(1) == -1)->all); - ok(($weight->at(2) == -3)->all); - ok(($weight->at(3) == 1)->all); - }; - $check_init->([mx->cpu(1), mx->cpu(2)]); - $check_init->([mx->cpu(1)]); -} - -test_trainer_multi_layer_init(); - -sub test_trainer_reset_kv -{ - my $check_trainer_reset_kv = sub { my ($kv) = @_; - my $params = gluon->ParameterDict(); - my $x = $params->get('x', shape=>[10], lr_mult=>1.0); - $params->initialize(ctx=>[mx->cpu(0), mx->cpu(1)], init=>'zeros'); - my $trainer = gluon->Trainer($params, 'sgd', {learning_rate => 0.1}, kvstore=>$kv); - $params->save('test_trainer_reset_kv.params'); - mx->autograd->record(sub { - for my $w (@{ $x->list_data }) - { - my $y = $w + 1; - $y->backward; - } - }); - $trainer->step(1); - is($trainer->kvstore->type, $kv); - # load would reset kvstore - $params->load('test_trainer_reset_kv.params', ctx => [mx->cpu(0), mx->cpu(1)]); - ok(not defined $trainer->kvstore); - ok (defined $trainer->_kv_initialized and not $trainer->_kv_initialized); - mx->autograd->record(sub { - for my $w (@{ $x->list_data }) - { - my $y = $w + 1; - $y->backward; - } - }); - $trainer->step(1); - # the updated parameter should be based on the loaded checkpoint - ok(($x->data(mx->cpu()) == -0.2)->aspdl->all); - }; - my @kvs = ('local', 'device'); - for my $kv (@kvs) - { - $check_trainer_reset_kv->($kv); - } -} - -test_trainer_reset_kv(); diff --git a/perl-package/AI-MXNet/t/test_infer_shape.t b/perl-package/AI-MXNet/t/test_infer_shape.t deleted file mode 100644 index 157d1133d6f5..000000000000 --- a/perl-package/AI-MXNet/t/test_infer_shape.t +++ /dev/null @@ -1,142 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 18; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(mlp2); - -sub _test_shapes -{ - my ($sym, $arg_shapes, %expected_shapes) = @_; - my %arg_shape_dict; - @arg_shape_dict{ @{ $sym->list_arguments() } } = @{ $arg_shapes }; - while(my ($k, $v) = each %expected_shapes) - { - is_deeply($arg_shape_dict{$k}, $v); - } -} - -sub test_mlp2_infer_shape -{ - # Build MLP - my $out = mlp2(); - # infer shape - my $data_shape = [100, 100]; - my($arg_shapes, $out_shapes, $aux_shapes) = $out->infer_shape(data=>$data_shape); - ok(@$out_shapes == 1); - is_deeply($out_shapes->[0], [100, 10]); - my %true_shapes = ( - fc2_bias => [10], - fc2_weight => [10, 1000], - fc1_bias => [1000], - fc1_weight => [1000,100] - ); - _test_shapes($out, $arg_shapes, %true_shapes); -} - -sub test_mlp2_infer_error -{ - # Test shape inconsistent case - my $out = mlp2(); - my $weight_shape = [1, 100]; - my $data_shape = [100, 100]; - eval { $out->infer_shape(data=>$data_shape, fc1_weight=>$weight_shape) }; - like($@, qr/Shape inconsistent/); -} - -sub test_backward_infer -{ - my $w = mx->sym->Variable("weight"); - my $wshift = mx->sym->Variable("wshift", shape=>[1]); - my $data = mx->sym->Variable("data"); - # broadcast add here, not being able to deduce shape correctly - my $wt = mx->sym->broadcast_add($w, $wshift); - # shape constraint, this is what enables backward shape inference - $wt = mx->sym->_identity_with_attr_like_rhs($wt, $w); - my $net = mx->sym->FullyConnected(data=>$data, weight=>$wt, num_hidden=>11, no_bias=>1); - my $data_shape = [7, 100]; - my ($arg_shapes, $out_shapes, $aux_shapes) = $net->infer_shape(data=>$data_shape); - _test_shapes($net, $arg_shapes, weight=>[11,100]); -} - -sub test_incomplete_infer_elewise -{ - my $a = mx->sym->Variable('a', shape=>[0, 10]); - my $b = mx->sym->Variable('b', shape=>[12, 0]); - my $c = $a + $b; - my ($arg_shapes) = $c->infer_shape(); - _test_shapes($c, $arg_shapes, a=>[12,10], b=>[12,10]); -} - -sub test_incomplete_infer_mlp -{ - my $a = mx->sym->Variable('a', shape=>[0, 10]); - my $b = mx->sym->FullyConnected(data=>$a, num_hidden=>21); - my $c = mx->sym->Variable('c', shape=>[5, 0]); - my $d = $b + $c; - my ($arg_shapes) = $d->infer_shape(); - _test_shapes($d, $arg_shapes, a=>[5,10], c=>[5,21]); -} - -sub test_incomplete_infer_slicechannel -{ - my $a = mx->sym->Variable('a', shape=>[0, 10]); - my $b = mx->sym->SliceChannel(data=>$a, num_outputs=>10, axis=>1, squeeze_axis=>1); - my $c = mx->sym->Variable('c', shape=>[5]); - my $d = @{$b}[1] + $c; - my ($arg_shapes) = $d->infer_shape(); - _test_shapes($d, $arg_shapes, a=>[5,10]); - - $a = mx->sym->Variable('a', shape=>[0, 15, 0]); - $b = mx->sym->SliceChannel(data=>$a, num_outputs=>3, squeeze_axis=>0); - $c = mx->sym->Variable('c', shape=>[3, 5, 2]); - $d = @{$b}[1] + $c; - ($arg_shapes) = 
$d->infer_shape(); - _test_shapes($d, $arg_shapes, a=>[3,15,2]); -} - -sub test_incomplete_infer_convolution -{ - my $a = mx->sym->Variable('a', shape=>[0, 10, 0, 0]); - my $b = mx->sym->Convolution(data=>$a, num_filter=>21, kernel=>[3, 3], dilate=>[1, 1], pad=>[1, 1]); - my $c = mx->sym->Variable('c', shape=>[5, 21, 32, 32]); - my $d = $b + $c; - my ($arg_shapes) = $d->infer_shape(); - _test_shapes($d, $arg_shapes, a=>[5, 10, 32, 32]); -} - -sub test_incomplete_infer_concat -{ - my $a = mx->sym->Variable('a', shape=>[0, 10]); - my $b = mx->sym->Variable('b', shape=>[0, 5]); - my $c = mx->sym->Concat($a, $b, num_args=>2, dim=>1); - my $d = mx->sym->Variable('d', shape=>[2, 0]); - $d = $d + $c; - my ($arg_shapes) = $d->infer_shape(); - _test_shapes($d, $arg_shapes, a=>[2,10], b=>[2,5], d=>[2,15]); -} - -test_mlp2_infer_shape(); -test_mlp2_infer_error(); -test_backward_infer(); -test_incomplete_infer_elewise(); -test_incomplete_infer_mlp(); -test_incomplete_infer_slicechannel(); -test_incomplete_infer_convolution(); -test_incomplete_infer_concat(); diff --git a/perl-package/AI-MXNet/t/test_init.t b/perl-package/AI-MXNet/t/test_init.t deleted file mode 100644 index c697e99bce0f..000000000000 --- a/perl-package/AI-MXNet/t/test_init.t +++ /dev/null @@ -1,78 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
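-
-# Tests for mxnet initializers: the default PReLU init, per-variable init
-# overrides, auxiliary-state init for BatchNorm, and constant init of
-# row_sparse weights.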
- -use strict; -use warnings; -# use Test::More tests => 7; https://github.com/apache/incubator-mxnet/issues/17988 -use Test::More tests => 4; -use AI::MXNet qw(mx); - -sub test_default_init -{ - my $data = mx->sym->Variable('data'); - my $sym = mx->sym->LeakyReLU(data => $data, act_type => 'prelu'); - my $mod = mx->mod->Module($sym); - $mod->bind(data_shapes=>[['data', [10,10]]]); - $mod->init_params; - ok((((values %{ ($mod->get_params)[0] }))[0]->aspdl == 0.25)->all); -} - -sub test_variable_init -{ - my $data = mx->sym->Variable('data'); - my $gamma = mx->sym->Variable('gamma', init => mx->init->One()); - my $sym = mx->sym->LeakyReLU(data => $data, gamma => $gamma, act_type => 'prelu'); - my $mod = mx->mod->Module($sym); - $mod->bind(data_shapes=>[['data', [10,10]]]); - $mod->init_params(); - ok(((values %{ ($mod->get_params)[0] })[0]->aspdl == 1)->all); -} - -sub test_aux_init -{ - my $data = mx->sym->Variable('data'); - my $sym = mx->sym->BatchNorm(data => $data, name => 'bn'); - my $mod = mx->mod->Module($sym); - $mod->bind(data_shapes=>[['data', [10, 10, 3, 3]]]); - $mod->init_params(); - ok((($mod->get_params)[1]->{bn_moving_var}->aspdl == 1)->all); - ok((($mod->get_params)[1]->{bn_moving_mean}->aspdl == 0)->all); -} - -$ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; -sub test_rsp_const_init -{ - my $check_rsp_const_init = sub { my ($init, $val) = @_; - my $shape = [10, 10]; - my $x = mx->symbol->Variable("data", stype=>'csr'); - my $weight = mx->symbol->Variable("weight", shape=>[$shape->[1], 2], - init=>$init, stype=>'row_sparse'); - my $dot = mx->symbol->sparse->dot($x, $weight); - my $mod = mx->mod->Module($dot); - $mod->bind(data_shapes=>[['data', $shape]]); - $mod->init_params; - ok(((values %{ ($mod->get_params)[0] })[0]->aspdl == $val)->all); - }; - $check_rsp_const_init->(mx->initializer->Constant(value=>2), 2); - $check_rsp_const_init->(mx->initializer->Zero(), 0); - $check_rsp_const_init->(mx->initializer->One(), 1); -} - -# test_rsp_const_init(); https://github.com/apache/incubator-mxnet/issues/17988 -test_default_init(); -test_variable_init(); -test_aux_init(); diff --git a/perl-package/AI-MXNet/t/test_io.t b/perl-package/AI-MXNet/t/test_io.t deleted file mode 100644 index 557925b88610..000000000000 --- a/perl-package/AI-MXNet/t/test_io.t +++ /dev/null @@ -1,194 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
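-
-# Tests for the data iterators: ImageRecordIter over CIFAR-10 records,
-# NDArrayIter over dense and CSR data, and MNISTIter.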
-
-use strict;
-use warnings;
-use AI::MXNet qw(mx);
-use Test::More tests => 36;
-use AI::MXNet::TestUtils qw(same reldiff almost_equal GetMNIST_ubyte GetCifar10 randint rand_sparse_ndarray dies_ok);
-use PDL;
-use PDL::Types;
-use PDL::NiceSlice;
-use Hash::Ordered;
-$|++;
-
-
-sub test_Cifar10Rec()
-{
-    GetCifar10();
-    my $dataiter = mx->io->ImageRecordIter({
-        path_imgrec => "data/cifar/train.rec",
-        mean_img => "data/cifar/cifar10_mean.bin",
-        rand_crop => 0,
-        rand_mirror => 0,
-        shuffle => 0,
-        data_shape => [3,28,28],
-        batch_size => 100,
-        preprocess_threads => 4,
-        prefetch_buffer => 1
-    });
-    my @labelcount;
-    my $batchcount = 0;
-    while(my $batch = <$dataiter>)
-    {
-        my $nplabel = $batch->label->[0];
-        for my $i (0..$nplabel->shape->[0]-1)
-        {
-            $labelcount[int($nplabel->at($i)->asscalar)] += 1;
-        }
-    }
-    for my $i (0..9)
-    {
-        ok($labelcount[$i] == 5000);
-    }
-}
-
-sub test_NDArrayIter()
-{
-    my $datas = ones(PDL::Type->new(6), 2, 2, 1000);
-    my $labels = ones(PDL::Type->new(6), 1, 1000);
-    for my $i (0..999)
-    {
-        $datas(:,:,$i) .= $i / 100;
-        $labels(:,$i) .= $i / 100;
-    }
-    my $dataiter = mx->io->NDArrayIter(
-        data => $datas,
-        label => $labels,
-        batch_size => 128,
-        shuffle => 1,
-        last_batch_handle => 'pad'
-    );
-    my $batchidx = 0;
-    while(<$dataiter>)
-    {
-        $batchidx += 1;
-    }
-    is($batchidx, 8);
-    $dataiter = mx->io->NDArrayIter(
-        data => $datas,
-        label => $labels,
-        batch_size => 128,
-        shuffle => 0,
-        last_batch_handle => 'pad'
-    );
-    $batchidx = 0;
-    my @labelcount;
-    my $i = 0;
-    for my $batch (@{ $dataiter })
-    {
-        my $label = $batch->label->[0];
-        my $flabel = $label->aspdl->flat;
-        ok($batch->data->[0]->aspdl->slice(0,0,'X')->flat->at(0) == $flabel->at(0));
-        for my $i (0..$label->shape->[0]-1)
-        {
-            $labelcount[$flabel->at($i)] += 1;
-        }
-    }
-    for my $i (0..9)
-    {
-        if($i == 0)
-        {
-            ok($labelcount[$i] == 124);
-        }
-        else
-        {
-            ok($labelcount[$i] == 100);
-        }
-    }
-}
-
-sub test_NDArrayIter_csr
-{
-    # creating toy data
-    my $num_rows = 20;
-    my $num_cols = 20;
-    my $batch_size = 6;
-    my $shape = [$num_rows, $num_cols];
-    my ($csr) = rand_sparse_ndarray($shape, 'csr', density => 0.5);
-    my $dns = $csr->aspdl;
-    dies_ok(sub { mx->io->NDArrayIter->new(data => $csr, label => $dns, batch_size => $batch_size) });
-
-    # AI::MXNet::NDArray::CSR with shuffle
-    my $csr_iter = mx->io->NDArrayIter(
-        data => Hash::Ordered->new(csr_data => $csr, dns_data => $dns),
-        label => $dns,
-        batch_size => $batch_size,
-        shuffle=>1, last_batch_handle=>'discard'
-    );
-    my $num_batch = 0;
-    for my $batch (@{ $csr_iter })
-    {
-        $num_batch += 1;
-    }
-
-    ok($num_batch == int($num_rows / $batch_size));
-
-    # make iterators
-    $csr_iter = mx->io->NDArrayIter(data => $dns, label => $dns, batch_size => $batch_size, last_batch_handle=>'discard');
-    my $begin = 0;
-    for my $batch (@{ $csr_iter })
-    {
-        my $expected = mx->nd->zeros([$batch_size, $num_cols])->aspdl;
-        my $end = $begin + $batch_size;
-        $expected->slice('X', [0, $batch_size-1]) .= $dns->slice('X', [$begin, $end-1]);
-        if($end > $num_rows)
-        {
-            $expected->slice('X', [0, $end - $num_rows - 1]) .= $dns->slice('X', [0, $end - $num_rows - 1]);
-        }
-        ok(almost_equal($batch->data->[0]->aspdl, $expected));
-        $begin += $batch_size;
-    }
-}
-
-sub test_MNISTIter()
-{
-    GetMNIST_ubyte();
-
-    my $batch_size = 100;
-    my $train_dataiter = mx->io->MNISTIter({
-        image => "data/train-images-idx3-ubyte",
-        label => "data/train-labels-idx1-ubyte",
-        data_shape => [784],
-        batch_size => $batch_size,
-        shuffle => 1,
-        flat => 1,
-        silent => 0,
-        seed => 10
-    });
-    # test_loop
-    my $nbatch = 60000 /
$batch_size; - my $batch_count = 0; - for my $batch (@{ $train_dataiter}) - { - $batch_count += 1; - } - ok($nbatch == $batch_count); - # test_reset - $train_dataiter->reset(); - $train_dataiter->iter_next(); - my $label_0 = $train_dataiter->getlabel->aspdl->flat; - $train_dataiter->iter_next; - $train_dataiter->iter_next; - $train_dataiter->iter_next; - $train_dataiter->reset; - $train_dataiter->iter_next; - my $label_1 = $train_dataiter->getlabel->aspdl->flat; - ok(sum($label_0 - $label_1) == 0); -} - -test_NDArrayIter(); -test_NDArrayIter_csr(); -test_MNISTIter(); -test_Cifar10Rec(); diff --git a/perl-package/AI-MXNet/t/test_io_image.t b/perl-package/AI-MXNet/t/test_io_image.t deleted file mode 100644 index dc83d963e5a1..000000000000 --- a/perl-package/AI-MXNet/t/test_io_image.t +++ /dev/null @@ -1,47 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 1; -use AI::MXNet qw(mx); -use Time::HiRes qw(time); - -sub run_imageiter -{ - my ($path_rec, $n, $batch_size) = @_; - $batch_size //= 32; - my $data = mx->img->ImageIter( - batch_size=>$batch_size, - data_shape=>[3, 224, 224], - path_imgrec=>$path_rec, - kwargs => { rand_crop=>1, - rand_resize=>1, - rand_mirror=>1 } - ); - $data->reset(); - my $tic = time; - for my $i (1..$n) - { - $data->next; - mx->nd->waitall; - warn("average speed after iteration $i is " . $batch_size*$i/(time - $tic) . " samples/sec"); - } -} - -run_imageiter('data/cifar/test.rec', 20); -ok(1); \ No newline at end of file diff --git a/perl-package/AI-MXNet/t/test_kvstore.t b/perl-package/AI-MXNet/t/test_kvstore.t deleted file mode 100644 index 5e3f49f78164..000000000000 --- a/perl-package/AI-MXNet/t/test_kvstore.t +++ /dev/null @@ -1,262 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
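-
-# Tests for the kvstore API: init/push/pull with single and list keys,
-# row_sparse_pull, gradient aggregation across devices, and custom updaters.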
- -use strict; -use warnings; -use Test::More tests => 62; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(almost_equal randint zip rand_ndarray); -use AI::MXNet::Base qw(pzeros); - -my $shape = [4, 4]; -my $keys = [5,7,9]; - -sub init_kv -{ - # init kv - my $kv = mx->kv->create(); - # single - $kv->init(3, mx->nd->zeros($shape)); - # list - $kv->init($keys, [map { mx->nd->zeros($shape) } 0..@$keys-1]); - return $kv; -} - -sub check_diff_to_scalar -{ - # assert A == x - my ($A, $x) = @_; - ok(($A - $x)->aspdl->abs->sum == 0); -} - -sub test_single_kv_pair -{ - # single key-value pair push & pull - my $kv = init_kv(); - $kv->push(3, mx->nd->ones($shape)); - my $val = mx->nd->empty($shape); - $kv->pull(3, out => $val); - check_diff_to_scalar($val, 1); -} - -sub test_init -{ - my $kv = mx->kv->create(); - $kv->init(3, mx->nd->ones($shape)*4); - my $a = mx->nd->zeros($shape); - $kv->pull(3, out=>$a); - check_diff_to_scalar($a, 4); -} - -sub test_list_kv_pair -{ - # list key-value pair push & pull - my $kv = init_kv(); - $kv->push($keys, [map {mx->nd->ones($shape)*4} 0..@$keys-1]); - my $val = [map { mx->nd->empty($shape) } 0..@$keys-1]; - $kv->pull($keys, out => $val); - for my $v (@$val) - { - check_diff_to_scalar($v, 4); - } -} - -sub test_row_sparse_pull -{ - my $kv = mx->kv->create(); - $kv->init('e', mx->nd->ones($shape)->tostype('row_sparse')); - - my $check_row_sparse_pull = sub { my ($kv, $count) = @_; - my $num_rows = $shape->[0]; - my $vals = []; - my $row_ids = []; - my $all_row_ids = mx->nd->array([0..$num_rows-1])->aspdl; - for my $i (0..$count-1) - { - push @$vals, mx->nd->zeros($shape)->tostype('row_sparse'); - my $row_id = [map { randint(0, $num_rows) } 1..$num_rows]; - push @$row_ids, mx->nd->array($row_id)->reshape([2, int($num_rows/2)]); - } - my $row_ids_to_pull = @$row_ids == 1 ? $row_ids->[0] : $row_ids; - my $vals_to_pull = @$vals == 1 ? $vals->[0] : $vals; - - $kv->row_sparse_pull('e', out=>$vals_to_pull, row_ids=>$row_ids_to_pull); - zip(sub { - my ($val, $row_id) = @_; - my $retained_val = $val->aspdl; - my %excluded_row_ids = map { $_ => 1 } @{ PDL::setops($all_row_ids, 'XOR', $row_id->aspdl)->unpdl }; - for my $row (0..$num_rows-1) - { - my $expected_val = pzeros(@{ $retained_val->at($row)->shape->unpdl }); - $expected_val += exists $excluded_row_ids{ $row } ? 
0 : 1; - ok(almost_equal($retained_val->at($row), $expected_val)); - } - }, $vals, $row_ids); - }; - $check_row_sparse_pull->($kv, 1); - $check_row_sparse_pull->($kv, 4); -} - -sub test_sparse_aggregator -{ - my $stype = 'row_sparse'; - my $kv = mx->kv->create($stype); - $kv->init('a', mx->nd->zeros($shape, stype=>$stype)); - $kv->init($keys, [map { mx->nd->zeros($shape, stype=>$stype) } 0..@$keys-1]); - - # devices - my $num_devs = 4; - my $devs = [map { mx->cpu($_) } 0..$num_devs-1]; - - # single - my $vals = [map { rand_ndarray($shape, $stype)->copyto($devs->[$_]) } 0..$num_devs-1]; - my $expected_sum = mx->nd->zeros($shape)->aspdl; - for my $v (@$vals) - { - $expected_sum += $v->aspdl; - } - - # prepare row_ids - my $all_rows = mx->nd->array([0..$shape->[0]-1]); - $kv->push('a', $vals); - $kv->row_sparse_pull('a', out=>$vals, row_ids=>[($all_rows)x@$vals]); - my $result_sum = mx->nd->zeros($shape)->aspdl; - for my $v (@$vals) - { - $result_sum += $v->aspdl; - } - ok(almost_equal($result_sum, $expected_sum * $num_devs)); - - # list - $vals = [([map { rand_ndarray($shape, $stype)->copyto($devs->[$_]) } 0..$num_devs-1])x@$keys]; - $expected_sum = mx->nd->zeros($shape)->aspdl; - for my $v (@{ $vals->[0] }) - { - $expected_sum += $v->aspdl; - } - - $kv->push($keys, $vals); - $kv->row_sparse_pull($keys, out=>$vals, row_ids=>[([($all_rows)x$num_devs])x@$vals]); - for my $vv (@$vals) - { - $result_sum = mx->nd->zeros($shape)->aspdl; - for my $v (@$vv) - { - $result_sum += $v->aspdl; - } - ok(almost_equal($result_sum, $expected_sum * $num_devs)) - } -} - -sub test_aggregator -{ - # aggregate value on multiple devices - - my $kv = init_kv(); - - # devices - my $num_devs = 4; - my $devs = [map { mx->cpu($_) } 0..$num_devs-1]; - - # single - my $vals = [map { mx->nd->ones($shape, ctx => $_) } @$devs]; - - $kv->push(3, $vals); - $kv->pull(3, out => $vals); - - for my $v (@$vals) - { - check_diff_to_scalar($v, $num_devs); - } - # list - - $vals = [map { [map { mx->nd->ones($shape, ctx => $_)*2 } @$devs] } 0..@$keys-1]; - $kv->push($keys, $vals); - $kv->pull($keys, out => $vals); - - for my $vv (@{ $vals }) - { - for my $v (@{ $vv }) - { - check_diff_to_scalar($v, $num_devs * 2); - } - } -} - -sub updater -{ - my ($key, $recv, $local) = @_; - $local += $recv; -} - -sub test_updater -{ - my ($dev) = @_; - $dev //= 'cpu'; - my $kv = init_kv(); - $kv->_set_updater(\&updater); - - # devices - my $num_devs = 4; - my $devs = [map { mx->$dev($_) } 0..$num_devs-1]; - - # single - my $vals = [map { mx->nd->ones($shape, ctx => $_) } @$devs]; - - $kv->push(3, $vals); - $kv->pull(3, out => $vals); - - for my $v (@$vals) - { - check_diff_to_scalar($v, $num_devs); - } - - # list - $vals = [map { [map { mx->nd->ones($shape, ctx => $_) } @$devs] } 0..@$keys-1]; - - my $num_push = 10; - for my $i (0..$num_push-1) - { - $kv->push($keys, $vals); - } - - $kv->pull($keys, out => $vals); - - for my $vv (@{ $vals }) - { - for my $v (@{ $vv }) - { - check_diff_to_scalar($v, $num_devs * $num_push); - } - } -} - -sub test_get_type -{ - my $kvtype = 'local_allreduce_cpu'; - my $kv = mx->kv->create($kvtype); - is($kv->type, $kvtype); -} - -test_init(); -test_get_type(); -test_single_kv_pair(); -test_list_kv_pair(); -test_aggregator(); -test_updater(); -test_row_sparse_pull(); -test_sparse_aggregator(); diff --git a/perl-package/AI-MXNet/t/test_loss.t b/perl-package/AI-MXNet/t/test_loss.t deleted file mode 100644 index 5a9e413bbfaf..000000000000 --- a/perl-package/AI-MXNet/t/test_loss.t +++ /dev/null @@ -1,481 +0,0 @@ -# Licensed
to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 32; -use AI::MXNet 'mx'; -use AI::MXNet::Gluon 'gluon'; -use AI::MXNet::TestUtils 'almost_equal'; -use Hash::Ordered; - -sub test_loss_ndarray -{ - mx->random->seed(1234); - srand(1234); - my $output = mx->nd->array([1, 2, 3, 4]); - my $label = mx->nd->array([1, 3, 5, 7]); - my $weighting = mx->nd->array([0.5, 1, 0.5, 1]); - - my $loss = gluon->loss->L1Loss(); - ok(mx->nd->sum($loss->($output, $label))->asscalar() == 6); - $loss = gluon->loss->L1Loss(weight=>0.5); - ok(mx->nd->sum($loss->($output, $label))->asscalar() == 3); - $loss = gluon->loss->L1Loss(); - ok(mx->nd->sum($loss->($output, $label, $weighting))->asscalar() == 5); - - $loss = gluon->loss->L2Loss(); - ok(mx->nd->sum($loss->($output, $label))->asscalar() == 7); - $loss = gluon->loss->L2Loss(weight=>0.25); - ok(mx->nd->sum($loss->($output, $label))->asscalar() == 1.75); - $loss = gluon->loss->L2Loss(); - ok(mx->nd->sum($loss->($output, $label, $weighting))->asscalar() == 6); - - $output = mx->nd->array([[0, 2], [1, 4]]); - $label = mx->nd->array([0, 1]); - $weighting = mx->nd->array([[0.5], [1.0]]); - - $loss = gluon->loss->SoftmaxCrossEntropyLoss(); - my $L = $loss->($output, $label)->aspdl(); - ok(almost_equal($L, mx->nd->array([ 2.12692809, 0.04858733])->aspdl)); - - $L = $loss->($output, $label, $weighting)->aspdl(); - ok(almost_equal($L, mx->nd->array([ 1.06346405, 0.04858733])->aspdl)); -} - -test_loss_ndarray(); - -sub get_net -{ - my ($num_hidden, $flatten) = @_; - $flatten //= 1; - my $data = mx->symbol->Variable('data'); - my $fc1 = mx->symbol->FullyConnected($data, name=>'fc1', num_hidden=>128, flatten=>$flatten); - my $act1 = mx->symbol->Activation($fc1, name=>'relu1', act_type=>"relu"); - my $fc2 = mx->symbol->FullyConnected($act1, name => 'fc2', num_hidden => 64, flatten=>$flatten); - my $act2 = mx->symbol->Activation($fc2, name=>'relu2', act_type=>"relu"); - my $fc3 = mx->symbol->FullyConnected($act2, name=>'fc3', num_hidden=>$num_hidden, flatten=>$flatten); - return $fc3; -} - -sub test_ce_loss -{ - mx->random->seed(1234); - srand(1234); - my $nclass = 10; - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, $nclass]); - my $label = mx->nd->array([qw/3 6 5 4 8 9 1 7 9 6 8 0 5 0 9 6 2 0 5 2/], dtype=>'int32'); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label'); - my $output = get_net($nclass); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->SoftmaxCrossEntropyLoss(); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - local($AI::MXNet::Logging::silent) = 1; - $mod->fit($data_iter, num_epoch=>200, 
optimizer_params=>{learning_rate => 0.01}, - eval_metric=>mx->metric->Loss(), optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.1); -} - -test_ce_loss(); - -sub test_bce_loss -{ - mx->random->seed(1234); - srand(1234); - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, 20]); - my $label = mx->nd->array([qw/1 1 0 1 0 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0/], dtype=>'float32'); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label'); - my $output = get_net(1); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->SigmoidBinaryCrossEntropyLoss(); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - local($AI::MXNet::Logging::silent) = 1; - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - eval_metric=>mx->metric->Loss(), optimizer=>'adam', - initializer=>mx->init->Xavier(magnitude=>2)); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.01); -} - -test_bce_loss(); - -sub test_bce_equal_ce2 -{ - mx->random->seed(1234); - srand(1234); - my $N = 100; - my $loss1 = gluon->loss->SigmoidBCELoss(from_sigmoid=>1); - my $loss2 = gluon->loss->SoftmaxCELoss(from_logits=>1); - my $out1 = mx->random->uniform(0, 1, shape=>[$N, 1]); - my $out2 = mx->nd->log(mx->nd->concat(1-$out1, $out1, dim=>1) + 1e-8); - my $label = mx->nd->round(mx->random->uniform(0, 1, shape=>[$N, 1])); - ok(almost_equal($loss1->($out1, $label)->aspdl, $loss2->($out2, $label)->aspdl)); -} - -test_bce_equal_ce2(); - -sub test_kl_loss -{ - mx->random->seed(1234); - srand(1234); - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, 10]); - my $label = mx->nd->softmax(mx->random->uniform(0, 1, shape=>[$N, 2])); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label'); - my $output = mx->sym->log_softmax(get_net(2)); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->KLDivLoss(); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - eval_metric=>mx->metric->Loss(), optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.05); -} - -test_kl_loss(); - -sub test_l2_loss -{ - mx->random->seed(1234); - srand(1234); - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, 10]); - my $label = mx->nd->softmax(mx->random->uniform(-1, 1, shape=>[$N, 1])); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label', shuffle=>1); - my $output = get_net(1); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->L2Loss(); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - eval_metric=>mx->metric->Loss(), optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.1); -} - -test_l2_loss(); - -sub test_l1_loss -{ - mx->random->seed(1234); - srand(1234); - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, 10]); - my $label = mx->nd->softmax(mx->random->uniform(-1, 1, shape=>[$N, 1])); - my $data_iter = 
mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label', shuffle=>1); - my $output = get_net(1); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->L1Loss(); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - eval_metric=>mx->metric->Loss(), optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.1); -} - -test_l1_loss(); - -sub test_ctc_loss -{ - mx->random->seed(1234); - srand(1234); - my $loss = gluon->loss->CTCLoss(); - my $l = $loss->(mx->nd->ones([2,20,4]), mx->nd->array([[1,0,-1,-1],[2,1,1,-1]])); - ok(almost_equal($l->aspdl, mx->nd->array([18.82820702, 16.50581741])->aspdl)); - - $loss = gluon->loss->CTCLoss(layout=>'TNC'); - $l = $loss->(mx->nd->ones([20,2,4]), mx->nd->array([[1,0,-1,-1],[2,1,1,-1]])); - ok(almost_equal($l->aspdl, mx->nd->array([18.82820702, 16.50581741])->aspdl)); - - $loss = gluon->loss->CTCLoss(layout=>'TNC', label_layout=>'TN'); - $l = $loss->(mx->nd->ones([20,2,4]), mx->nd->array([[1,0,-1,-1],[2,1,1,-1]])->T); - ok(almost_equal($l->aspdl, mx->nd->array([18.82820702, 16.50581741])->aspdl)); - - $loss = gluon->loss->CTCLoss(); - $l = $loss->(mx->nd->ones([2,20,4]), mx->nd->array([[2,1,2,2],[3,2,2,2]]), undef, mx->nd->array([2,3])); - ok(almost_equal($l->aspdl, mx->nd->array([18.82820702, 16.50581741])->aspdl)); - - $loss = gluon->loss->CTCLoss(); - $l = $loss->(mx->nd->ones([2,25,4]), mx->nd->array([[2,1,-1,-1],[3,2,2,-1]]), mx->nd->array([20,20])); - ok(almost_equal($l->aspdl, mx->nd->array([18.82820702, 16.50581741])->aspdl)); - - $loss = gluon->loss->CTCLoss(); - $l = $loss->(mx->nd->ones([2,25,4]), mx->nd->array([[2,1,3,3],[3,2,2,3]]), mx->nd->array([20,20]), mx->nd->array([2,3])); - ok(almost_equal($l->aspdl, mx->nd->array([18.82820702, 16.50581741])->aspdl)); -} - -test_ctc_loss(); - -sub test_ctc_loss_train -{ - mx->random->seed(1234); - srand(1234); - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, 20, 10]); - my $label = mx->nd->arange(start => 4, repeat=>$N)->reshape([$N, 4]); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label', shuffle=>1); - my $output = get_net(5, 0); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->CTCLoss(layout=>'NTC', label_layout=>'NT'); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - initializer=>mx->init->Xavier(magnitude=>2), eval_metric=>mx->metric->Loss(), - optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 20); -} - -test_ctc_loss_train(); - -sub test_sample_weight_loss -{ - mx->random->seed(1234); - srand(1234); - my $nclass = 10; - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, $nclass]); - my $label = mx->nd->array([qw/2 0 8 4 3 4 2 5 5 7 2 3 7 1 2 6 4 2 8 0/], dtype=>'int32'); - my $weight = mx->nd->array([(1)x10,(0)x10]); - my $data_iter = mx->io->NDArrayIter( - $data, - Hash::Ordered->new(label => $label, w => $weight), - batch_size=>10 - ); - my $output = get_net($nclass); - my $l = mx->symbol->Variable('label'); - my $w = mx->symbol->Variable('w'); - my $Loss = 
gluon->loss->SoftmaxCrossEntropyLoss(); - my $loss = $Loss->($output, $l, $w); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label', 'w']); - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - eval_metric=>mx->metric->Loss(), optimizer=>'adam'); - $data_iter = mx->io->NDArrayIter( - $data->slice([10,$data->len-1]), - Hash::Ordered->new(label => $label, w => $weight), - batch_size=>10 - ); - my $score = $mod->score($data_iter, mx->metric->Loss())->{loss}; - ok($score > 1); - $data_iter = mx->io->NDArrayIter( - $data->slice([0,9]), - Hash::Ordered->new(label => $label, w => $weight), - batch_size=>10 - ); - $score = $mod->score($data_iter, mx->metric->Loss())->{loss}; - ok($score < 0.05); -} - -test_sample_weight_loss(); - -sub test_saveload -{ - mx->random->seed(1234); - srand(1234); - my $nclass = 10; - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, $nclass]); - my $label = mx->nd->array([qw/2 0 8 4 3 4 2 5 5 7 2 3 7 1 2 6 4 2 8 0/], dtype=>'int32'); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label'); - my $output = get_net($nclass); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->SoftmaxCrossEntropyLoss(); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - $mod->fit($data_iter, num_epoch=>100, optimizer_params=>{learning_rate => 1}, - eval_metric=>mx->metric->Loss()); - $mod->save_checkpoint('test', 100, 1); - $mod = mx->mod->Module->load('test', 100, 1, - data_names=>['data'], label_names=>['label']); - $mod->fit($data_iter, num_epoch=>100, optimizer_params=>{learning_rate => 1}, - eval_metric=>mx->metric->Loss() - ); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.05); -} - -test_saveload(); - -sub test_logistic_loss_equal_bce -{ - mx->random->seed(1234); - srand(1234); - my $N = 100; - my $loss_binary = gluon->loss->LogisticLoss(label_format=>'binary'); - my $loss_signed = gluon->loss->LogisticLoss(label_format=>'signed'); - my $loss_bce = gluon->loss->SigmoidBCELoss(from_sigmoid=>0); - my $data = mx->random->uniform(-10, 10, shape=>[$N, 1]); - my $label = mx->nd->round(mx->random->uniform(0, 1, shape=>[$N, 1])); - ok(almost_equal($loss_binary->($data, $label)->aspdl, $loss_bce->($data, $label)->aspdl)); - ok(almost_equal($loss_signed->($data, 2 * $label - 1)->aspdl, $loss_bce->($data, $label)->aspdl)); -} - -test_logistic_loss_equal_bce(); - -sub test_huber_loss -{ - mx->random->seed(1234); - srand(1234); - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, 10]); - my $label = mx->random->uniform(-1, 1, shape=>[$N, 1]); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label', shuffle=>1); - my $output = get_net(1); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->HuberLoss(); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - initializer=>mx->init->Xavier(magnitude=>2), eval_metric=>mx->metric->Loss(), - optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.05); -} - -test_huber_loss(); - -sub 
test_hinge_loss -{ - mx->random->seed(1234); - srand(1234); - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, 10]); - my $label = mx->random->uniform(-1, 1, shape=>[$N, 1]); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label', shuffle=>1); - my $output = get_net(1); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->HingeLoss(); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - initializer=>mx->init->Xavier(magnitude=>2), eval_metric=>mx->metric->Loss(), - optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.05); -} - -test_hinge_loss(); - -sub test_squared_hinge_loss -{ - mx->random->seed(1234); - srand(1234); - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, 10]); - my $label = mx->random->uniform(-1, 1, shape=>[$N, 1]); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>10, label_name=>'label', shuffle=>1); - my $output = get_net(1); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->SquaredHingeLoss(); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - initializer=>mx->init->Xavier(magnitude=>2), eval_metric=>mx->metric->Loss(), - optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.05); -} - -test_squared_hinge_loss(); - -sub test_triplet_loss -{ - mx->random->seed(1234); - srand(1234); - my $N = 20; - my $data = mx->random->uniform(-1, 1, shape=>[$N, 10]); - my $pos = mx->random->uniform(-1, 1, shape=>[$N, 10]); - my $neg = mx->random->uniform(-1, 1, shape=>[$N, 10]); - my $data_iter = mx->io->NDArrayIter($data, Hash::Ordered->new(pos => $pos, neg => $neg), - batch_size=>10, label_name=>'label', shuffle=>1); - my $output = get_net(10); - $pos = mx->symbol->Variable('pos'); - $neg = mx->symbol->Variable('neg'); - my $Loss = gluon->loss->TripletLoss(); - my $loss = $Loss->($output, $pos, $neg); - $loss = mx->sym->make_loss($loss); - local($AI::MXNet::Logging::silent) = 1; - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['pos', 'neg']); - $mod->fit($data_iter, num_epoch=>200, optimizer_params=>{learning_rate => 0.01}, - initializer=>mx->init->Xavier(magnitude=>2), eval_metric=>mx->metric->Loss(), - optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.05); -} - -test_triplet_loss(); - -sub test_cosine_loss -{ - my $input1 = mx->nd->random->randn(3, 2); - my $input2 = mx->nd->random->randn(3, 2); - my $label = mx->nd->sign(mx->nd->random->randn($input1->shape->[0])); - - my $Loss = gluon->loss->CosineEmbeddingLoss(); - my $loss = $Loss->($input1, $input2, $label); - - my $numerator = mx->nd->sum($input1 * $input2, keepdims => 1, axis => 1); - my $denominator = mx->nd->sqrt(mx->nd->sum($input1**2, axis=>1, keepdims=>1)) - * - mx->nd->sqrt(mx->nd->sum($input2**2, axis=>1, keepdims=>1)); - my $pdl_loss = mx->nd->where( - ($label == 1), 1-$numerator/$denominator, - mx->nd->broadcast_maximum(mx->nd->array([0]), $numerator/$denominator, { axis=>1 }) - ); - ok(almost_equal($loss->aspdl, $pdl_loss->aspdl)); -} - 
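For reference, the hand-computed PDL-side check in test_cosine_loss above encodes the cosine-embedding loss exactly as this test exercises it (a sketch of the identity being verified, with labels $y \in \{1, -1\}$; not a statement of the full Gluon API contract):

\[ \operatorname{cos}(x_1, x_2) = \frac{x_1 \cdot x_2}{\lVert x_1 \rVert \, \lVert x_2 \rVert}, \qquad L(x_1, x_2, y) = \begin{cases} 1 - \operatorname{cos}(x_1, x_2) & y = 1 \\ \max\bigl(0, \operatorname{cos}(x_1, x_2)\bigr) & y = -1 \end{cases} \]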
-test_cosine_loss(); - -sub test_poisson_nllloss -{ - my $N = 1000; - mx->random->seed(1234); - srand(1234); - my $data = mx->random->poisson(shape=>[$N, 2]); - my $label = mx->random->poisson(lam=>4, shape=>[$N, 1]); - my $data_iter = mx->io->NDArrayIter($data, $label, batch_size=>20, label_name=>'label', shuffle=>1); - my $output = mx->sym->exp(get_net(1)); - my $l = mx->symbol->Variable('label'); - my $Loss = gluon->loss->PoissonNLLLoss(from_logits=>0); - my $loss = $Loss->($output, $l); - $loss = mx->sym->make_loss($loss); - my $mod = mx->mod->Module($loss, data_names=>['data'], label_names=>['label']); - local($AI::MXNet::Logging::silent) = 1; - $mod->fit($data_iter, num_epoch=>20, optimizer_params=>{learning_rate => 0.01}, - initializer=>mx->init->Normal(sigma=>0.1), eval_metric=>mx->metric->Loss(), - optimizer=>'adam'); - ok($mod->score($data_iter, mx->metric->Loss())->{loss} < 0.05); -} - -test_poisson_nllloss; diff --git a/perl-package/AI-MXNet/t/test_metric.t b/perl-package/AI-MXNet/t/test_metric.t deleted file mode 100644 index 031f2052b780..000000000000 --- a/perl-package/AI-MXNet/t/test_metric.t +++ /dev/null @@ -1,45 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 6; -use JSON::PP; -use AI::MXNet 'mx'; - -sub check_metric -{ - my ($metric, @args) = @_; - $metric = mx->metric->create($metric, @args); - my $str_metric = encode_json($metric->get_config()); - my $metric2 = mx->metric->create($str_metric); - is_deeply($metric->get_config(), $metric2->get_config()); -} - - -sub test_metrics -{ - check_metric('acc', axis=>0); - check_metric('f1'); - check_metric('perplexity', -1); - check_metric('pearsonr'); - check_metric('confidence', 2, [0.5, 0.9]); - my $composite = mx->metric->create(['acc', 'f1']); - check_metric($composite); -} - -test_metrics(); diff --git a/perl-package/AI-MXNet/t/test_model_parallel.t b/perl-package/AI-MXNet/t/test_model_parallel.t deleted file mode 100644 index a9a0cdd3b092..000000000000 --- a/perl-package/AI-MXNet/t/test_model_parallel.t +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 4; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(reldiff); -use AI::MXNet::Base; - -sub test_chain -{ - my $ctx1 = mx->cpu(0); - my $ctx2 = mx->cpu(1); - my $n = 2; - my $data1 = mx->sym->Variable('data1'); - my $data2 = mx->sym->Variable('data2'); - my $data3 = mx->sym->Variable('data3'); - my $net; - { - local($mx::AttrScope) = mx->AttrScope(ctx_group=>'dev1'); - $net = $data1 + $data2; - $net = $net * 3; - } - { - local($mx::AttrScope) = mx->AttrScope(ctx_group=>'dev2'); - $net = $net + $data3; - } - - my $arr = []; - my $arr_grad = []; - my $shape = [4, 5]; - { - local($mx::Context) = $ctx1; - for (0..$n-1) - { - push @$arr, mx->nd->empty($shape); - push @$arr_grad, mx->nd->empty($shape); - } - } - { - local($mx::Context) = $ctx2; - push @$arr, mx->nd->empty($shape); - push @$arr_grad, mx->nd->empty($shape); - } - - my $exec1 = $net->bind( - ctx => $ctx1, - args => $arr, - args_grad => $arr_grad, - group2ctx => { dev1 => $ctx1, dev2 => $ctx2 } - ); - $arr->[0] .= 1; - $arr->[1] .= 2; - $arr->[2] .= 3; - my $arr2 = [map { $_->copyto($ctx1) } @$arr]; - my $arr_grad2 = [map { $_->copyto($ctx1) } @$arr_grad]; - my $exec2 = $net->bind( - ctx => $ctx1, - args => $arr2, - args_grad => $arr_grad2 - ); - - $exec1->forward(1); - $exec2->forward(1); - ok(reldiff($exec1->outputs->[0]->aspdl, $exec2->outputs->[0]->aspdl) < 1e-6); - my $out_grad = mx->nd->empty($shape, ctx => $ctx1); - $out_grad .= 1; - $exec1->backward([$out_grad]); - $exec2->backward([$out_grad->copyto($ctx1)]); - for(zip($arr_grad, $arr_grad2)) { - my ($a, $b) = @$_; - ok(reldiff($a->aspdl, $b->aspdl) < 1e-6); - } -} - -test_chain(); diff --git a/perl-package/AI-MXNet/t/test_module.t b/perl-package/AI-MXNet/t/test_module.t deleted file mode 100644 index 2b5e72463275..000000000000 --- a/perl-package/AI-MXNet/t/test_module.t +++ /dev/null @@ -1,252 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -use strict; -use warnings; -use Test::More tests => 19; -use AI::MXNet qw(mx); -use AI::MXNet::Base; -use AI::MXNet::TestUtils qw(almost_equal enumerate same_array dies_like rand_ndarray); -$ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; -$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; - -sub test_module_layout -{ - my $sym = mx->sym->Variable('data'); - $sym = mx->sym->Activation(data=>$sym, act_type=>'relu', __layout__=>'TNC'); - - my $dshape = [3, 8, 7]; - my $mod = mx->mod->Module( - $sym, - data_names=>['data'], - context=>[mx->cpu(0), mx->cpu(1)] - ); - $mod->bind( - data_shapes=>[mx->io->DataDesc('data', $dshape, layout=>'TNC')] - ); - $mod->init_params(); - $mod->forward( - mx->io->DataBatch( - data=>[mx->nd->ones($dshape)] - ), - is_train => 1 - ); - $mod->backward([mx->nd->ones($dshape)]); - is_deeply($mod->get_outputs()->[0]->shape, $dshape); - - my $hdshape = [3, 4, 7]; - for my $x (@{ $mod->get_outputs(0)->[0] }) - { - is_deeply($x->shape, $hdshape); - } -} - -sub test_save_load -{ - my $dict_equ = sub { - my ($a, $b) = @_; - is_deeply([sort keys %$a], [sort keys %$b]); - for my $k (keys %$a) - { - ok(($a->{$k}->aspdl == $b->{$k}->aspdl)->all); - } - }; - my $sym = mx->sym->Variable('data'); - $sym = mx->sym->FullyConnected($sym, num_hidden=>100); - - # single device - my $mod = mx->mod->Module($sym, data_names=>['data']); - $mod->bind(data_shapes=>[['data', [10, 10]]]); - $mod->init_params(); - $mod->init_optimizer(optimizer_params=>{learning_rate => 0.1, momentum => 0.9}); - $mod->update(); - $mod->save_checkpoint('test', 0, 1); - - my $mod2 = mx->mod->Module->load('test', 0, 1, data_names=>['data']); - $mod2->bind(data_shapes=>[['data', [10, 10]]]); - $mod2->init_optimizer(optimizer_params=>{learning_rate => 0.1, momentum => 0.9}); - is($mod->_symbol->tojson(), $mod2->_symbol->tojson()); - $dict_equ->(($mod->get_params())[0], ($mod2->get_params())[0]); - $dict_equ->($mod->_updater->states, $mod2->_updater->states); - - # multi device - $mod = mx->mod->Module($sym, data_names=>['data'], context=>[mx->cpu(0), mx->cpu(1)]); - $mod->bind(data_shapes=>[['data', [10, 10]]]); - $mod->init_params(); - $mod->init_optimizer(optimizer_params=>{learning_rate => 0.1, momentum => 0.9}); - $mod->update(); - $mod->save_checkpoint('test', 0, 1); - - $mod2 = mx->mod->Module->load('test', 0, 1, data_names=>['data']); - $mod2->bind(data_shapes=>[['data', [10, 10]]]); - $mod2->init_optimizer(optimizer_params=>{learning_rate => 0.1, momentum => 0.9}); - is($mod->_symbol->tojson(), $mod2->_symbol->tojson()); - $dict_equ->(($mod->get_params())[0], ($mod2->get_params())[0]); - $dict_equ->($mod->_kvstore->_updater->states, $mod2->_updater->states); - unlink('test-0000.params'); - unlink('test-0000.states'); - unlink('test-symbol.json'); -} - - -sub test_module_reshape -{ - my $data = mx->sym->Variable('data'); - my $sym = mx->sym->FullyConnected($data, num_hidden=>20, name=>'fc'); - - my $dshape = [7, 20]; - my $mod = mx->mod->Module($sym, data_names=>['data'], context=>[mx->cpu(0), mx->cpu(1)]); - $mod->bind(data_shapes=>[['data', $dshape]]); - $mod->init_params(); - $mod->init_optimizer(optimizer_params=>{learning_rate => 1}); - - $mod->forward( - mx->io->DataBatch( - data=>[mx->nd->ones($dshape)] - ), - is_train => 1 - ); - $mod->backward([mx->nd->ones($dshape)]); - $mod->update(); - is_deeply($mod->get_outputs()->[0]->shape, $dshape); - ok((($mod->get_params())[0]{fc_bias}->aspdl == -1)->all); - - $dshape = [14, 20]; - $mod->reshape(data_shapes=>[['data', $dshape]]); - $mod->forward( - mx->io->DataBatch( - 
data=>[mx->nd->ones($dshape)] - ), - is_train => 1 - ); - $mod->backward([mx->nd->ones($dshape)]); - $mod->update(); - is_deeply($mod->get_outputs()->[0]->shape, $dshape); - ok((($mod->get_params())[0]{fc_bias}->aspdl == -3)->all); -} - - -sub test_module_states -{ - my $stack = mx->rnn->SequentialRNNCell(); - for my $i (0..1) - { - $stack->add(mx->rnn->LSTMCell(num_hidden=>20, prefix=>"lstm_l${i}_")); - } - my $begin_state = $stack->begin_state(func=>mx->sym->can('Variable')); - my (undef, $states) = $stack->unroll(10, begin_state=>$begin_state, inputs=>mx->sym->Variable('data')); - - my $state_names = [map { $_->name } @$begin_state]; - my $mod = mx->mod->Module( - mx->sym->Group($states), context=>[mx->cpu(0), mx->cpu(1)], - state_names=>$state_names - ); - $mod->bind(data_shapes=>[['data', [5, 10]]], for_training=>0); - $mod->init_params(); - my $batch = mx->io->DataBatch(data=>[mx->nd->zeros([5, 10])], label=>[]); - - $mod->set_states(value=>1); - $mod->forward($batch); - my $out = $mod->get_outputs(0); - my $out1 = $mod->get_outputs(1); - - $mod->set_states(states=>$out); - $mod->forward($batch); - my $out2 = $mod->get_outputs(1); - - for(zip($out1, $out2)) { - my ($x1, $x2) = @$_; - ok(not almost_equal($x1->aspdl, $x2->aspdl, 1e-3)); - } -} - -sub test_module_dtype -{ - my $dtype = 'float16'; - my $dshape = [3, 8, 7]; - - my $sym = mx->sym->Variable('data'); - $sym = mx->sym->Activation(data=>$sym, act_type=>'relu', __layout__=>'TNC'); - - my $mod = mx->mod->Module($sym, data_names=>['data'], context => [mx->cpu(0), mx->cpu(1)]); - $mod->bind(data_shapes=>[ - mx->io->DataDesc('data', $dshape, dtype => $dtype, layout=>'TNC') - ]); - $mod->init_params(); - $mod->forward( - mx->io->DataBatch( - data=>[mx->nd->ones($dshape, dtype=>$dtype)] - ) - ); - $mod->backward([mx->nd->ones($dshape, dtype=>$dtype)]); - - for my $x (@{ $mod->get_outputs() }) - { - is($x->dtype, $dtype); - } -} - -sub test_module_input_grads -{ - my $a = mx->sym->Variable('a', __layout__=>'NC'); - my $b = mx->sym->Variable('b', __layout__=>'NC'); - my $c = mx->sym->Variable('c', __layout__=>'NC'); - - $c = $a + 2 * $b + 3 * $c; - my $net = mx->mod->Module( - $c, data_names=>['b', 'c', 'a'], - context=>[mx->cpu(0), mx->cpu(1)] - ); - $net->bind( - data_shapes => [['b', [5, 5]], ['c', [5, 5]], ['a', [5, 5]]], - inputs_need_grad => 1 - ); - $net->init_params(); - - $net->forward( - mx->io->DataBatch(data => [ - mx->nd->ones([5, 5]), - mx->nd->ones([5, 5]), - mx->nd->ones([5, 5]) - ]) - ); - $net->backward([mx->nd->ones([5, 5])]); - my $input_grads = $net->get_input_grads(); - my $b_grad = $input_grads->[0]->aspdl; - my $c_grad = $input_grads->[1]->aspdl; - my $a_grad = $input_grads->[2]->aspdl; - ok(($a_grad == 1)->all); - ok(($b_grad == 2)->all); - ok(($c_grad == 3)->all); -} - -sub test_forward_acceptable_input -{ - my $data = mx->sym->Variable('data'); - my $out = $data * 2; - my $mod = mx->mod->Module(symbol => $out); - $mod->bind(data_shapes => [['data', [1, 10]]]); - $mod->init_params(); - is_deeply($mod->predict(mx->nd->ones([1, 10]))->shape, [1, 10]); - is_deeply($mod->predict(mx->nd->ones([1, 10])->aspdl)->shape, [1, 10]); -} - -test_module_input_grads(); -test_module_dtype(); -test_module_layout(); -test_module_states(); -test_save_load(); -test_forward_acceptable_input(); diff --git a/perl-package/AI-MXNet/t/test_ndarray.t b/perl-package/AI-MXNet/t/test_ndarray.t deleted file mode 100644 index c71ce15b66d1..000000000000 --- a/perl-package/AI-MXNet/t/test_ndarray.t +++ /dev/null @@ -1,275 +0,0 @@ -# Licensed 
to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(almost_equal same rand_ndarray randint zip); -use Test::More tests => 280; -use PDL; -use File::Temp qw(tempdir); -use IO::File; - -sub test_ndarray_reshape -{ - my $tensor = (mx->nd->arange(stop => 30) + 1)->reshape([2, 3, 5]); - my $true_res = mx->nd->arange(stop => 30) + 1; - ok(same($tensor->reshape([-1])->aspdl, $true_res->aspdl)); - ok(same($tensor->reshape([2, -1])->aspdl, $true_res->reshape([2, 15])->aspdl)); - ok(same($tensor->reshape([0, -1])->aspdl, $true_res->reshape([2, 15])->aspdl)); - ok(same($tensor->reshape([-1, 2])->aspdl, $true_res->reshape([15, 2])->aspdl)); - ok(same($tensor->reshape([6, 5])->aspdl, $true_res->reshape([6, 5])->aspdl)); - ok(same($tensor->reshape([30])->aspdl, $true_res->aspdl)); - ok(same($tensor->reshape([-1, 6])->aspdl, $true_res->reshape([5, 6])->aspdl)); - ok(same($tensor->reshape([-2])->aspdl, $true_res->reshape([2, 3, 5])->aspdl)); - ok(same($tensor->reshape([-3, -1])->aspdl, $true_res->reshape([6, 5])->aspdl)); - ok(same($tensor->reshape([-1, 15])->reshape([0, -4, 3, -1])->aspdl, $true_res->reshape([2, 3, 5])->aspdl)); - ok(same($tensor->reshape([-1, 0])->aspdl, $true_res->reshape([10, 3])->aspdl)); - ok(same($tensor->reshape([-1, 0], reverse=>1)->aspdl, $true_res->reshape([6, 5])->aspdl)); -} - -sub test_moveaxis -{ - my $X = mx->nd->array([[[1, 2, 3], [4, 5, 6]], - [[7, 8, 9], [10, 11, 12]]]); - my $res = $X->moveaxis(0, 2)->aspdl; - my $true_res = mx->nd->array([[[ 1., 7.], - [ 2., 8.], - [ 3., 9.]], - [[ 4., 10.], - [ 5., 11.], - [ 6., 12.]]]); - is_deeply($res->unpdl, $true_res->aspdl->unpdl); - is_deeply($X->moveaxis(2, 0)->shape, [3, 2, 2]); -} - - -sub test_output -{ - my $shape = [2,2]; - my $ones = mx->nd->ones($shape); - my $zeros = mx->nd->zeros($shape); - my $out = mx->nd->zeros($shape); - mx->nd->ones($shape, out=>$out); - ok(almost_equal($out->aspdl, $ones->aspdl)); - mx->nd->zeros($shape, out=>$out); - ok(almost_equal($out->aspdl, $zeros->aspdl)); - mx->nd->full($shape, 2, out=>$out); - ok(almost_equal($out->aspdl, $ones->aspdl * 2)); -} - -sub test_cached -{ - my $sym = mx->sym->Convolution(kernel=>[3, 3], num_filter=>10) + 2; - my $op = mx->nd->CachedOp($sym); - my $data = mx->nd->ones([3, 4, 10, 10]); - my $weight = mx->nd->ones([10, 4, 3, 3]); - my $bias = mx->nd->ones([10]); - my $o1 = $op->($data, $weight, $bias); - $bias .= 2; - my $o2 = $op->($data, $weight, $bias); - ok(almost_equal($o2->aspdl, $o1->aspdl+1)); - $o2 .= 0; - $op->($data, $weight, $bias, out=>$o2); - ok(almost_equal($o2->aspdl, $o1->aspdl+1)); - - $weight->attach_grad(); - $bias->attach_grad(); - my $o; - mx->autograd->record(sub { - $bias = $bias + 1; - $o = $op->($data, $weight, $bias); - $o = $o * 2; - 
$o->backward(); - }); - - mx->autograd->record(sub { - $bias = $bias + 1; - $o = $op->($data, $weight, $bias); - $o = $o * 2; - $o->backward(retain_graph=>1); - $o->backward(); - }); - - # try a different shape - $data = mx->nd->ones([5, 2, 10, 10]); - $weight = mx->nd->ones([10, 2, 3, 3]); - $bias = mx->nd->ones([10]); - $data->attach_grad; - - mx->autograd->record(sub { - $bias = $bias + 1; - $o = $op->($data, $weight, $bias); - $o = $o * 2; - $o->backward(); - }); -} - -sub test_ndarray_slice -{ - my $shape = [10]; - my $A = mx->random->uniform(-10, 10, $shape); - my $A2 = $A->aspdl; - ok(same($A->slice([3,7])->aspdl, $A2->slice([3, 7]))); - $A2->slice([3, 7]) *= 10; - $A->slice([3,7]) .= $A2->slice([3, 7]); - ok(same($A->slice([3,7])->aspdl, $A2->slice([3, 7]))); - - $shape = [3,4,5,6,7]; - $A = mx->nd->random->uniform(shape=>$shape); - $A2 = $A->aspdl; - - ok(same($A->slice([1], [3,3], 'X', [1,4], 'X')->aspdl, $A2->slice('X', [1,4], 'X', [3,3], [1]))); - ok(($A->slice([1], [3,3], 'X', [1,4], 'X') == mx->nd->array($A2->slice('X', [1,4], 'X', [3,3], [1])))->aspdl->all); - - ok($A->slice(1,2,3,4,5)->asscalar() == $A2->at(5, 4, 3, 2, 1)); - - my $a = mx->nd->array([[0, 1], [2, 3]]); - ok(($a->slice([[1, 1, 0], [0, 1, 0]])->aspdl == mx->nd->array([2, 3, 0])->aspdl)->all); - ok(($a->slice([mx->nd->array([1, 1, 0]), mx->nd->array([0, 1, 0])])->aspdl == mx->nd->array([2, 3, 0])->aspdl)->all); -} - -sub test_linalg_gemm2 -{ - # Single matrix multiply - my $A = mx->nd->array([[1.0, 1.0], [1.0, 1.0]]); - my $B = mx->nd->array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]); - ok(almost_equal( - mx->nd->linalg->gemm2($A, $B, transpose_b=>1, alpha=>2.0)->aspdl, - pdl([[4.0, 4.0, 4.0], [4.0, 4.0, 4.0]]) - )); - - # Batch matrix multiply - $A = mx->nd->array([[[1.0, 1.0]], [[0.1, 0.1]]]); - $B = mx->nd->array([[[1.0, 1.0]], [[0.1, 0.1]]]); - ok(almost_equal( - mx->nd->linalg->gemm2($A, $B, transpose_b=>1, alpha=>2.0)->aspdl, - pdl([[[4.0]], [[0.04]]]) - )); -} - -sub test_image_to_tensor -{ - ok( - same( - mx->nd->image->to_tensor(mx->nd->zeros([28, 28, 3]))->aspdl, - zeros(28, 28, 3) - ) - ); -} - -sub test_buffer_load -{ - my $nrepeat = 10; - my $tmpdir = tempdir(CLEANUP => 1); - for my $repeat (1..$nrepeat) - { - # test load_buffer as list - my @data; - for(1..10) - { - push @data, rand_ndarray([randint(1, 5)], 'default'); - } - my $fname = "$tmpdir/list_$repeat.param"; - mx->nd->save($fname, \@data); - my $buf_data = join('',IO::File->new($fname)->getlines); - my $data2 = mx->nd->load_frombuffer($buf_data); - ok(@data == @$data2); - zip(sub { - my ($x, $y) = @_; - ok(same($x->aspdl, $y->aspdl)); - }, \@data, $data2); - # test load_buffer as hash - my $i = 0; - my %hash = map { 'ndarray xx '.$i++ => $_ } @data; - $fname = "$tmpdir/hash_$repeat.param"; - mx->nd->save($fname, \%hash); - $buf_data = join('',IO::File->new($fname)->getlines); - my $hash2 = mx->nd->load_frombuffer($buf_data); - ok(keys %hash == keys %$hash2); - while(my ($k, $v) = each %hash) - { - ok(same($v->aspdl, $hash2->{$k}->aspdl)); - } - } -} - -sub test_histogram -{ - my $z = mx->nd->array([0..99]); - my $b = mx->nd->array([10, 20, 30, 60]); - my ($hist, $bins) = @{ mx->nd->histogram($z, bins => $b) }; - ok(same($hist->aspdl, pdl([10, 10, 31]))); - ok(same($bins->aspdl, pdl([10, 20, 30, 60]))); -} - -sub test_overload -{ - # much of this depends on PDL being sane as well. 
- my $px = PDL->new([ 2, -5, 11, 17 ]) / 2; - my $nx = mx->nd->array($px, dtype => 'float64'); - my $py = PDL->new([ -3, 7, 13, 19 ]) / 4; - my $ny = mx->nd->array($py, dtype => 'float64'); - - ok(same(($nx + $ny)->aspdl(), $px + $py), 'overloaded add'); - ok(same(($nx - $ny)->aspdl(), $px - $py), 'overloaded sub'); - ok(same(($nx * $ny)->aspdl(), $px * $py), 'overloaded mul'); - ok(same(($nx / $ny)->aspdl(), $px / $py), 'overloaded div'); - ok(same(($nx % $ny)->aspdl(), $px % $py), 'overloaded mod'); - ok(same(($nx ** $ny)->aspdl(), $px ** $py), 'overloaded pow'); - ok(same(($nx->copy() += $ny)->aspdl(), $px + $py), 'inplace add'); - ok(same(($nx->copy() -= $ny)->aspdl(), $px - $py), 'inplace sub'); - ok(same(($nx->copy() *= $ny)->aspdl(), $px * $py), 'inplace mul'); - ok(same(($nx->copy() /= $ny)->aspdl(), $px / $py), 'inplace div'); - ok(same(($nx->copy() %= $ny)->aspdl(), $px % $py), 'inplace mod'); - ok(same(($nx->copy() **= $ny)->aspdl(), $px ** $py), 'inplace pow'); - ok(same(cos($nx)->aspdl(), cos($px)), 'overloaded cos'); - ok(same(sin($nx)->aspdl(), sin($px)), 'overloaded sin'); - ok(same(exp($nx)->aspdl(), exp($px)), 'overloaded exp'); - ok(same(abs($nx)->aspdl(), abs($px)), 'overloaded abs'); - ok(same(log($nx)->aspdl(), log($px)), 'overloaded log'); - ok(same(sqrt($nx)->aspdl(), sqrt($px)), 'overloaded sqrt'); - ok(same(atan2($nx, 1.0)->aspdl(), atan2($px, 1.0)), 'overloaded atan2'); -} - -sub test_array_overload -{ - # array conversions are largely calls to mx->nd->split(), but have - # special cases around dimensions of length 0 and 1. - is_deeply([ @{ mx->nd->array(zeros(7, 0)) } ], []); - is_deeply(mx->nd->zeros([3, 7])->[0]->shape, [ 7 ]); - is_deeply(mx->nd->zeros([2, 7])->[0]->shape, [ 7 ]); - is_deeply(mx->nd->zeros([1, 7])->[0]->shape, [ 7 ]); - is_deeply(mx->nd->zeros([3, 7, 11])->[0]->shape, [7, 11]); - is_deeply(mx->nd->zeros([2, 7, 11])->[0]->shape, [7, 11]); - is_deeply(mx->nd->zeros([1, 7, 11])->[0]->shape, [7, 11]); - is_deeply(mx->nd->zeros([3, 7, 11, 13])->[0]->shape, [7, 11, 13]); - is_deeply(mx->nd->zeros([2, 7, 11, 13])->[0]->shape, [7, 11, 13]); - is_deeply(mx->nd->zeros([1, 7, 11, 13])->[0]->shape, [7, 11, 13]); -} - -test_ndarray_slice(); -test_ndarray_reshape(); -test_moveaxis(); -test_output(); -test_cached(); -test_linalg_gemm2(); -test_image_to_tensor(); -test_buffer_load(); -test_histogram(); -test_overload(); -test_array_overload(); diff --git a/perl-package/AI-MXNet/t/test_optimizers.t b/perl-package/AI-MXNet/t/test_optimizers.t deleted file mode 100644 index 26a87cdd75ba..000000000000 --- a/perl-package/AI-MXNet/t/test_optimizers.t +++ /dev/null @@ -1,1105 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
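For orientation before the reference implementations: the PerlAdam package below applies the standard Adam update, which (with $t$ the per-index update count, $g_t$ the rescaled gradient including the weight-decay term, and the bias-correction factors appearing in the code as $coef1$ and $coef2$) can be written as:

\[ m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t, \qquad v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2, \qquad w \leftarrow w - \eta \, \frac{\sqrt{1 - \beta_2^t}}{1 - \beta_1^t} \cdot \frac{m_t}{\sqrt{v_t} + \epsilon} \]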
- -package PerlAdam; -use strict; -use warnings; -use AI::MXNet qw(mx); -use Mouse; -use AI::MXNet::Function::Parameters; -extends 'AI::MXNet::Optimizer'; -has 'beta1' => (is => 'rw', default => 0.9); -has 'beta2' => (is => 'rw', default => 0.999); -has 'epsilon' => (is => 'rw', default => 1e-8); -has 'rescale_grad' => (is => 'rw', default => 1); -has 'decay_factor' => (is => 'rw', default => (1-1e-8)); -around BUILDARGS => \&init; - -func init($code, $class, %kwargs) -{ - return $class->$code(learning_rate => 0.001, wd => 0.9, %kwargs); -} - -=begin - Create additional optimizer state: mean, variance - - Parameters - ---------- - weight : NDArray - The weight data -=cut - -method create_state($index, $weight) -{ - return [ - mx->nd->zeros($weight->shape, ctx => $weight->context, dtype => $weight->dtype), # mean - mx->nd->zeros($weight->shape, ctx => $weight->context, dtype => $weight->dtype) # variance - ]; -} - -=begin - Update the parameters. - - Parameters - ---------- - index : int - A unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. -=cut - -method update($index, $weight, $grad, $state) -{ - my $lr = $self->_get_lr($index); - $self->_update_count($index); - my $t = $self->_index_update_count->{$index}; - my ($mean, $variance) = @$state; - my $wd = $self->_get_wd($index); - $grad = $grad * $self->rescale_grad; - if($self->clip_gradient) - { - mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient, out => $grad); - } - $grad += $wd * $weight; - $mean *= $self->beta1; - $mean += $grad * (1 - $self->beta1); - - $variance *= $self->beta2; - $variance += (1 - $self->beta2) * mx->nd->square($grad, out => $grad); - - my $coef1 = 1 - $self->beta1**$t; - my $coef2 = 1 - $self->beta2**$t; - $lr *= sqrt($coef2)/$coef1; - $weight -= $lr*$mean/(mx->nd->sqrt($variance) + $self->epsilon); -} - -=head - - RMSProp optimizer of Tieleman & Hinton, 2012, - - For centered=False, the code follows the version in - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by - Tieleman & Hinton, 2012 - - For centered=True, the code follows the version in - http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. - - Parameters - ---------- - learning_rate : float, optional - Step size. - Default value is set to 0.001. - rho: float, optional - decay factor of moving average for gradient, gradient^2. - Default value is set to 0.9. - momentum: float, optional - Default value is set to 0.9. - Only used if centered=True - epsilon : float, optional - Default value is set to 1e-8. - centered : boolean, optional - Use Graves' or Tieleman & Hinton's version of RMSProp - wd : float, optional - L2 regularization coefficient added to all the weights - rescale_grad : float, optional - rescaling factor of gradient. 
- clip_gradient : float, optional - clip gradient in range [-clip_gradient, clip_gradient] - clip_weights : float, optional - clip weights in range [-clip_weights, clip_weights] -=cut - -package PerlRMSProp; -use Mouse; -extends 'AI::MXNet::Optimizer'; -has '+learning_rate' => (default => 0.001); -has 'rho' => (is => "ro", isa => "Num", default => 0.9); -has 'momentum' => (is => "ro", isa => "Num", default => 0.9); -has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8); -has 'centered' => (is => "ro", isa => "Bool", default => 0); -has 'clip_weights' => (is => "ro", isa => "Num"); - -# For centered=False: n -# For centered=True: n, g, delta -method create_state(Index $index, AI::MXNet::NDArray $weight) -{ - return [ - $self->centered - ? ( - AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context - ), # n - AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context - ), # g - AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context - ) - ) # delta - : ( - AI::MXNet::NDArray->zeros( - $weight->shape, - ctx => $weight->context - ), # n - ) - ]; -} - -method update($index, $weight, $grad, $state) -{ - my $lr = $self->_get_lr($index); - my $wd = $self->_get_wd($index); - $self->_update_count($index); - $grad = $grad * $self->rescale_grad; - if(not $self->centered) - { - my ($n) = @$state; - if(defined $self->clip_gradient) - { - $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); - } - $grad += $wd * $weight; - $n .= (1 - $self->rho) * ($grad * $grad) + $self->rho * $n; - $weight -= $lr * $grad/(mx->nd->sqrt($n) + $self->epsilon); - } - else - { - my ($n, $g, $delta) = @$state; - if(defined $self->clip_gradient) - { - $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); - } - $grad += $wd * $weight; - $n .= (1 - $self->rho) * ($grad * $grad) + $self->rho * $n; - $g .= (1 - $self->rho) * $grad + $self->rho * $g; - $delta .= ($self->momentum) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g + $self->epsilon)); - $weight += $delta; - } - if($self->clip_weights) - { - mx->nd->clip($weight, -$self->clip_weights, $self->clip_weights, out => $weight); - } -} - -package PerlSGD; -# perl reference implementation of sgd -use Mouse; -extends 'AI::MXNet::Optimizer'; -has '+learning_rate' => (default => 0.01); -has 'momentum' => (is => "ro", isa => "Num", default => 0); -has 'multi_precision' => (is => 'ro', isa => 'Bool', default => 0); - -# Create additional optimizer state: momentum -method create_state(Index $index, AI::MXNet::NDArray $weight) -{ - my $momentum; - my $weight_master_copy; - my $do_multi_precision = ($self->multi_precision and $weight->dtype eq 'float16'); - if($do_multi_precision) - { - if($self->momentum != 0) - { - $momentum = mx->nd->zeros($weight->shape, ctx => $weight->context, dtype=>'float32'); - } - $weight_master_copy = mx->nd->array($weight, ctx=>$weight->context, dtype=>'float32'); - return [$momentum, $weight_master_copy]; - } - else - { - if($self->momentum != 0) - { - $momentum = mx->nd->zeros($weight->shape, ctx => $weight->context, dtype => $weight->dtype); - } - } - return $momentum; -} - -method update($index, $weight, $grad, $state) -{ - my $lr = $self->_get_lr($index); - my $wd = $self->_get_wd($index); - $self->_update_count($index); - my $use_multi_precision = ref($state) eq 'ARRAY'; - - if(not $use_multi_precision) - { - if($self->momentum == 0) - { - if(defined $self->clip_gradient) - { - $weight .= ((1 - $lr*$wd)*$weight - - $lr * mx->nd->clip($grad*$self->rescale_grad, 
-$self->clip_gradient, $self->clip_gradient) - ); - } - else - { - $weight .= (1 - $lr*$wd)*$weight - $lr*$self->rescale_grad*$grad; - } - } - else - { - my $mom = $state; - if(defined $self->clip_gradient) - { - $mom .= ($self->momentum*$mom - $lr*$wd*$weight - - $lr * mx->nd->clip($grad*$self->rescale_grad, -$self->clip_gradient, $self->clip_gradient) - ); - $weight += $mom; - } - else - { - $mom .= $self->momentum*$mom - $lr*$wd*$weight - $lr*$self->rescale_grad*$grad; - $weight += $mom; - } - } - } - else - { - my $grad32 = mx->nd->array($grad, ctx=>$grad->context, dtype=>'float32'); - my $mom = $state->[0]; - my $weight32 = $state->[1]; - if($self->momentum == 0) - { - if(defined $self->clip_gradient) - { - $weight32 .= ((1 - $lr*$wd)*$weight32 - - $lr * mx->nd->clip($grad32*$self->rescale_grad, -$self->clip_gradient, $self->clip_gradient) - ); - } - else - { - $weight32 .= (1 - $lr*$wd)*$weight32 - $lr*$self->rescale_grad*$grad32; - } - } - else - { - if(defined $self->clip_gradient) - { - $mom .= ($self->momentum*$mom - $lr*$wd*$weight32 - - $lr * mx->nd->clip($grad32*$self->rescale_grad, -$self->clip_gradient, $self->clip_gradient) - ); - $weight32 += $mom; - } - else - { - $mom .= $self->momentum*$mom - $lr*$wd*$weight32 - $lr*$self->rescale_grad*$grad32; - $weight32 += $mom; - } - } - my $tmp = $weight32->astype($weight->dtype); - $tmp->copyto($weight); - } -} - -package PerlSparseSGD; -# perl reference implementation of sgd -use Mouse; -use AI::MXNet::TestUtils qw(almost_equal); -extends 'AI::MXNet::Optimizer'; -has '+learning_rate' => (default => 0.01); -has 'momentum' => (is => "ro", isa => "Num", default => 0); -has 'multi_precision' => (is => 'ro', isa => 'Bool', default => 0); -has 'lazy_update' => (is => 'ro', isa => 'Bool', default => 1); - -method create_state($index, $weight) -{ - if($self->momentum == 0) - { - return undef; - } - else - { - return mx->nd->zeros($weight->shape, ctx => $weight->context, dtype => $weight->dtype); - } -} - -method update($index, $weight, $grad, $state) -{ - my $lr = $self->_get_lr($index); - my $wd = $self->_get_wd($index); - $self->_update_count($index); - my $num_rows = $weight->shape->[0]; - if($self->momentum == 0) - { - # Update on a per row basis, skip all-zero rows - for my $row (0..$num_rows-1) - { - my $grad_row = $grad->at($row); - my $all_zeros = almost_equal($grad_row->aspdl, mx->nd->zeros($grad_row->shape, ctx => $grad_row->context, dtype => $grad_row->dtype)->aspdl); - if($all_zeros and $self->lazy_update) - { - next; - } - if(defined $self->clip_gradient) - { - $weight->at($row) .= ( - (1 - $lr*$wd)*$weight->at($row) - - $lr * mx->nd->clip( - $grad->at($row)*$self->rescale_grad, - -$self->clip_gradient, $self->clip_gradient - ) - ); - } - else - { - $weight->at($row) .= (1 - $lr*$wd)*$weight->at($row) - $lr*$self->rescale_grad*$grad->at($row); - } - } - } - else - { - my $mom = $state; - for my $row (0..$num_rows-1) - { - my $grad_row = $grad->at($row); - my $all_zeros = almost_equal($grad_row->aspdl, mx->nd->zeros($grad_row->shape, ctx => $grad_row->context, dtype => $grad_row->dtype)->aspdl); - if($all_zeros and $self->lazy_update) - { - next; - } - if(defined $self->clip_gradient) - { - $mom->at($row) .= ($self->momentum*$mom->at($row) - $lr*$wd*$weight->at($row) - - $lr * mx->nd->clip($grad->at($row)*$self->rescale_grad, -$self->clip_gradient, $self->clip_gradient) - ); - $weight->at($row) += $mom->at($row); - } - else - { - $mom->at($row) .= $self->momentum*$mom->at($row) - $lr*$wd*$weight->at($row) - 
$lr*$self->rescale_grad*$grad->at($row); - $weight->at($row) += $mom->at($row); - } - } - } - -} - -package PerlNAG; -use Mouse; -extends 'PerlSGD'; - -method create_state($index, $weight) -{ - my $momentum; - my $weight_master_copy; - my $do_multi_precision = ($self->multi_precision and $weight->dtype eq 'float16'); - if($do_multi_precision) - { - if($self->momentum != 0) - { - $momentum = mx->nd->zeros($weight->shape, ctx => $weight->context, dtype=>'float32'); - } - $weight_master_copy = mx->nd->array($weight, ctx=>$weight->context, dtype=>'float32'); - return [$weight_master_copy, $momentum]; - } - else - { - if($self->momentum != 0) - { - $momentum = mx->nd->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype); - } - return $momentum; - } -} - -method update($index, $weight, $grad, $state) -{ - my $lr = $self->_get_lr($index); - my $wd = $self->_get_wd($index); - $self->_update_count($index); - my $use_multi_precision = (defined $state and not Scalar::Util::blessed($state) and ref($state eq 'ARRAY')); - if(not $use_multi_precision) - { - $grad *= $self->rescale_grad; - if(defined $self->clip_gradient) - { - $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); - } - if($self->momentum == 0) - { - $weight += -$lr * ($grad + $wd * $weight); - } - else - { - $grad += $wd * $weight; - my $mom = $state; - $mom *= $self->momentum; - $mom -= $lr * $grad; - $grad *= -$lr; - $grad += $self->momentum * $mom; - $weight += $grad; - } - } - else - { - my $grad32 = mx->nd->array($grad, ctx=>$grad->context, dtype=>'float32'); - $grad32 *= $self->rescale_grad; - if(defined $self->clip_gradient) - { - $grad32 = mx->nd->clip($grad32, -$self->clip_gradient, $self->clip_gradient); - } - my $mom = $state->[1]; - my $weight32 = $state->[0]; - if($self->momentum == 0) - { - $weight32 += -$lr * ($grad32 + $wd * $weight32); - } - else - { - $grad32 += $wd * $weight32; - $mom *= $self->momentum; - $mom -= $lr * $grad32; - $grad32 *= -$lr; - $grad32 += $self->momentum * $mom; - $weight32 += $grad32; - } - my $tmp = $weight32->astype($weight->dtype); - $tmp->copyto($weight); - } -} - -package PerlFTML; -use Mouse; -extends 'AI::MXNet::Optimizer'; -has 'beta1' => (is => 'rw', default => 0.6); -has 'beta2' => (is => 'rw', default => 0.999); -has 'epsilon' => (is => 'rw', default => 1e-8); - -method create_state($index, $weight) -{ - return [mx->nd->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype), # d_0 - mx->nd->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype), # v_0 - mx->nd->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype)] # z_0 -} - -method update($index, $weight, $grad, $state) -{ - $self->_update_count($index); - my $lr = $self->_get_lr($index); - my $wd = $self->_get_wd($index); - my $t = $self->_index_update_count->{$index}; - - my $grad = $grad * $self->rescale_grad; - if(defined $self->clip_gradient) - { - $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); - } - $grad += $wd * $weight; - # get previous states - my ($prev_d, $prev_v, $prev_z) = @{ $state }; - # compute states - my $v_t = $self->beta2 * $prev_v + (1 - $self->beta2) * mx->nd->square($grad); - my $d_t = (1 - ($self->beta1**$t)) / $lr * (mx->nd->sqrt($v_t / (1 - ($self->beta2**$t))) + $self->epsilon); - my $sigma_t = $d_t - $self->beta1 * $prev_d; - my $z_t = $self->beta1 * $prev_z + (1 - $self->beta1) * $grad - $sigma_t * $weight; - # update weight - $weight .= - $z_t / $d_t; - # update states - $prev_d .= $d_t; - 
$prev_v .= $v_t; - $prev_z .= $z_t; -} - -package PerlSignum; -use Mouse; -extends 'AI::MXNet::Optimizer'; -has 'wd_lh' => (is => 'rw', default => 0); -has 'momentum' => (is => 'rw', default => 0.9); - -method create_state($index, $weight) -{ - if($self->momentum != 0) - { - return mx->nd->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype, stype=>$weight->stype); - } - return undef; -} - -method update($index, $weight, $grad, $state) -{ - $self->_update_count($index); - my $lr = $self->_get_lr($index); - my $wd = $self->_get_wd($index); - if(defined $state) - { - my $mom = $state; - if(defined $self->clip_gradient) - { - $mom .= ($self->momentum*$mom - (1-$self->momentum)*($wd*$weight + - mx->nd->clip($grad*$self->rescale_grad, -$self->clip_gradient, $self->clip_gradient))); - } - else - { - $mom .= $self->momentum*$mom - (1-$self->momentum)*$wd*$weight - (1-$self->momentum)*$self->rescale_grad*$grad; - } - $weight .= (1 - $lr*$self->wd_lh)*$weight + $lr * mx->nd->sign($mom); - } - else - { - $weight .= (1 - $lr*($wd+$self->wd_lh))*$weight - $lr * mx->nd->sign($grad); - } -} - -package PerlFtrl; -use Mouse; -use AI::MXNet::TestUtils qw(almost_equal); -extends 'AI::MXNet::Optimizer'; - -has 'lamda1' => (is => 'rw', default => 0.01); -has '+learning_rate' => (default => 0.1); -has 'beta' => (is => 'rw', default => 1); -has 'sparse_update' => (is => 'rw', default => 0); - -method create_state($index, $weight) -{ - return [ - mx->nd->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype), # dn - mx->nd->zeros($weight->shape, ctx => $weight->context, dtype=>$weight->dtype) # n - ]; -} - -method update($index, $weight, $grad, $state) -{ - $self->_update_count($index); - my $wd = $self->_get_wd($index); - my $lr = $self->_get_lr($index); - my $num_rows = $weight->shape->[0]; - - my ($dn, $n) = @$state; - for my $row (0..$num_rows-1) - { - my $grad_row = $grad->at($row); - my $all_zeros = almost_equal($grad_row->aspdl, mx->nd->zeros($grad_row->shape, ctx => $grad_row->context, dtype => $grad_row->dtype)->aspdl); - if($all_zeros and $self->sparse_update) - { - next; - } - $grad_row *= $self->rescale_grad; - if(defined $self->clip_gradient) - { - $grad_row .= mx->nd->clip($grad_row, -$self->clip_gradient, $self->clip_gradient); - } - - #update dn, n - $dn->at($row) += $grad_row - (mx->nd->sqrt($n->at($row) + $grad_row * $grad_row) - mx->nd->sqrt($n->at($row))) * $weight->at($row) / $lr; - $n->at($row) += $grad_row * $grad_row; - - # update weight - $weight->at($row) .= - mx->nd->sign($dn->at($row)) * (mx->nd->abs($dn->at($row)) - $self->lamda1)->maximum(0) / - (($self->beta + mx->nd->sqrt($n->at($row))) / $lr + $wd); - } -} - -package PerlAdaGrad; -use Mouse; -extends 'AI::MXNet::Optimizer'; - -has 'epsilon' => (is => 'rw', default => 1e-7); -method create_state($index, $weight) -{ - mx->nd->zeros($weight->shape, ctx => $weight->context, stype => $weight->stype); -} - -method update($index, $weight, $grad, $state) -{ - $self->_update_count($index); - my $wd = $self->_get_wd($index); - my $lr = $self->_get_lr($index); - my $num_rows = $weight->shape->[0]; - my $history = $state; - $grad *= $self->rescale_grad; - if(defined $self->clip_gradient) - { - $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); - } - $grad += $wd * $weight; - $history += mx->nd->square($grad); - my $div = $grad / (mx->nd->sqrt($history) + $self->epsilon); - $weight -= $lr * $div; -} - -package main; -use Carp; -use Test::More tests => 7992; -use AI::MXNet::Base; -use 
PDL::NiceSlice; -use AI::MXNet::TestUtils qw(same reldiff almost_equal rand_ndarray); -use AI::MXNet::Function::Parameters; - -func compare_optimizer($opt1, $opt2, $shape, $dtype, $w_stype='default', $g_stype='default') -{ - my ($w1, $w2, $g1, $g2); - if($w_stype eq 'default') - { - $w1 = mx->random->uniform({shape => $shape, ctx => mx->cpu, dtype=>$dtype}); - $w2 = $w1->copyto(mx->cpu()); - } - elsif($w_stype eq 'row_sparse' or $w_stype eq 'csr') - { - $w2 = rand_ndarray($shape, $w_stype, 1, $dtype); - $w1 = $w2->copyto(mx->cpu())->tostype('default'); - } - else - { - Carp::confess("type not supported yet"); - } - if($g_stype eq 'default') - { - $g2 = mx->random->uniform(shape=>$shape, ctx=>mx->cpu, dtype=>$dtype); - $g1 = $g2->copyto(mx->cpu); - } - elsif($g_stype eq 'row_sparse' or $g_stype eq 'csr') - { - $g2 = rand_ndarray($shape, $g_stype, rand(), $dtype); - $g1 = $g2->copyto(mx->cpu)->tostype('default'); - } - else - { - Carp::confess("type not supported yet"); - } - - my $state1 = $opt1->create_state(0, $w1); - my $state2 = $opt2->create_state(0, $w2); - zip( - sub { - my ($s1, $s2) = @_; - ok(same($s1->aspdl, $s2->aspdl)) if defined $s1 and defined $s2; - }, - ref $state1 eq 'ARRAY' ? $state1 : [$state1], ref $state2 eq 'ARRAY' ? $state2 : [$state2] - ) if defined $state1 and defined $state2; - - $opt1->update(0, $w1, $g1, $state1); - $opt2->update(0, $w2, $g2, $state2); - zip( - sub { - my ($s1, $s2) = @_; - ok(reldiff($s1->aspdl, $s2->aspdl) < 1e-5) if defined $s1 and defined $s2; - }, - ref $state1 eq 'ARRAY' ? $state1 : [$state1], ref $state2 eq 'ARRAY' ? $state2 : [$state2] - ) if defined $state1 and defined $state2; - ok(reldiff($w1->aspdl, $w2->aspdl) < 1e-5); -} - -func test_adam() -{ - mx->random->seed(0); - my $opt1 = 'PerlAdam'; - my $opt2 = 'AI::MXNet::Adam'; - my $shape = [3, 4, 5]; - my @kwargs = ({}, - {'clip_gradient'=> 0.5}, - {'clip_gradient'=> 0.1}, - {'rescale_grad'=> 0.1}); - for my $kwarg (@kwargs) - { - compare_optimizer($opt1->new(%$kwarg), $opt2->new(wd => 0.9, %$kwarg), $shape, 'float32'); - } -} - -func test_rms() -{ - mx->random->seed(0); - my $opt1 = 'PerlRMSProp'; - my $opt2 = 'AI::MXNet::RMSProp'; - my $shape = [3, 4, 5]; - my @kwargs = ({}, - {clip_gradient => 0.5}, - {clip_gradient => 0.4, rescale_grad => 0.14}, - {rescale_grad => 0.8}, - {clip_gradient => 0.5, wd => 0.07}, - {clip_gradient => 0.4, rescale_grad => 0.14, wd => 0.03}, - {rescale_grad => 0.8, wd => 0.05}, - {centered => 1}, - {clip_gradient => 0.5, centered => 1}, - {clip_gradient => 0.4, rescale_grad => 0.14, centered => 1}, - {rescale_grad => 0.8, centered => 1}, - {clip_gradient => 0.5, wd => 0.07, centered => 1}, - {clip_gradient => 0.4, rescale_grad => 0.14, wd => 0.03, centered => 1}, - {rescale_grad => 0.8, wd => 0.05, centered => 1}, - {clip_gradient => 0.5, clip_weights => 0.01}, - {clip_gradient => 0.4, rescale_grad => 0.14, clip_weights => 0.01}, - {rescale_grad => 0.8, clip_weights => 0.01}, - {clip_gradient => 0.5, wd => 0.07, clip_weights => 0.01}, - {clip_gradient => 0.4, rescale_grad => 0.14, wd => 0.03, clip_weights => 0.01}, - {rescale_grad => 0.8, wd => 0.05, clip_weights => 0.01}, - {centered => 1, clip_weights => 0.01}, - {clip_gradient => 0.5, centered => 1, clip_weights => 0.01}, - {clip_gradient => 0.4, rescale_grad => 0.14, centered => 1, clip_weights => 0.01}, - {rescale_grad => 0.8, centered => 1, clip_weights => 0.01}, - {clip_gradient => 0.5, wd => 0.07, centered => 1, clip_weights => 0.01}, - {clip_gradient => 0.4, rescale_grad => 0.14, wd => 0.03, 
centered => 1, clip_weights => 0.01}, - {rescale_grad => 0.8, wd => 0.05, centered => 1, clip_weights => 0.01}); - for my $kwarg (@kwargs) - { - compare_optimizer($opt1->new(%$kwarg), $opt2->new(%$kwarg), $shape, 'float32'); - } -} - -sub test_sgd -{ - mx->random->seed(0); - my $opt1 = 'PerlSGD'; - my $opt2 = mx->optimizer->SGD; - my $shape = [3, 4, 5]; - my @mom_options = ({}, {momentum => 0.9}); - my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); - my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); - my @wd_options = ({}, {wd => 0.03}, {wd => 0.05}, {wd => 0.07}); - my @mp_options = ({}, {multi_precision => 0}, {multi_precision => 1}); - for my $dtype(qw/float16 float32 float64/) - { - for my $mom_option (@mom_options) - { - for my $cg_option (@cg_options) - { - for my $rg_option (@rg_options) - { - for my $wd_option (@wd_options) - { - for my $mp_option (@mp_options) - { - my %kwarg; - %kwarg = (%kwarg, %$mom_option); - %kwarg = (%kwarg, %$cg_option); - %kwarg = (%kwarg, %$rg_option); - %kwarg = (%kwarg, %$wd_option); - %kwarg = (%kwarg, %$mp_option); - next if ( - $dtype eq 'float16' - and - (not exists $kwarg{multi_precision} or not $kwarg{multi_precision}) - ); - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype); - } - } - } - } - } - } -} - -sub test_sparse_sgd -{ - mx->random->seed(0); - my $opt1 = 'PerlSparseSGD'; - my $opt2 = mx->optimizer->SGD; - my $shape = [3, 4, 5]; - my @mom_options = ({}, {momentum => 0.9}); - my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); - my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); - my @wd_options = ({}, {wd => 0.03}, {wd => 0.05}, {wd => 0.07}); - my @mp_options = ({}, {multi_precision => 0}, {multi_precision => 1}); - for my $dtype(qw/float32/) - { - for my $mom_option (@mom_options) - { - for my $cg_option (@cg_options) - { - for my $rg_option (@rg_options) - { - for my $wd_option (@wd_options) - { - for my $mp_option (@mp_options) - { - my %kwarg; - %kwarg = (%kwarg, %$mom_option); - %kwarg = (%kwarg, %$cg_option); - %kwarg = (%kwarg, %$rg_option); - %kwarg = (%kwarg, %$wd_option); - %kwarg = (%kwarg, %$mp_option); - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'row_sparse', 'row_sparse'); - } - } - } - } - } - } -} - -sub test_std_sparse_sgd -{ - mx->random->seed(0); - my $opt1 = 'PerlSparseSGD'; - my $opt2 = mx->optimizer->SGD; - my $shape = [3, 4, 5]; - my @mom_options = ({momentum => 0.9}); - my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); - my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); - my @wd_options = ({}, {wd => 0.03}, {wd => 0.05}, {wd => 0.07}); - for my $dtype(qw/float32/) - { - for my $mom_option (@mom_options) - { - for my $cg_option (@cg_options) - { - for my $rg_option (@rg_options) - { - for my $wd_option (@wd_options) - { - my %kwarg; - %kwarg = (%kwarg, %$mom_option); - %kwarg = (%kwarg, %$cg_option); - %kwarg = (%kwarg, %$rg_option); - %kwarg = (%kwarg, %$wd_option); - compare_optimizer($opt1->new(lazy_update => 0, %kwarg), $opt2->new(lazy_update => 0, %kwarg), $shape, $dtype, 'row_sparse', 'row_sparse'); - } - } - } - } - } -} - -sub test_nag -{ - mx->random->seed(0); - my $opt1 = 'PerlNAG'; - my $opt2 = mx->optimizer->NAG; - my $shape = [3, 4, 5]; - my @mom_options = ({}, {momentum => 0.9}); - my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); - my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); - my 
@wd_options = ({}, {wd => 0.03}, {wd => 0.05}, {wd => 0.07}); - my @mp_options = ({}, {multi_precision => 0}, {multi_precision => 1}); - for my $dtype(qw/float16 float32 float64/) - { - for my $mom_option (@mom_options) - { - for my $cg_option (@cg_options) - { - for my $rg_option (@rg_options) - { - for my $wd_option (@wd_options) - { - for my $mp_option (@mp_options) - { - my %kwarg; - %kwarg = (%kwarg, %$mom_option); - %kwarg = (%kwarg, %$cg_option); - %kwarg = (%kwarg, %$rg_option); - %kwarg = (%kwarg, %$wd_option); - # %kwarg = (%kwarg, %$mp_option); - next if ( - $dtype eq 'float16' - and - (not exists $kwarg{multi_precision} or not $kwarg{multi_precision}) - ); - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype); - } - } - } - } - } - } -} - -sub test_ftml -{ - mx->random->seed(0); - my $opt1 = 'PerlFTML'; - my $opt2 = mx->optimizer->FTML; - my $shape = [3, 4, 5]; - my @beta1_options = ({}, {beta1 => 0.5}, {beta1 => 0.7}); - my @beta2_options = ({}, {beta1 => 0.8}, {beta1 => 0.9}); - my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); - my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); - my @wd_options = ({}, {wd => 0.03}, {wd => 0.05}, {wd => 0.07}); - for my $dtype(qw/float32/) - { - for my $beta1_option (@beta1_options) - { - for my $beta2_option (@beta2_options) - { - for my $rg_option (@rg_options) - { - for my $wd_option (@wd_options) - { - for my $cg_option (@cg_options) - { - my %kwarg; - %kwarg = (%kwarg, %$beta1_option); - %kwarg = (%kwarg, %$beta2_option); - %kwarg = (%kwarg, %$cg_option); - %kwarg = (%kwarg, %$rg_option); - %kwarg = (%kwarg, %$wd_option); - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype); - } - } - } - } - } - } -} - -sub test_signum -{ - mx->random->seed(0); - my $opt1 = 'PerlSignum'; - my $opt2 = mx->optimizer->Signum; - my $shape = [3, 4, 5]; - my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); - my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); - my @wd_options = ({}, {wd => 0.03}, {wd => 0.05}, {wd => 0.07}); - my @wd_lh_options = ({}, {wd_lh => 0.015}, {wd_lh => 0.0}); - my @mom_options = ({}, {momentum => 0.9}); - my @lr_options = ({learning_rate => 0.05}, {learning_rate => 0.01}); - for my $dtype (qw/float32 float64/) - { - for my $wd_lh_option (@wd_lh_options) - { - for my $mom_option (@mom_options) - { - for my $rg_option (@rg_options) - { - for my $wd_option (@wd_options) - { - for my $cg_option (@cg_options) - { - for my $lr_option (@lr_options) - { - my %kwarg; - %kwarg = (%kwarg, %$wd_lh_option); - %kwarg = (%kwarg, %$mom_option); - %kwarg = (%kwarg, %$lr_option); - %kwarg = (%kwarg, %$cg_option); - %kwarg = (%kwarg, %$rg_option); - %kwarg = (%kwarg, %$wd_option); - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype); - } - } - } - } - } - } - } -} - - -func test_lr_wd_mult() -{ - my $data = mx->sym->Variable('data'); - my $bias = mx->sym->Variable('fc1_bias', lr_mult => 1.0); - my $fc1 = mx->sym->FullyConnected({ data => $data, bias => $bias, name => 'fc1', num_hidden => 10, lr_mult => 0 }); - my $fc2 = mx->sym->FullyConnected({ data => $fc1, name => 'fc2', num_hidden => 10, wd_mult => 0.5 }); - - my $mod = mx->mod->new(symbol => $fc2, label_names => undef); - $mod->bind(data_shapes => [['data', [5,10]]]); - $mod->init_params(initializer => mx->init->Uniform(scale => 1.0)); - $mod->init_optimizer(optimizer_params => { learning_rate => "1.0" }); - my %args1 = %{ ($mod->get_params())[0] }; - 
for my $k (keys %args1) - { - $args1{$k} = $args1{$k}->aspdl; - } - $mod->forward(AI::MXNet::DataBatch->new(data=>[mx->random->uniform({low=>-1.0, high=>1.0, shape=>[5,10]})], label=>undef), is_train=>1); - $mod->backward($mod->get_outputs()); - $mod->update(); - my %args2 = %{ ($mod->get_params())[0] }; - for my $k (keys %args2) - { - $args2{$k} = $args2{$k}->aspdl; - } - is_deeply($mod->_p->_optimizer->lr_mult, { fc1_bias => 1, fc1_weight => 0 }, "lr_mult"); - is_deeply($mod->_p->_optimizer->wd_mult, { fc2_bias => 0.5, fc2_weight => 0.5, fc1_bias => 0, }, "wd_mult"); - ok(almost_equal($args1{fc1_weight}, $args2{fc1_weight}, 1e-10), "fc1_weight"); - ok(!almost_equal($args1{fc1_bias}, $args2{fc1_bias}, 1e-1), "fc1_bias"); - ok(!almost_equal($args1{fc2_weight}, $args2{fc2_weight}, 1e-1), "fc2_weight"); -} - -sub test_ftrl -{ - mx->random->seed(0); - my $opt1 = 'PerlFtrl'; - my $opt2 = mx->optimizer->Ftrl; - my $shape = [3, 4, 5]; - my @kwargs = ({}, - {clip_gradient => 0.5}, - {clip_gradient => 0.4, rescale_grad => 0.14}, - {rescale_grad => 0.8}, - {clip_gradient => 0.5, wd => 0.07}, - {clip_gradient => 0.4, rescale_grad => 0.14, wd => 0.03}, - {rescale_grad => 0.8, wd => 0.05}, - {rescale_grad => 0.8, wd => 0.05, lamda1 => 0.01}, - {clip_gradient => 0.5, wd => 0.07, lamda1 => 1.0}); - for my $kwarg (@kwargs) - { - compare_optimizer($opt1->new(%$kwarg), $opt2->new(%$kwarg), $shape, 'float32'); - compare_optimizer($opt1->new(sparse_update=>1, %$kwarg), $opt2->new(%$kwarg), $shape, - 'float32', 'row_sparse', 'row_sparse'); - } -} - -sub test_adagrad -{ - mx->random->seed(0); - my $opt1 = 'PerlAdaGrad'; - my $opt2 = mx->optimizer->AdaGrad; - my $shape = [3, 4, 5]; - my @eps_options= ({}, {epsilon => 1e-9}); - my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); - my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); - my @wd_options = ({}, {wd => 0}); - for my $dtype(qw/float32/) - { - for my $eps_option (@eps_options) - { - for my $cg_option (@cg_options) - { - for my $rg_option (@rg_options) - { - for my $wd_option (@wd_options) - { - my %kwarg; - %kwarg = (%kwarg, %$eps_option); - %kwarg = (%kwarg, %$cg_option); - %kwarg = (%kwarg, %$rg_option); - %kwarg = (%kwarg, %$wd_option); - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype); - if($wd_option->{wd} == 0) - { - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'row_sparse', 'row_sparse'); - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'default', 'row_sparse'); - } - } - } - } - } - } -} - -test_adam(); -test_rms(); -test_sgd(); -test_std_sparse_sgd(); -test_sparse_sgd(); -test_nag(); -test_ftml(); -test_signum(); -test_ftrl(); -test_adagrad(); -test_lr_wd_mult(); - - diff --git a/perl-package/AI-MXNet/t/test_random.t b/perl-package/AI-MXNet/t/test_random.t deleted file mode 100644 index f049679cbdbd..000000000000 --- a/perl-package/AI-MXNet/t/test_random.t +++ /dev/null @@ -1,248 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 515; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(same enumerate); - -sub check_with_device -{ - my ($device, $dtype) = @_; - my $tol = 0.1; - my @symbols = ( - { - name => 'normal', - symbol => sub { mx->sym->random->normal(@_) }, - ndop => sub { mx->nd->random->normal(@_) }, - params => { loc => 10.0, scale => 0.5 }, - inputs => [ [loc => [ [ 0.0, 2.5 ], [ -9.75, -7.0 ] ]] , [scale => [ [ 1.0, 3.7 ], [ 4.2, 1.5 ] ]] ], - checks => [ - [mean => sub { my ($x, $params) = @_; $x->astype('float64')->aspdl->avg - $params->{loc} }, $tol], - [std => sub { my ($x, $params) = @_; ($x->astype('float64')->aspdl->stats)[6] - $params->{scale} }, $tol] - ] - }, - { - name => 'randn', - ndop => sub { mx->nd->random->randn(@_) }, - params => { loc => 10.0, scale => 0.5 }, - checks => [ - [mean => sub { my ($x, $params) = @_; $x->astype('float64')->aspdl->avg - $params->{loc} }, $tol], - [std => sub { my ($x, $params) = @_; ($x->astype('float64')->aspdl->stats)[6] - $params->{scale} }, $tol] - ] - }, - { - name => 'uniform', - symbol => sub { mx->sym->random->uniform(@_) }, - ndop => sub { mx->nd->random->uniform(@_) }, - params => { low => -1.5, high => 3 }, - inputs => [ [low => [ [ 0.0, 2.5 ], [ -9.75, -1.0 ] ]] , [high => [ [ 1.0, 3.7 ], [ 4.2, 10.5 ] ]] ], - checks => [ - [mean => sub { my ($x, $params) = @_; $x->astype('float64')->aspdl->avg - ($params->{low} + $params->{high})/2 }, $tol], - [std => sub { my ($x, $params) = @_; ($x->astype('float64')->aspdl->stats)[6] - sqrt(1/12) * ($params->{high} - $params->{low}) }, $tol] - ] - }, - { - name => 'gamma', - symbol => sub { mx->sym->random->gamma(@_) }, - ndop => sub { mx->nd->random->gamma(@_) }, - params => { alpha => 9, beta => 0.5 }, - inputs => [ [alpha => [ [ 0.0, 2.5 ], [ 9.75, 11 ] ]] , [beta => [ [ 1, 0.7 ], [ 0.5, 0.3 ] ]] ], - checks => [ - [mean => sub { my ($x, $params) = @_; $x->astype('float64')->aspdl->avg - $params->{alpha} * $params->{beta} }, $tol], - [std => sub { my ($x, $params) = @_; ($x->astype('float64')->aspdl->stats)[6] - sqrt($params->{alpha} * $params->{beta}**2) }, $tol] - ] - }, - { - name => 'exponential', - symbol => sub { mx->sym->random->exponential(@_) }, - ndop => sub { mx->nd->random->exponential(@_) }, - params => { scale => 1/4 }, - inputs => [ [scale => [ [ 1/1, 1/8.5 ], [ 1/2.7, 1/0.5 ] ]] ], - checks => [ - [mean => sub { my ($x, $params) = @_; $x->astype('float64')->aspdl->avg - $params->{scale} }, $tol], - [std => sub { my ($x, $params) = @_; ($x->astype('float64')->aspdl->stats)[6] - $params->{scale} }, $tol] - ] - }, - { - name => 'poisson', - symbol => sub { mx->sym->random->poisson(@_) }, - ndop => sub { mx->nd->random->poisson(@_) }, - params => { lam => 4 }, - inputs => [ [lam => [ [ 1, 8.5 ], [ 2.7, 0.5 ] ]] ], - checks => [ - [mean => sub { my ($x, $params) = @_; $x->astype('float64')->aspdl->avg - $params->{lam} }, $tol], - [std => sub { my ($x, $params) = @_; ($x->astype('float64')->aspdl->stats)[6] - sqrt($params->{lam}) }, $tol] - ] - }, - { - name => 'neg-binomial', - symbol => sub { mx->sym->random->negative_binomial(@_) 
}, - ndop => sub { mx->nd->random->negative_binomial(@_) }, - params => { k => 3, p => 0.4 }, - inputs => [ [k => [ [ 3, 4 ], [ 5, 6 ] ]] , [p => [ [ 0.4, 0.77 ], [ 0.5, 0.84 ] ]] ], - checks => [ - [mean => sub { my ($x, $params) = @_; $x->astype('float64')->aspdl->avg - $params->{k}*(1-$params->{p})/$params->{p} }, $tol], - [std => sub { my ($x, $params) = @_; ($x->astype('float64')->aspdl->stats)[6] - sqrt($params->{k}*(1-$params->{p}))/$params->{p} }, $tol] - ] - }, - { - name => 'gen-neg-binomial', - symbol => sub { mx->sym->random->generalized_negative_binomial(@_) }, - ndop => sub { mx->nd->random->generalized_negative_binomial(@_) }, - params => { mu => 2, alpha => 0.3 }, - inputs => [ [mu => [ [ 2, 2.5 ], [ 1.3, 1.9 ] ]] , [alpha => [ [ 1.0, 0.1 ], [ 0.2, 0.5 ] ]] ], - checks => [ - [mean => sub { my ($x, $params) = @_; $x->astype('float64')->aspdl->avg - $params->{mu} }, $tol], - [std => sub { my ($x, $params) = @_; ($x->astype('float64')->aspdl->stats)[6] - sqrt($params->{mu}+$params->{alpha}*$params->{mu}**2) }, $tol] - ] - }, - ); - my $shape = [1000, 1000]; - for my $symbdic (@symbols) - { - my $name = $symbdic->{name}; - my $ndop = $symbdic->{ndop}; - - # check directly - my %params = %{ $symbdic->{params} }; - %params = (%params, shape=>$shape, dtype=>$dtype, ctx=>$device); - mx->random->seed(128); - my $ret1 = $ndop->(%params); - mx->random->seed(128); - my $ret2 = $ndop->(%params); - ok(same($ret1->aspdl, $ret2->aspdl), "simple $name"); - - for my $d (@{ $symbdic->{checks} }) - { - my ($check_name, $check_func, $tol) = @$d; - ok((abs($check_func->($ret1, \%params)) < $tol), "simple $name, $check_name"); - } - - # check multi-distribution sampling, only supports cpu for now - next unless $symbdic->{inputs}; - %params = (shape=>$shape, dtype=>$dtype, ctx=>$device); - %params = (%params, map { $_->[0] => mx->nd->array($_->[1], ctx=>$device, dtype=>$dtype) } @{ $symbdic->{inputs} }); - mx->random->seed(128); - $ret1 = $ndop->(%params); - mx->random->seed(128); - $ret2 = $ndop->(%params); - ok(same($ret1->aspdl, $ret2->aspdl), "advanced $name"); - - for my $i (0,1) - { - for my $j (0,1) - { - my %stats = map { $_->[0] => $_->[1][$i][$j] } @{ $symbdic->{inputs} }; - for my $d (@{ $symbdic->{checks} }) - { - my ($check_name, $check_func, $tol) = @$d; - ok((abs($check_func->($ret2->at($i)->at($j), \%stats)) < $tol), "advanced $name, $check_name"); - } - } - } - - # check symbolic - my $symbol = $symbdic->{symbol}; - next if not $symbol; - my $X = mx->sym->Variable("X"); - %params = %{ $symbdic->{params} }; - %params = (%params, shape=>$shape, dtype=>$dtype); - my $Y = $symbol->(%params) + $X; - my $x = mx->nd->zeros($shape, dtype=>$dtype, ctx=>$device); - my $xgrad = mx->nd->zeros($shape, dtype=>$dtype, ctx=>$device); - my $yexec = $Y->bind(ctx => $device, args => { X => $x }, args_grad => { X => $xgrad }); - mx->random->seed(128); - $yexec->forward(1); - $yexec->backward($yexec->outputs->[0]); - my $un1 = ($yexec->outputs->[0] - $x)->copyto($device); - ok(same($xgrad->aspdl, $un1->aspdl), "symbolic simple"); - mx->random->seed(128); - $yexec->forward(); - my $un2 = ($yexec->outputs->[0] - $x)->copyto($device); - ok(same($un1->aspdl, $un2->aspdl), "symbolic simple $name"); - - for my $d (@{ $symbdic->{checks} }) - { - my ($check_name, $check_func, $tol) = @$d; - ok((abs($check_func->($un1, \%params)) < $tol), "symbolic $name, $check_name"); - } - - # check multi-distribution sampling, only supports cpu for now - $symbol = $symbdic->{symbol}; - %params = (shape=>$shape, 
dtype=>$dtype); - my $single_param = @{ $symbdic->{inputs} } == 1; - my $v1 = mx->sym->Variable('v1'); - my $v2 = mx->sym->Variable('v2'); - $Y = $symbol->($single_param ? ($v1) : ($v1, $v2), %params); - my $bindings = { v1 => mx->nd->array($symbdic->{inputs}[0][1]) }; - if(not $single_param) - { - $bindings->{v2} = mx->nd->array($symbdic->{inputs}[1][1]); - } - $yexec = $Y->bind(ctx=>$device, args=>$bindings); - $yexec->forward(); - $un1 = $yexec->outputs->[0]->copyto($device); - %params = (); - enumerate(sub { - my ($i, $r) = @_; - enumerate(sub { - my ($j, $p1) = @_; - $params{ $symbdic->{inputs}[0][0] } = $p1; - if(not $single_param) - { - $params{ $symbdic->{inputs}[1][0] } = $symbdic->{inputs}[1][1][$i][$j]; - } - my $samples = $un1->at($i)->at($j); - for my $d (@{ $symbdic->{checks} }) - { - my ($check_name, $check_func, $tol) = @$d; - ok((abs($check_func->($samples, \%params)) < $tol), "symbolic advanced $name, $check_name"); - } - }, $r); - }, $symbdic->{inputs}[0][1]); - } -} - -sub test_random -{ - check_with_device(mx->context->current_context(), 'float16'); - check_with_device(mx->context->current_context(), 'float32'); - check_with_device(mx->context->current_context(), 'float64'); -} - -test_random(); - -sub test_sample_multinomial -{ - my $x = mx->nd->array([[0,1,2,3,4],[4,3,2,1,0]])/10.0; - ok(@{ mx->nd->random->multinomial($x, shape=>1000, get_prob=>1) }, "multiminomial"); -} - -test_sample_multinomial(); - -sub test_seed_context -{ - ## only checking perl/swig interaction - ## c++ implementation is tested on python's side thoroughly already - mx->random->seed(1234); - mx->random->seed(1234, ctx => mx->cpu(0)); - ok(1); -} - -test_seed_context(); diff --git a/perl-package/AI-MXNet/t/test_recordio.t b/perl-package/AI-MXNet/t/test_recordio.t deleted file mode 100644 index 05722228c75f..000000000000 --- a/perl-package/AI-MXNet/t/test_recordio.t +++ /dev/null @@ -1,90 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
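
For reference, the test_random.t deleted above validated each sampler statistically rather than against fixed values: it drew a 1000x1000 sample and required the empirical mean and standard deviation to match the distribution parameters within a tolerance of 0.1 (normal and randn are checked directly against loc and scale). The identities encoded in its `checks` closures are the standard moment formulas, read off the removed code:

    \begin{aligned}
    \mathrm{uniform}(l,h):\ & \mu = \tfrac{l+h}{2}, & \sigma &= \tfrac{h-l}{\sqrt{12}}\\
    \mathrm{gamma}(\alpha,\beta):\ & \mu = \alpha\beta, & \sigma &= \sqrt{\alpha\beta^{2}}\\
    \mathrm{exponential}(s):\ & \mu = s, & \sigma &= s\\
    \mathrm{poisson}(\lambda):\ & \mu = \lambda, & \sigma &= \sqrt{\lambda}\\
    \mathrm{negbin}(k,p):\ & \mu = \tfrac{k(1-p)}{p}, & \sigma &= \tfrac{\sqrt{k(1-p)}}{p}\\
    \mathrm{gen.negbin}(\mu_0,\alpha):\ & \mu = \mu_0, & \sigma &= \sqrt{\mu_0+\alpha\mu_0^{2}}
    \end{aligned}
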
- -use strict; -use warnings; -use AI::MXNet qw(mx); -use Test::More tests => 1711; -use File::Temp qw/tempfile/; -use PDL; - -sub test_recordio -{ - my ($fd, $frec) = tempfile(); - my $N = 255; - - my $writer = mx->recordio->MXRecordIO($frec, 'w'); - for my $i (0..$N-1) - { - $writer->write(chr($i)); - } - undef $writer; - - my $reader = mx->recordio->MXRecordIO($frec, 'r'); - for my $i (0..$N-1) - { - my $res = $reader->read; - is($res, chr($i)); - } -} - -sub test_indexed_recordio -{ - my ($fi, $fidx) = tempfile(); - my ($fr, $frec) = tempfile(); - my $N = 255; - - my $writer = mx->recordio->MXIndexedRecordIO($fidx, $frec, 'w'); - for my $i (0..$N-1) - { - $writer->write_idx($i, chr($i)); - } - undef $writer; - - my $reader = mx->recordio->MXIndexedRecordIO($fidx, $frec, 'r'); - my @keys = @{ $reader->keys }; - is_deeply([sort {$a <=> $b} @keys], [0..$N-1]); - @keys = List::Util::shuffle(@keys); - for my $i (@keys) - { - my $res = $reader->read_idx($i); - is($res, chr($i)); - } -} - -sub test_recordio_pack_label -{ - my $N = 25; - my @ascii_uppercase_and_digits = ('A'..'Z', 0..9); - for my $i (1..$N-1) - { - for my $j (0..$N-1) - { - my $content = join('', map { $ascii_uppercase_and_digits[int(rand(36))] } 0..$j-1); - my $label = mx->nd->array(random($i), dtype => 'float32')->aspdl; - my $header = [0, $label, 0, 0]; - my $s = mx->recordio->pack($header, $content); - my ($rheader, $rcontent) = mx->recordio->unpack($s); - ok(($label == $rheader->label)->all); - ok($content eq $rcontent); - } - } -} - -test_recordio_pack_label(); -test_recordio(); -test_indexed_recordio(); \ No newline at end of file diff --git a/perl-package/AI-MXNet/t/test_rnn.t b/perl-package/AI-MXNet/t/test_rnn.t deleted file mode 100644 index da382547f5c3..000000000000 --- a/perl-package/AI-MXNet/t/test_rnn.t +++ /dev/null @@ -1,290 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
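
The test_recordio.t deleted above reduces to the pack/unpack round trip sketched below. This is a minimal sketch distilled from the removed test, using only calls that appeared in it; the payload string and variable names are illustrative, and the four-element header layout [flag, label, id, id2] matches the header the test built.

    use strict;
    use warnings;
    use AI::MXNet qw(mx);

    # pack one record with a float32 label vector, then recover it
    my $label  = mx->nd->array([1.0, 2.0], dtype => 'float32')->aspdl;
    my $header = [0, $label, 0, 0];          # [flag, label, id, id2]
    my $s      = mx->recordio->pack($header, 'payload');
    my ($rheader, $rcontent) = mx->recordio->unpack($s);
    die 'label mismatch'   unless ($label == $rheader->label)->all;
    die 'payload mismatch' unless $rcontent eq 'payload';
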
- -use strict; -use warnings; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(same); -use PDL; -use Test::More tests => 54; - -sub test_rnn -{ - my $cell = mx->rnn->RNNCell(100, prefix=>'rnn_'); - my ($outputs) = $cell->unroll(3, input_prefix=>'rnn_'); - $outputs = mx->sym->Group($outputs); - is_deeply([sort keys %{$cell->params->_params}], ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight']); - is_deeply($outputs->list_outputs(), ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']); - my (undef, $outs, undef) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -sub test_lstm -{ - my $cell = mx->rnn->LSTMCell(100, prefix=>'rnn_', forget_bias => 1); - my($outputs) = $cell->unroll(3, input_prefix=>'rnn_'); - $outputs = mx->sym->Group($outputs); - is_deeply([sort keys %{$cell->params->_params}], ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight']); - is_deeply($outputs->list_outputs(), ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']); - my (undef, $outs, undef) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -sub test_lstm_forget_bias -{ - my $forget_bias = 2; - my $stack = mx->rnn->SequentialRNNCell(); - $stack->add(mx->rnn->LSTMCell(100, forget_bias=>$forget_bias, prefix=>'l0_')); - $stack->add(mx->rnn->LSTMCell(100, forget_bias=>$forget_bias, prefix=>'l1_')); - - my $dshape = [32, 1, 200]; - my $data = mx->sym->Variable('data'); - - my ($sym) = $stack->unroll(1, inputs => $data, merge_outputs => 1); - my $mod = mx->mod->Module($sym, context => mx->cpu(0)); - $mod->bind(data_shapes=>[['data', $dshape]]); - - $mod->init_params(); - my ($bias_argument) = grep { /i2h_bias$/ } @{ $sym->list_arguments }; - my $f = zeros(100); - my $expected_bias = $f->glue(0, $forget_bias * ones(100), zeros(200)); - ok( - ((($mod->get_params())[0]->{$bias_argument}->aspdl - $expected_bias)->abs < 1e-07)->all - ); -} - -sub test_gru -{ - my $cell = mx->rnn->GRUCell(100, prefix=>'rnn_'); - my($outputs) = $cell->unroll(3, input_prefix=>'rnn_'); - $outputs = mx->sym->Group($outputs); - is_deeply([sort keys %{$cell->params->_params}], ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight']); - is_deeply($outputs->list_outputs(), ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']); - my (undef, $outs, undef) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -sub test_residual -{ - my $cell = mx->rnn->ResidualCell(mx->rnn->GRUCell(50, prefix=>'rnn_')); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..1]; - my ($outputs)= $cell->unroll(2, inputs => $inputs); - $outputs = mx->sym->Group($outputs); - is_deeply( - [sort keys %{ $cell->params->_params }], - ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight'] - ); - is_deeply( - $outputs->list_outputs, - ['rnn_t0_out_plus_residual_output', 'rnn_t1_out_plus_residual_output'] - ); - - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10, 50], rnn_t1_data=>[10, 50]); - is_deeply($outs, [[10, 50], [10, 50]]); - $outputs = $outputs->eval(args => { - rnn_t0_data=>mx->nd->ones([10, 50]), - rnn_t1_data=>mx->nd->ones([10, 50]), - rnn_i2h_weight=>mx->nd->zeros([150, 50]), - rnn_i2h_bias=>mx->nd->zeros([150]), - rnn_h2h_weight=>mx->nd->zeros([150, 50]), - 
rnn_h2h_bias=>mx->nd->zeros([150]) - }); - my $expected_outputs = mx->nd->ones([10, 50])->aspdl; - same(@{$outputs}[0]->aspdl, $expected_outputs); - same(@{$outputs}[1]->aspdl, $expected_outputs); -} - -sub test_residual_bidirectional -{ - my $cell = mx->rnn->ResidualCell( - mx->rnn->BidirectionalCell( - mx->rnn->GRUCell(25, prefix=>'rnn_l_'), - mx->rnn->GRUCell(25, prefix=>'rnn_r_') - ) - ); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..1]; - my ($outputs) = $cell->unroll(2, inputs => $inputs, merge_outputs=>0); - $outputs = mx->sym->Group($outputs); - is_deeply( - [sort keys %{ $cell->params->_params }], - ['rnn_l_h2h_bias', 'rnn_l_h2h_weight', 'rnn_l_i2h_bias', 'rnn_l_i2h_weight', - 'rnn_r_h2h_bias', 'rnn_r_h2h_weight', 'rnn_r_i2h_bias', 'rnn_r_i2h_weight'] - ); - is_deeply( - $outputs->list_outputs, - ['bi_t0_plus_residual_output', 'bi_t1_plus_residual_output'] - ); - - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10, 50], rnn_t1_data=>[10, 50]); - is_deeply($outs, [[10, 50], [10, 50]]); - $outputs = $outputs->eval(args => { - rnn_t0_data=>mx->nd->ones([10, 50])+5, - rnn_t1_data=>mx->nd->ones([10, 50])+5, - rnn_l_i2h_weight=>mx->nd->zeros([75, 50]), - rnn_l_i2h_bias=>mx->nd->zeros([75]), - rnn_l_h2h_weight=>mx->nd->zeros([75, 25]), - rnn_l_h2h_bias=>mx->nd->zeros([75]), - rnn_r_i2h_weight=>mx->nd->zeros([75, 50]), - rnn_r_i2h_bias=>mx->nd->zeros([75]), - rnn_r_h2h_weight=>mx->nd->zeros([75, 25]), - rnn_r_h2h_bias=>mx->nd->zeros([75]) - }); - my $expected_outputs = (mx->nd->ones([10, 50])+5)->aspdl; - ok(same(@{$outputs}[0]->aspdl, $expected_outputs)); - ok(same(@{$outputs}[1]->aspdl, $expected_outputs)); -} - -sub test_stack -{ - my $cell = mx->rnn->SequentialRNNCell(); - for my $i (0..4) - { - if($i == 1) - { - $cell->add(mx->rnn->ResidualCell(mx->rnn->LSTMCell(100, prefix=>"rnn_stack${i}_"))); - } - else - { - $cell->add(mx->rnn->LSTMCell(100, prefix=>"rnn_stack${i}_")); - } - } - my ($outputs) = $cell->unroll(3, input_prefix=>'rnn_'); - $outputs = mx->sym->Group($outputs); - my %params = %{ $cell->params->_params }; - for my $i (0..4) - { - ok(exists $params{"rnn_stack${i}_h2h_weight"}); - ok(exists $params{"rnn_stack${i}_h2h_bias"}); - ok(exists $params{"rnn_stack${i}_i2h_weight"}); - ok(exists $params{"rnn_stack${i}_i2h_bias"}); - } - is_deeply($outputs->list_outputs(), ['rnn_stack4_t0_out_output', 'rnn_stack4_t1_out_output', 'rnn_stack4_t2_out_output']); - my (undef, $outs, undef) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -sub test_bidirectional -{ - my $cell = mx->rnn->BidirectionalCell( - mx->rnn->LSTMCell(100, prefix=>'rnn_l0_'), - mx->rnn->LSTMCell(100, prefix=>'rnn_r0_'), - output_prefix=>'rnn_bi_' - ); - my ($outputs) = $cell->unroll(3, input_prefix=>'rnn_'); - $outputs = mx->sym->Group($outputs); - is_deeply($outputs->list_outputs(), ['rnn_bi_t0_output', 'rnn_bi_t1_output', 'rnn_bi_t2_output']); - my (undef, $outs, undef) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 200], [10, 200], [10, 200]]); -} - -sub test_unfuse -{ - my $cell = mx->rnn->FusedRNNCell( - 100, num_layers => 1, mode => 'lstm', - prefix => 'test_', bidirectional => 1 - )->unfuse; - my ($outputs) = $cell->unroll(3, input_prefix=>'rnn_'); - $outputs = mx->sym->Group($outputs); - is_deeply($outputs->list_outputs(), ['test_bi_lstm_0t0_output', 'test_bi_lstm_0t1_output', 'test_bi_lstm_0t2_output']); - my 
(undef, $outs, undef) = $outputs->infer_shape(rnn_t0_data=>[10,50], rnn_t1_data=>[10,50], rnn_t2_data=>[10,50]); - is_deeply($outs, [[10, 200], [10, 200], [10, 200]]); -} - -sub test_zoneout -{ - my $cell = mx->rnn->ZoneoutCell( - mx->rnn->RNNCell(100, prefix=>'rnn_'), - zoneout_outputs => 0.5, - zoneout_states => 0.5 - ); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, inputs => $inputs); - $outputs = mx->sym->Group($outputs); - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[10, 50], rnn_t1_data=>[10, 50], rnn_t2_data=>[10, 50]); - is_deeply($outs, [[10, 100], [10, 100], [10, 100]]); -} - -sub test_convrnn -{ - my $cell = mx->rnn->ConvRNNCell(input_shape => [1, 3, 16, 10], num_hidden=>10, - h2h_kernel=>[3, 3], h2h_dilate=>[1, 1], - i2h_kernel=>[3, 3], i2h_stride=>[1, 1], - i2h_pad=>[1, 1], i2h_dilate=>[1, 1], - prefix=>'rnn_'); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, inputs => $inputs); - $outputs = mx->sym->Group($outputs); - is_deeply( - [sort keys %{ $cell->params->_params }], - ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight'] - ); - is_deeply($outputs->list_outputs(), ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']); - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[1, 3, 16, 10], rnn_t1_data=>[1, 3, 16, 10], rnn_t2_data=>[1, 3, 16, 10]); - is_deeply($outs, [[1, 10, 16, 10], [1, 10, 16, 10], [1, 10, 16, 10]]); -} - -sub test_convlstm -{ - my $cell = mx->rnn->ConvLSTMCell(input_shape => [1, 3, 16, 10], num_hidden=>10, - h2h_kernel=>[3, 3], h2h_dilate=>[1, 1], - i2h_kernel=>[3, 3], i2h_stride=>[1, 1], - i2h_pad=>[1, 1], i2h_dilate=>[1, 1], - prefix=>'rnn_', forget_bias => 1); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, inputs => $inputs); - $outputs = mx->sym->Group($outputs); - is_deeply( - [sort keys %{ $cell->params->_params }], - ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight'] - ); - is_deeply($outputs->list_outputs(), ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']); - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[1, 3, 16, 10], rnn_t1_data=>[1, 3, 16, 10], rnn_t2_data=>[1, 3, 16, 10]); - is_deeply($outs, [[1, 10, 16, 10], [1, 10, 16, 10], [1, 10, 16, 10]]); -} - -sub test_convgru -{ - my $cell = mx->rnn->ConvGRUCell(input_shape => [1, 3, 16, 10], num_hidden=>10, - h2h_kernel=>[3, 3], h2h_dilate=>[1, 1], - i2h_kernel=>[3, 3], i2h_stride=>[1, 1], - i2h_pad=>[1, 1], i2h_dilate=>[1, 1], - prefix=>'rnn_', forget_bias => 1); - my $inputs = [map { mx->sym->Variable("rnn_t${_}_data") } 0..2]; - my ($outputs) = $cell->unroll(3, inputs => $inputs); - $outputs = mx->sym->Group($outputs); - is_deeply( - [sort keys %{ $cell->params->_params }], - ['rnn_h2h_bias', 'rnn_h2h_weight', 'rnn_i2h_bias', 'rnn_i2h_weight'] - ); - is_deeply($outputs->list_outputs(), ['rnn_t0_out_output', 'rnn_t1_out_output', 'rnn_t2_out_output']); - my (undef, $outs) = $outputs->infer_shape(rnn_t0_data=>[1, 3, 16, 10], rnn_t1_data=>[1, 3, 16, 10], rnn_t2_data=>[1, 3, 16, 10]); - is_deeply($outs, [[1, 10, 16, 10], [1, 10, 16, 10], [1, 10, 16, 10]]); -} - -test_rnn(); -test_lstm(); -test_lstm_forget_bias(); -test_gru(); -test_residual(); -test_residual_bidirectional(); -test_stack(); -test_bidirectional(); -test_unfuse(); -test_zoneout(); -test_convrnn(); -test_convlstm(); -test_convgru(); diff --git a/perl-package/AI-MXNet/t/test_runtime.t 
b/perl-package/AI-MXNet/t/test_runtime.t deleted file mode 100644 index 293ae73096e0..000000000000 --- a/perl-package/AI-MXNet/t/test_runtime.t +++ /dev/null @@ -1,62 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(dies_ok); -use Test::More 'no_plan'; -use Scalar::Util qw(refaddr); - -sub test_features -{ - my $features = mx->runtime->Features(); - ok(exists $features->features->{CUDA}); - ok(keys %{ $features->features } >= 30); -} - -sub test_is_singleton -{ - my $x = mx->runtime->Features(); - my $y = mx->runtime->Features(); - ok(refaddr($x) == refaddr($y)); -} - -sub test_is_enabled -{ - my $features = mx->runtime->Features(); - for my $f (keys %{ $features->features }) - { - if($features->features->{$f}) - { - ok($features->is_enabled($f)); - } - else - { - ok(not $features->is_enabled($f)); - } - } -} - -sub test_is_enabled_not_existing -{ - my $features = mx->runtime->Features(); - dies_ok(sub { $features->is_enabled("hello world") }); -} - -test_features(); -test_is_singleton(); -test_is_enabled(); -test_is_enabled_not_existing(); diff --git a/perl-package/AI-MXNet/t/test_sparse_ndarray.t b/perl-package/AI-MXNet/t/test_sparse_ndarray.t deleted file mode 100644 index afb0b25aa816..000000000000 --- a/perl-package/AI-MXNet/t/test_sparse_ndarray.t +++ /dev/null @@ -1,1006 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
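
The test_runtime.t deleted above covered the Perl binding of the runtime feature-detection API. Below is a minimal sketch of the same calls, assuming only what the removed test itself used: Features() returns a singleton whose `features` hash maps capability names to booleans, queryable per name via is_enabled().

    use strict;
    use warnings;
    use AI::MXNet qw(mx);

    my $features = mx->runtime->Features();
    # per-feature query
    print 'CUDA: ', ($features->is_enabled('CUDA') ? "on\n" : "off\n");
    # the full capability map is exposed as a hash
    for my $name (sort keys %{ $features->features })
    {
        printf "%-24s %s\n", $name, $features->features->{$name} ? 'on' : 'off';
    }
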
- -use strict; -use warnings; -use Scalar::Util qw(blessed); -use Test::More 'no_plan'; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(zip assert enumerate same rand_shape_2d rand_shape_3d - rand_sparse_ndarray random_arrays almost_equal rand_ndarray randint allclose dies_ok); -use AI::MXNet::Base qw(pones pzeros pdl product rand_sparse); -$ENV{MXNET_STORAGE_FALLBACK_LOG_VERBOSE} = 0; -$ENV{MXNET_SUBGRAPH_VERBOSE} = 0; - - -sub sparse_nd_ones -{ - my ($shape, $stype) = @_; - return mx->nd->ones($shape)->tostype($stype); -} - -sub test_sparse_nd_elemwise_add -{ - my $check_sparse_nd_elemwise_binary = sub { - my ($shapes, $stypes, $f, $g) = @_; - # generate inputs - my @nds; - enumerate(sub { - my ($i, $stype) = @_; - my $nd; - if($stype eq 'row_sparse') - { - ($nd) = rand_sparse_ndarray($shapes->[$i], $stype); - } - elsif($stype eq 'default') - { - $nd = mx->nd->array(random_arrays($shapes->[$i]), dtype => 'float32'); - } - else - { - die; - } - push @nds, $nd; - }, $stypes); - # check result - my $test = $f->($nds[0], $nds[1]); - ok(almost_equal($test->aspdl, $g->($nds[0]->aspdl, $nds[1]->aspdl))); - }; - my $num_repeats = 2; - my $g = sub { $_[0] + $_[1] }; - my $op = sub { mx->nd->elemwise_add(@_) }; - for my $i (0..$num_repeats) - { - my $shape = rand_shape_2d(); - $shape = [$shape, $shape]; - $check_sparse_nd_elemwise_binary->($shape, ['default', 'default'], $op, $g); - $check_sparse_nd_elemwise_binary->($shape, ['row_sparse', 'row_sparse'], $op, $g); - } -} - -test_sparse_nd_elemwise_add(); - -sub test_sparse_nd_copy -{ - my $check_sparse_nd_copy = sub { my ($from_stype, $to_stype, $shape) = @_; - my $from_nd = rand_ndarray($shape, $from_stype); - # copy to ctx - my $to_ctx = $from_nd->copyto(AI::MXNet::Context->current_ctx); - # copy to stype - my $to_nd = rand_ndarray($shape, $to_stype); - $from_nd->copyto($to_nd); - ok(($from_nd->aspdl != $to_ctx->aspdl)->abs->sum == 0); - ok(($from_nd->aspdl != $to_nd->aspdl)->abs->sum == 0); - }; - my $shape = rand_shape_2d(); - my $shape_3d = rand_shape_3d(); - my @stypes = ('row_sparse', 'csr'); - for my $stype (@stypes) - { - $check_sparse_nd_copy->($stype, 'default', $shape); - $check_sparse_nd_copy->('default', $stype, $shape); - } - $check_sparse_nd_copy->('row_sparse', 'row_sparse', $shape_3d); - $check_sparse_nd_copy->('row_sparse', 'default', $shape_3d); - $check_sparse_nd_copy->('default', 'row_sparse', $shape_3d); -} - -test_sparse_nd_copy(); - -sub test_sparse_nd_basic -{ - my $check_sparse_nd_basic_rsp = sub { - my $storage_type = 'row_sparse'; - my $shape = rand_shape_2d(); - my ($nd) = rand_sparse_ndarray($shape, $storage_type); - ok($nd->_num_aux == 1); - ok($nd->indices->dtype eq 'int64'); - ok($nd->stype eq 'row_sparse'); - }; - $check_sparse_nd_basic_rsp->(); -} - -test_sparse_nd_basic(); - -sub test_sparse_nd_setitem -{ - my $check_sparse_nd_setitem = sub { my ($stype, $shape, $dst) = @_; - my $x = mx->nd->zeros($shape, stype=>$stype); - $x .= $dst; - my $dst_nd = (blessed $dst and $dst->isa('PDL')) ? mx->nd->array($dst) : $dst; - ok(($x->aspdl == (ref $dst_nd ? 
$dst_nd->aspdl : $dst_nd))->all); - }; - - my $shape = rand_shape_2d(); - for my $stype ('row_sparse', 'csr') - { - # ndarray assignment - $check_sparse_nd_setitem->($stype, $shape, rand_ndarray($shape, 'default')); - $check_sparse_nd_setitem->($stype, $shape, rand_ndarray($shape, $stype)); - # numpy assignment - $check_sparse_nd_setitem->($stype, $shape, pones(reverse @{ $shape })); - } - # scalar assigned to row_sparse NDArray - $check_sparse_nd_setitem->('row_sparse', $shape, 2); -} - -test_sparse_nd_setitem(); - -sub test_sparse_nd_slice -{ - my $shape = [randint(2, 10), randint(2, 10)]; - my $stype = 'csr'; - my ($A) = rand_sparse_ndarray($shape, $stype); - my $A2 = $A->aspdl; - my $start = randint(0, $shape->[0] - 1); - my $end = randint($start + 1, $shape->[0]); - ok(same($A->slice([$start, $end])->aspdl, $A2->slice('X', [$start, $end]))); - ok(same($A->slice([$start - $shape->[0], $end])->aspdl, $A2->slice('X', [$start, $end]))); - ok(same($A->slice([$start, $shape->[0] - 1])->aspdl, $A2->slice('X', [$start, $shape->[0]-1]))); - ok(same($A->slice([0, $end])->aspdl, $A2->slice('X', [0, $end]))); - - my $start_col = randint(0, $shape->[1] - 1); - my $end_col = randint($start_col + 1, $shape->[1]); - my $result = $A->slice(begin=>[$start, $start_col], end=>[$end, $end_col]); - my $result_dense = mx->nd->array($A2)->slice(begin=>[$start, $start_col], end=>[$end, $end_col]); - ok(same($result_dense->aspdl, $result->aspdl)); - - $A = mx->nd->sparse->zeros('csr', $shape); - $A2 = $A->aspdl; - ok(same($A->slice([$start, $end])->aspdl, $A2->slice('X', [$start, $end]))); - $result = $A->slice(begin=>[$start, $start_col], end=>[$end, $end_col]); - $result_dense = mx->nd->array($A2)->slice(begin=>[$start, $start_col], end=>[$end, $end_col]); - ok(same($result_dense->aspdl, $result->aspdl)); - - my $check_slice_nd_csr_fallback = sub { my ($shape) = @_; - my $stype = 'csr'; - my ($A) = rand_sparse_ndarray($shape, $stype); - my $A2 = $A->aspdl; - my $start = randint(0, $shape->[0] - 1); - my $end = randint($start + 1, $shape->[0]); - - # non-trivial step should fallback to dense slice op - my $result = $A->slice(begin=>[$start], end=>[$end+1], step=>[2]); - my $result_dense = mx->nd->array($A2)->slice(begin=>[$start], end=>[$end + 1], step=>[2]); - ok(same($result_dense->aspdl, $result->aspdl)); - }; - $shape = [randint(2, 10), randint(1, 10)]; - $check_slice_nd_csr_fallback->($shape); -} - -test_sparse_nd_slice(); - -sub test_sparse_nd_equal -{ - for my $stype ('row_sparse', 'csr') - { - my $shape = rand_shape_2d(); - my $x = mx->nd->zeros($shape, stype=>$stype); - my $y = sparse_nd_ones($shape, $stype); - my $z = $x == $y; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - $z = 0 == $x; - ok(($z->aspdl == pones(reverse @{ $shape }))->all); - } -} - -test_sparse_nd_equal(); - -sub test_sparse_nd_not_equal -{ - for my $stype ('row_sparse', 'csr') - { - my $shape = rand_shape_2d(); - my $x = mx->nd->zeros($shape, stype=>$stype); - my $y = sparse_nd_ones($shape, $stype); - my $z = $x != $y; - ok(($z->aspdl == pones(reverse @{ $shape }))->all); - $z = 0 != $x; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - } -} - -test_sparse_nd_not_equal(); - -sub test_sparse_nd_greater -{ - for my $stype ('row_sparse', 'csr') - { - my $shape = rand_shape_2d(); - my $x = mx->nd->zeros($shape, stype=>$stype); - my $y = sparse_nd_ones($shape, $stype); - my $z = $x > $y; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - $z = $y > 0; - ok(($z->aspdl == pones(reverse @{ $shape }))->all); - $z = 0 
> $y; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - } -} - -test_sparse_nd_greater(); - -sub test_sparse_nd_greater_equal -{ - for my $stype ('row_sparse', 'csr') - { - my $shape = rand_shape_2d(); - my $x = mx->nd->zeros($shape, stype=>$stype); - my $y = sparse_nd_ones($shape, $stype); - my $z = $x >= $y; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - $z = $y >= 0; - ok(($z->aspdl == pones(reverse @{ $shape }))->all); - $z = 0 >= $y; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - $z = $y >= 1; - ok(($z->aspdl == pones(reverse @{ $shape }))->all); - } -} - -test_sparse_nd_greater_equal(); - -sub test_sparse_nd_lesser -{ - for my $stype ('row_sparse', 'csr') - { - my $shape = rand_shape_2d(); - my $x = mx->nd->zeros($shape, stype=>$stype); - my $y = sparse_nd_ones($shape, $stype); - my $z = $y < $x; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - $z = 0 < $y; - ok(($z->aspdl == pones(reverse @{ $shape }))->all); - $z = $y < 0; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - } -} - -test_sparse_nd_lesser(); - -sub test_sparse_nd_lesser_equal -{ - for my $stype ('row_sparse', 'csr') - { - my $shape = rand_shape_2d(); - my $x = mx->nd->zeros($shape, stype=>$stype); - my $y = sparse_nd_ones($shape, $stype); - my $z = $y <= $x; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - $z = 0 <= $y; - ok(($z->aspdl == pones(reverse @{ $shape }))->all); - $z = $y <= 0; - ok(($z->aspdl == pzeros(reverse @{ $shape }))->all); - $z = 1 <= $y; - ok(($z->aspdl == pones(reverse @{ $shape }))->all); - } -} - -test_sparse_nd_lesser_equal(); - -sub test_sparse_nd_binary -{ - my $N = 2; - my $check_binary = sub { my ($fn, $stype) = @_; - for (0 .. 2) - { - my $ndim = 2; - my $oshape = [map { randint(1, 6) } 1..$ndim]; - my $bdim = 2; - my @lshape = @$oshape; - # one for broadcast op, another for elemwise op - my @rshape = @lshape[($ndim-$bdim)..@lshape-1]; - for my $i (0..$bdim-1) - { - my $sep = mx->nd->random->uniform(0, 1)->asscalar; - if($sep < 0.33) - { - $lshape[$ndim-$i-1] = 1; - } - elsif($sep < 0.66) - { - $rshape[$bdim-$i-1] = 1; - } - } - my $lhs = mx->nd->random->uniform(0, 1, shape=>\@lshape)->aspdl; - my $rhs = mx->nd->random->uniform(0, 1, shape=>\@rshape)->aspdl; - my $lhs_nd = mx->nd->array($lhs)->tostype($stype); - my $rhs_nd = mx->nd->array($rhs)->tostype($stype); - ok(allclose($fn->($lhs, $rhs), $fn->($lhs_nd, $rhs_nd)->aspdl, 1e-4)); - } - }; - for my $stype ('row_sparse', 'csr') - { - $check_binary->(sub { $_[0] + $_[1] }, $stype); - $check_binary->(sub { $_[0] - $_[1] }, $stype); - $check_binary->(sub { $_[0] * $_[1] }, $stype); - $check_binary->(sub { $_[0] / $_[1] }, $stype); - $check_binary->(sub { $_[0] ** $_[1] }, $stype); - $check_binary->(sub { $_[0] > $_[1] }, $stype); - $check_binary->(sub { $_[0] < $_[1] }, $stype); - $check_binary->(sub { $_[0] >= $_[1] }, $stype); - $check_binary->(sub { $_[0] <= $_[1] }, $stype); - $check_binary->(sub { $_[0] == $_[1] }, $stype); - } -} - -test_sparse_nd_binary(); - -sub test_sparse_nd_binary_scalar_op -{ - my $N = 3; - my $check = sub { my ($fn, $stype) = @_; - for (1..$N) - { - my $ndim = 2; - my $shape = [map { randint(1, 6) } 1..$ndim]; - my $npy = mx->nd->random->normal(0, 1, shape=>$shape)->aspdl; - my $nd = mx->nd->array($npy)->tostype($stype); - ok(allclose($fn->($npy), $fn->($nd)->aspdl, 1e-4)); - } - }; - for my $stype ('row_sparse', 'csr') - { - $check->(sub { 1 + $_[0] }, $stype); - $check->(sub { 1 - $_[0] }, $stype); - $check->(sub { 1 * $_[0] }, $stype); - $check->(sub { 1 / 
$_[0] }, $stype); - $check->(sub { 2 ** $_[0] }, $stype); - $check->(sub { 1 > $_[0] }, $stype); - $check->(sub { 0.5 > $_[0] }, $stype); - $check->(sub { 0.5 < $_[0] }, $stype); - $check->(sub { 0.5 >= $_[0] }, $stype); - $check->(sub { 0.5 <= $_[0] }, $stype); - $check->(sub { 0.5 == $_[0] }, $stype); - $check->(sub { $_[0] / 2 }, $stype); - } -} - -test_sparse_nd_binary_scalar_op(); - -sub test_sparse_nd_binary_iop -{ - my $N = 3; - my $check_binary = sub { my ($fn, $stype) = @_; - for (1..$N) - { - my $ndim = 2; - my $oshape = [map { randint(1, 6) } 1..$ndim]; - my $lhs = mx->nd->random->uniform(0, 1, shape => $oshape)->aspdl; - my $rhs = mx->nd->random->uniform(0, 1, shape => $oshape)->aspdl; - my $lhs_nd = mx->nd->array($lhs)->tostype($stype); - my $rhs_nd = mx->nd->array($rhs)->tostype($stype); - ok( - allclose( - $fn->($lhs, $rhs), - $fn->($lhs_nd, $rhs_nd)->aspdl, - 1e-4 - ) - ); - } - }; - - my $inplace_add = sub { my ($x, $y) = @_; - $x += $y; - return $x - }; - my $inplace_mul = sub { my ($x, $y) = @_; - $x *= $y; - return $x - }; - my @stypes = ('csr', 'row_sparse'); - my @fns = ($inplace_add, $inplace_mul); - for my $stype (@stypes) - { - for my $fn (@fns) - { - $check_binary->($fn, $stype); - } - } -} - -test_sparse_nd_binary_iop(); - -sub test_sparse_nd_negate -{ - my $check_sparse_nd_negate = sub { my ($shape, $stype) = @_; - my $npy = mx->nd->random->uniform(-10, 10, shape => rand_shape_2d())->aspdl; - my $arr = mx->nd->array($npy)->tostype($stype); - ok(almost_equal($npy, $arr->aspdl)); - ok(almost_equal(-$npy, (-$arr)->aspdl)); - - # a final check to make sure the negation (-) is not implemented - # as inplace operation, so the contents of arr does not change after - # we compute (-arr) - ok(almost_equal($npy, $arr->aspdl)); - }; - my $shape = rand_shape_2d(); - my @stypes = ('csr', 'row_sparse'); - for my $stype (@stypes) - { - $check_sparse_nd_negate->($shape, $stype); - } -} - -test_sparse_nd_negate(); - -sub test_sparse_nd_broadcast -{ - my $sample_num = 10; # TODO 1000 - my $test_broadcast_to = sub { my ($stype) = @_; - for (1..$sample_num) - { - my $ndim = 2; - my $target_shape = [map { randint(1, 11) } 1..$ndim]; - my $shape = \@{ $target_shape }; - my $axis_flags = [map { randint(0, 2) } 1..$ndim]; - my $axes = []; - enumerate(sub { - my ($axis, $flag) = @_; - if($flag) - { - $shape->[$axis] = 1; - } - }, $axis_flags); - my $dat = mx->nd->random->uniform(0, 1, shape => $shape)->aspdl - 0.5; - my $pdl_ret = $dat; - my $ndarray = mx->nd->array($dat)->tostype($stype); - my $ndarray_ret = $ndarray->broadcast_to($target_shape); - ok((pdl($ndarray_ret->shape) == pdl($target_shape))->all); - my $err = (($ndarray_ret->aspdl - $pdl_ret)**2)->avg; - ok($err < 1E-8); - } - }; - my @stypes = ('csr', 'row_sparse'); - for my $stype (@stypes) - { - $test_broadcast_to->($stype); - } -} - -test_sparse_nd_broadcast(); - -sub test_sparse_nd_transpose -{ - my $npy = mx->nd->random->uniform(-10, 10, shape => rand_shape_2d())->aspdl; - my @stypes = ('csr', 'row_sparse'); - for my $stype (@stypes) - { - my $nd = mx->nd->array($npy)->tostype($stype); - ok(almost_equal($npy->transpose, ($nd->T)->aspdl)); - } -} - -test_sparse_nd_transpose(); - -sub test_sparse_nd_storage_fallback -{ - my $check_output_fallback = sub { my ($shape) = @_; - my $ones = mx->nd->ones($shape); - my $out = mx->nd->zeros($shape, stype=>'csr'); - mx->nd->broadcast_add($ones, $ones * 2, out=>$out); - ok(($out->aspdl - 3)->sum == 0); - }; - - my $check_input_fallback = sub { my ($shape) = @_; - my $ones = 
mx->nd->ones($shape); - my $out = mx->nd->broadcast_add($ones->tostype('csr'), $ones->tostype('row_sparse')); - ok(($out->aspdl - 2)->sum == 0); - }; - - my $check_fallback_with_temp_resource = sub { my ($shape) = @_; - my $ones = mx->nd->ones($shape); - my $out = mx->nd->sum($ones); - ok($out->asscalar == product(@{ $shape })); - }; - - my $shape = rand_shape_2d(); - $check_output_fallback->($shape); - $check_input_fallback->($shape); - $check_fallback_with_temp_resource->($shape); -} - -test_sparse_nd_storage_fallback(); - -sub test_sparse_nd_astype -{ - my @stypes = ('row_sparse', 'csr'); - for my $stype (@stypes) - { - my $x = mx->nd->zeros(rand_shape_2d(), stype => $stype, dtype => 'float32'); - my $y = $x->astype('int32'); - ok($y->dtype eq 'int32'); - } -} - -test_sparse_nd_astype(); - -sub test_sparse_nd_storable -{ - my $repeat = 1; - my $dim0 = 40; - my $dim1 = 40; - my @stypes = ('row_sparse', 'csr'); - my @densities = (0, 0.5); - my %stype = (row_sparse => 'AI::MXNet::NDArray::RowSparse', csr => 'AI::MXNet::NDArray::CSR'); - for (1..$repeat) - { - my $shape = rand_shape_2d($dim0, $dim1); - for my $stype (@stypes) - { - for my $density (@densities) - { - my ($a) = rand_sparse_ndarray($shape, $stype, density => $density); - ok($a->isa($stype{$stype})); - my $data = Storable::freeze($a); - my $b = Storable::thaw($data); - ok($b->isa($stype{$stype})); - ok(same($a->aspdl, $b->aspdl)); - } - } - } -} - -test_sparse_nd_storable(); - -sub test_sparse_nd_save_load -{ - my $repeat = 1; - my @stypes = ('default', 'row_sparse', 'csr'); - my %stype = (default => 'AI::MXNet::NDArray', row_sparse => 'AI::MXNet::NDArray::RowSparse', csr => 'AI::MXNet::NDArray::CSR'); - my $num_data = 20; - my @densities = (0, 0.5); - my $fname = 'tmp_list.bin'; - for (1..$repeat) - { - my @data_list1; - for (1..$num_data) - { - my $stype = $stypes[randint(0, scalar(@stypes))]; - my $shape = rand_shape_2d(40, 40); - my $density = $densities[randint(0, scalar(@densities))]; - push @data_list1, rand_ndarray($shape, $stype, $density); - ok($data_list1[-1]->isa($stype{$stype})); - } - mx->nd->save($fname, \@data_list1); - - my @data_list2 = @{ mx->nd->load($fname) }; - ok(@data_list1 == @data_list2); - zip(sub { - my ($x, $y) = @_; - ok(same($x->aspdl, $y->aspdl)); - }, \@data_list1, \@data_list2); - - my %data_map1; - enumerate(sub { - my ($i, $x) = @_; - $data_map1{"ndarray xx $i"} = $x; - }, \@data_list1); - mx->nd->save($fname, \%data_map1); - my %data_map2 = %{ mx->nd->load($fname) }; - ok(keys(%data_map1) == keys(%data_map2)); - while(my ($k, $x) = each %data_map1) - { - my $y = $data_map2{$k}; - ok(same($x->aspdl, $y->aspdl)); - } - } - unlink $fname; -} - -test_sparse_nd_save_load(); - -sub test_create_csr -{ - my $check_create_csr_from_nd = sub { my ($shape, $density, $dtype) = @_; - my $matrix = rand_ndarray($shape, 'csr', $density); - # create data array with provided dtype and ctx - my $data = mx->nd->array($matrix->data->aspdl, dtype=>$dtype); - my $indptr = $matrix->indptr; - my $indices = $matrix->indices; - my $csr_created = mx->nd->sparse->csr_matrix([$data, $indices, $indptr], shape=>$shape); - ok($csr_created->stype eq 'csr'); - ok(same($csr_created->data->aspdl, $data->aspdl)); - ok(same($csr_created->indptr->aspdl, $indptr->aspdl)); - ok(same($csr_created->indices->aspdl, $indices->aspdl)); - # verify csr matrix dtype and ctx is consistent from the ones provided - ok($csr_created->dtype eq $dtype); - ok($csr_created->data->dtype eq $dtype); - ok($csr_created->context eq 
AI::MXNet::Context->current_ctx); - my $csr_copy = mx->nd->array($csr_created); - ok(same($csr_copy->aspdl, $csr_created->aspdl)); - }; - - my $check_create_csr_from_coo = sub { my ($shape, $density, $dtype) = @_; - my $matrix = rand_ndarray($shape, 'csr', $density); - my $sp_csr = $matrix->aspdlccs; - my $sp_coo = $sp_csr->tocoo(); - my $csr_created = mx->nd->sparse->csr_matrix([$sp_coo->data, [$sp_coo->row, $sp_coo->col]], shape=>$shape, dtype=>$dtype); - ok($csr_created->stype eq 'csr'); - ok(same($csr_created->data->aspdl, $sp_csr->data)); - ok(same($csr_created->indptr->aspdl, $sp_csr->indptr)); - ok(same($csr_created->indices->aspdl, $sp_csr->indices)); - my $csr_copy = mx->nd->array($csr_created); - ok(same($csr_copy->aspdl, $csr_created->aspdl)); - # verify csr matrix dtype and ctx is consistent - ok($csr_created->dtype eq $dtype); - ok($csr_created->data->dtype eq $dtype); - ok($csr_created->context eq AI::MXNet::Context->current_ctx); - }; - - my $check_create_csr_from_pdlccs = sub { my ($shape, $density, $f) = @_; - my $assert_csr_almost_equal = sub { my ($nd, $sp) = @_; - ok(almost_equal($nd->data->aspdl, $sp->data)); - ok(almost_equal($nd->indptr->aspdl, $sp->indptr)); - ok(almost_equal($nd->indices->aspdl, $sp->indices)); - my $sp_csr = $nd->aspdlccs; - ok(almost_equal($sp_csr->data, $sp->data)); - ok(almost_equal($sp_csr->indptr, $sp->indptr)); - ok(almost_equal($sp_csr->indices, $sp->indices)); - ok($sp->dtype eq $sp_csr->dtype); - }; - - my $csr_sp = rand_sparse($shape->[0], $shape->[1], $density); - my $csr_nd = $f->($csr_sp); - ok(almost_equal($csr_nd->aspdl, $csr_sp->todense)); - # non-canonical csr which contains duplicates and unsorted indices - my $indptr = pdl([0, 2, 3, 7]); - my $indices = pdl([0, 2, 2, 0, 1, 2, 1]); - my $data = pdl([1, 2, 3, 4, 5, 6, 1]); - my $non_canonical_csr = mx->nd->sparse->csr_matrix([$data, $indices, $indptr], shape=>[3, 3], dtype=>$csr_nd->dtype); - my $canonical_csr_nd = $f->($non_canonical_csr, dtype=>$csr_nd->dtype); - my $canonical_csr_sp = $non_canonical_csr->copy(); - ok(almost_equal($canonical_csr_nd->aspdl, $canonical_csr_sp->aspdl)); - }; - - my $dim0 = 20; - my $dim1 = 20; - my @densities = (0.5); - my $dtype = 'float64'; - for my $density (@densities) - { - my $shape = [$dim0, $dim1]; - $check_create_csr_from_nd->($shape, $density, $dtype); - $check_create_csr_from_coo->($shape, $density, $dtype); - $check_create_csr_from_pdlccs->($shape, $density, sub { mx->nd->sparse->array(@_) }); - $check_create_csr_from_pdlccs->($shape, $density, sub { mx->nd->array(@_) }); - } -} - -test_create_csr(); - -sub test_create_row_sparse -{ - my $dim0 = 50; - my $dim1 = 50; - my @densities = (0, 0.5, 1); - for my $density (@densities) - { - my $shape = rand_shape_2d($dim0, $dim1); - my $matrix = rand_ndarray($shape, 'row_sparse', $density); - my $data = $matrix->data; - my $indices = $matrix->indices; - my $rsp_created = mx->nd->sparse->row_sparse_array([$data, $indices], shape=>$shape); - ok($rsp_created->stype eq 'row_sparse'); - ok(same($rsp_created->data->aspdl, $data->aspdl)); - ok(same($rsp_created->indices->aspdl, $indices->aspdl)); - my $rsp_copy = mx->nd->array($rsp_created); - ok(same($rsp_copy->aspdl, $rsp_created->aspdl)); - } -} - -test_create_row_sparse(); - -sub test_create_sparse_nd_infer_shape -{ - my $check_create_csr_infer_shape = sub { my ($shape, $density, $dtype) = @_; - eval { - my $matrix = rand_ndarray($shape, 'csr', $density); - my $data = $matrix->data; - my $indptr = $matrix->indptr; - my $indices = $matrix->indices; 
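-            # no shape is passed to csr_matrix below, so it is inferred from
-            # the aux arrays: the row count must equal @{ $indptr } - 1 and the
-            # column count must exceed every entry of $indices, which is
-            # exactly what the assertions that follow verify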
- my $nd = mx->nd->sparse->csr_matrix([$data, $indices, $indptr], dtype=>$dtype); - my ($num_rows, $num_cols) = @{ $nd->shape }; - ok($num_rows == @{ $indptr } - 1); - ok($indices->shape->[0] > 0); - ok(($num_cols <= $indices)->aspdl->sum == 0); - ok($nd->dtype eq $dtype); - }; - }; - my $check_create_rsp_infer_shape = sub { my ($shape, $density, $dtype) = @_; - eval { - my $array = rand_ndarray($shape, 'row_sparse', $density); - my $data = $array->data; - my $indices = $array->indices; - my $nd = mx->nd->sparse->row_sparse_array([$data, $indices], dtype=>$dtype); - my $inferred_shape = $nd->shape; - is_deeply([@{ $inferred_shape }[1..@{ $inferred_shape }-1]], [@{ $data->shape }[1..@{ $data->shape }-1]]); - ok($indices->ndim > 0); - ok($nd->dtype eq $dtype); - if($indices->shape->[0] > 0) - { - ok(($inferred_shape->[0] <= $indices)->aspdl->sum == 0); - } - }; - }; - - my $dtype = 'int32'; - my $shape = rand_shape_2d(); - my $shape_3d = rand_shape_3d(); - my @densities = (0, 0.5, 1); - for my $density (@densities) - { - $check_create_csr_infer_shape->($shape, $density, $dtype); - $check_create_rsp_infer_shape->($shape, $density, $dtype); - $check_create_rsp_infer_shape->($shape_3d, $density, $dtype); - } -} - -test_create_sparse_nd_infer_shape(); - -sub test_create_sparse_nd_from_dense -{ - my $check_create_from_dns = sub { my ($shape, $f, $dense_arr, $dtype, $default_dtype, $ctx) = @_; - my $arr = $f->($dense_arr, shape => $shape, dtype => $dtype, ctx => $ctx); - ok(same($arr->aspdl, pones(reverse @{ $shape }))); - ok($arr->dtype eq $dtype); - ok($arr->context eq $ctx); - # verify the default dtype inferred from dense arr - my $arr2 = $f->($dense_arr); - ok($arr2->dtype eq $default_dtype); - ok($arr2->context eq AI::MXNet::Context->current_ctx); - }; - my $shape = rand_shape_2d(); - my $dtype = 'int32'; - my $src_dtype = 'float64'; - my $ctx = mx->cpu(1); - my @dense_arrs = ( - mx->nd->ones($shape, dtype=>$src_dtype), - mx->nd->ones($shape, dtype=>$src_dtype)->aspdl - ); - for my $f (sub { mx->nd->sparse->csr_matrix(@_) }, sub { mx->nd->sparse->row_sparse_array(@_) }) - { - for my $dense_arr (@dense_arrs) - { - my $default_dtype = blessed($dense_arr) ? 
$dense_arr->dtype : 'float32'; - $check_create_from_dns->($shape, $f, $dense_arr, $dtype, $default_dtype, $ctx); - } - } -} - -test_create_sparse_nd_from_dense(); - -sub test_create_sparse_nd_from_sparse -{ - my $check_create_from_sp = sub { my ($shape, $f, $sp_arr, $dtype, $src_dtype, $ctx) = @_; - my $arr = $f->($sp_arr, shape => $shape, dtype=>$dtype, ctx=>$ctx); - ok(same($arr->aspdl, pones(reverse @{ $shape }))); - ok($arr->dtype eq $dtype); - ok($arr->context eq $ctx); - # verify the default dtype inferred from sparse arr - my $arr2 = $f->($sp_arr); - ok($arr2->dtype eq $src_dtype); - ok($arr2->context eq AI::MXNet::Context->current_ctx); - }; - - my $shape = rand_shape_2d(); - my $src_dtype = 'float64'; - my $dtype = 'int32'; - my $ctx = mx->cpu(1); - my $ones = mx->nd->ones($shape, dtype=>$src_dtype); - my @csr_arrs = ($ones->tostype('csr')); - my @rsp_arrs = ($ones->tostype('row_sparse')); - push @csr_arrs, mx->nd->ones($shape, dtype=>$src_dtype)->aspdl->tocsr; - my $f_csr = sub { mx->nd->sparse->csr_matrix(@_) }; - my $f_rsp = sub { mx->nd->sparse->row_sparse_array(@_) }; - for my $sp_arr (@csr_arrs) - { - $check_create_from_sp->($shape, $f_csr, $sp_arr, $dtype, $src_dtype, $ctx); - } - for my $sp_arr (@rsp_arrs) - { - $check_create_from_sp->($shape, $f_rsp, $sp_arr, $dtype, $src_dtype, $ctx); - } -} - -test_create_sparse_nd_from_sparse(); - -sub test_create_sparse_nd_empty -{ - my $check_empty = sub { my ($shape, $stype) = @_; - my $arr = mx->nd->sparse->empty($stype, $shape); - ok($arr->stype eq $stype); - ok(same($arr->aspdl, pzeros(reverse(@{ $shape })))); - }; - - my $check_csr_empty = sub { my ($shape, $dtype, $ctx) = @_; - my $arr = mx->nd->sparse->csr_matrix(undef, shape => $shape, dtype => $dtype, ctx => $ctx); - ok($arr->stype eq 'csr'); - ok($arr->dtype eq $dtype); - ok($arr->context eq $ctx); - ok(same($arr->aspdl, pzeros(reverse(@{ $shape })))); - # check the default value for dtype and ctx - $arr = mx->nd->sparse->csr_matrix(undef, shape => $shape); - ok($arr->dtype eq 'float32'); - ok($arr->context eq AI::MXNet::Context->current_ctx); - }; - - my $check_rsp_empty = sub { my ($shape, $dtype, $ctx) = @_; - my $arr = mx->nd->sparse->row_sparse_array(undef, shape => $shape, dtype=>$dtype, ctx=>$ctx); - ok($arr->stype eq 'row_sparse'); - ok($arr->dtype eq $dtype); - ok($arr->context eq $ctx); - ok(same($arr->aspdl, pzeros(reverse(@{ $shape })))); - # check the default value for dtype and ctx - $arr = mx->nd->sparse->row_sparse_array(undef, shape => $shape); - ok($arr->dtype eq 'float32'); - ok($arr->context eq AI::MXNet::Context->current_ctx); - }; - - my @stypes = ('csr', 'row_sparse'); - my $shape = rand_shape_2d(); - my $shape_3d = rand_shape_3d(); - my $dtype = 'int32'; - my $ctx = mx->cpu(1); - for my $stype (@stypes) - { - $check_empty->($shape, $stype); - } - $check_csr_empty->($shape, $dtype, $ctx); - $check_rsp_empty->($shape, $dtype, $ctx); - $check_rsp_empty->($shape_3d, $dtype, $ctx); -} - -test_create_sparse_nd_empty(); - -sub test_synthetic_dataset_generator -{ - my $test_powerlaw_generator = sub { my ($csr_arr, $final_row) = @_; - my $indices = $csr_arr->indices->aspdl; - my $indptr = $csr_arr->indptr->aspdl; - for my $row (1..$final_row) - { - my $nextrow = $row + 1; - my $current_row_nnz = $indices->at($indptr->at($row) - 1) + 1; - my $next_row_nnz = $indices->at($indptr->at($nextrow) - 1) + 1; - ok($next_row_nnz == 2 * $current_row_nnz); - } - }; - - # Test if density is preserved - my ($csr_arr_cols) = rand_sparse_ndarray([32, 10000], "csr", - 
density=>0.01, distribution=>"powerlaw"); - - my ($csr_arr_small) = rand_sparse_ndarray([5, 5], "csr", - density=>0.5, distribution=>"powerlaw"); - - my ($csr_arr_big) = rand_sparse_ndarray([32, 1000000], "csr", - density=>0.4, distribution=>"powerlaw"); - - my ($csr_arr_square) = rand_sparse_ndarray([1600, 1600], "csr", - density=>0.5, distribution=>"powerlaw"); - ok($csr_arr_cols->data->len == 3200); - $test_powerlaw_generator->($csr_arr_cols, 9); - $test_powerlaw_generator->($csr_arr_small, 1); - $test_powerlaw_generator->($csr_arr_big, 4); - $test_powerlaw_generator->($csr_arr_square, 6); -} - -test_synthetic_dataset_generator(); - -sub test_sparse_nd_fluent -{ - my $check_fluent_regular = sub { my ($stype, $func, $kwargs, $shape, $equal_nan) = @_; - $shape //= [5, 17]; - my $data = mx->nd->random->uniform(shape=>$shape)->tostype($stype); - my $regular = AI::MXNet::NDArray::Base->$func($data, %$kwargs); - my $fluent = $data->$func(%$kwargs); - ok(almost_equal($regular->aspdl, $fluent->aspdl)); - }; - - my @common_func = ('zeros_like', 'square'); - my @rsp_func = ('round', 'rint', 'fix', 'floor', 'ceil', 'trunc', - 'abs', 'sign', 'sin', 'degrees', 'radians', 'expm1'); - for my $func (@common_func) - { - $check_fluent_regular->('csr', $func, {}); - } - for my $func (@common_func, @rsp_func) - { - $check_fluent_regular->('row_sparse', $func, {}); - } - - @rsp_func = ('arcsin', 'arctan', 'tan', 'sinh', 'tanh', - 'arcsinh', 'arctanh', 'log1p', 'sqrt', 'relu'); - for my $func (@rsp_func) - { - $check_fluent_regular->('row_sparse', $func, {}); - } - - $check_fluent_regular->('csr', 'slice', {begin => [2, 5], end => [4, 7]}); - $check_fluent_regular->('row_sparse', 'clip', {a_min => -0.25, a_max => 0.75}); - - for my $func ('sum', 'mean') - { - $check_fluent_regular->('csr', $func, {axis => 0}); - } -} - -test_sparse_nd_fluent(); - -sub test_sparse_nd_exception -{ - my $a = mx->nd->ones([2,2]); - dies_ok(sub { mx->nd->sparse->retain($a, invalid_arg=>"garbage_value") }); - dies_ok(sub { mx->nd->sparse->csr_matrix($a, shape=>[3,2]) }); - dies_ok(sub { mx->nd->sparse->csr_matrix(pdl([2,2]), shape=>[3,2]) }); - dies_ok(sub { mx->nd->sparse->row_sparse_array(pdl([2,2]), shape=>[3,2]) }); - dies_ok(sub { mx->nd->sparse->zeros("invalid_stype", [2,2]) }); -} - -test_sparse_nd_exception(); - -sub test_sparse_nd_check_format -{ - my $shape = rand_shape_2d(); - my @stypes = ("csr", "row_sparse"); - for my $stype (@stypes) - { - my ($arr) = rand_sparse_ndarray($shape, $stype); - $arr->check_format(); - $arr = mx->nd->sparse->zeros($stype, $shape); - $arr->check_format(); - } - # CSR format index pointer array should be less than the number of rows - $shape = [3, 4]; - my $data_list = [7, 8, 9]; - my $indices_list = [0, 2, 1]; - my $indptr_list = [0, 5, 2, 3]; - my $a = mx->nd->sparse->csr_matrix([$data_list, $indices_list, $indptr_list], shape=>$shape); - dies_ok(sub { $a->check_format }); - # CSR format indices should be in ascending order per row - $indices_list = [2, 1, 1]; - $indptr_list = [0, 2, 2, 3]; - $a = mx->nd->sparse->csr_matrix([$data_list, $indices_list, $indptr_list], shape=>$shape); - dies_ok(sub { $a->check_format }); - # CSR format indptr should end with value equal with size of indices - $indices_list = [1, 2, 1]; - $indptr_list = [0, 2, 2, 4]; - $a = mx->nd->sparse->csr_matrix([$data_list, $indices_list, $indptr_list], shape=>$shape); - dies_ok(sub { $a->check_format }); - # CSR format indices should not be negative - $indices_list = [0, 2, 1]; - $indptr_list = [0, -2, 2, 3]; - $a = 
mx->nd->sparse->csr_matrix([$data_list, $indices_list, $indptr_list], shape=>$shape); - dies_ok(sub { $a->check_format }); - # Row Sparse format indices should be less than the number of rows - $shape = [3, 2]; - $data_list = [[1, 2], [3, 4]]; - $indices_list = [1, 4]; - $a = mx->nd->sparse->row_sparse_array([$data_list, $indices_list], shape=>$shape); - dies_ok(sub { $a->check_format }); - # Row Sparse format indices should be in ascending order - $indices_list = [1, 0]; - $a = mx->nd->sparse->row_sparse_array([$data_list, $indices_list], shape=>$shape); - dies_ok(sub { $a->check_format }); - # Row Sparse format indices should not be negative - $indices_list = [1, -2]; - $a = mx->nd->sparse->row_sparse_array([$data_list, $indices_list], shape=>$shape); - dies_ok(sub { $a->check_format }); -} - -test_sparse_nd_check_format(); diff --git a/perl-package/AI-MXNet/t/test_symbol.t b/perl-package/AI-MXNet/t/test_symbol.t deleted file mode 100644 index 713480df053c..000000000000 --- a/perl-package/AI-MXNet/t/test_symbol.t +++ /dev/null @@ -1,423 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
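The CSR format checks that close the sparse-NDArray tests above are easiest to
see on one concrete layout. A minimal sketch (values illustrative only): for a
3x4 matrix, indptr has rows+1 entries, starts at 0, ends at nnz, and never
decreases; indices stay >= 0, < 4, and ascend within each row.

    use AI::MXNet qw(mx);
    my $well_formed = mx->nd->sparse->csr_matrix(
        [[7, 8, 9],     # data
         [0, 2, 1],     # indices: one entry per row here
         [0, 1, 2, 3]], # indptr
        shape => [3, 4]
    );
    $well_formed->check_format();   # passes, unlike the malformed variants above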
- -use strict; -use warnings; -use Test::More tests => 31; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(mlp2 check_consistency zip assert enumerate almost_equal same); -use Storable qw(freeze thaw); -use PDL; - -sub test_symbol_compose -{ - my $data = mx->symbol->Variable('data'); - my $net1 = mx->symbol->FullyConnected(data=>$data, name=>'fc1', num_hidden=>10); - $net1 = mx->symbol->FullyConnected(data=>$net1, name=>'fc2', num_hidden=>100); - is_deeply($net1->list_arguments(), ['data', - 'fc1_weight', 'fc1_bias', - 'fc2_weight', 'fc2_bias']); - - my $net2 = mx->symbol->FullyConnected(name=>'fc3', num_hidden=>10); - $net2 = mx->symbol->Activation(data=>$net2, act_type=>'relu'); - $net2 = mx->symbol->FullyConnected(data=>$net2, name=>'fc4', num_hidden=>20); - my $composed = &{$net2}(fc3_data=>$net1, name=>'composed'); - my $multi_out = mx->symbol->Group([$composed, $net1]); - ok(@{ $multi_out->list_outputs() } == 2); -} - -test_symbol_compose(); - -sub test_symbol_copy -{ - my $data = mx->symbol->Variable('data'); - my $data_2 = $data->deepcopy; - is($data->tojson, $data_2->tojson); -} - -test_symbol_copy(); - -sub test_symbol_internal -{ - my $data = mx->symbol->Variable('data'); - my $oldfc = mx->symbol->FullyConnected(data=>$data, name=>'fc1', num_hidden=>10); - my $net1 = mx->symbol->FullyConnected(data=>$oldfc, name=>'fc2', num_hidden=>100); - is_deeply($net1->list_arguments, ['data', 'fc1_weight', 'fc1_bias', 'fc2_weight', 'fc2_bias']); - - my $internal = $net1->get_internals(); - my $fc1 = $internal->slice('fc1_output'); - is_deeply($fc1->list_arguments, $oldfc->list_arguments); -} - -test_symbol_internal(); - -sub test_symbol_children -{ - my $data = mx->symbol->Variable('data'); - my $oldfc = mx->symbol->FullyConnected(data=>$data, name=>'fc1', num_hidden=>10); - my $net1 = mx->symbol->FullyConnected(data=>$oldfc, name=>'fc2', num_hidden=>100); - - is_deeply($net1->get_children()->list_outputs(), ['fc1_output', 'fc2_weight', 'fc2_bias']); - is_deeply($net1->get_children()->get_children()->list_outputs() , ['data', 'fc1_weight', 'fc1_bias']); - is_deeply($net1->get_children()->slice('fc2_weight')->list_arguments(), ['fc2_weight']); - ok(not defined $net1->get_children()->slice('fc2_weight')->get_children()); - - $data = mx->sym->Variable('data'); - my $sliced = mx->sym->SliceChannel($data, num_outputs=>3, name=>'slice'); - my $concat = mx->sym->Concat(@{ $sliced }); - - is_deeply($concat->get_children()->list_outputs(), - ['slice_output0', 'slice_output1', 'slice_output2']); - is_deeply($sliced->get_children()->list_outputs(), ['data']); -} - -test_symbol_children(); - -sub test_symbol_storable -{ - my $mlist = [mlp2()]; - my $data = freeze($mlist); - my $mlist2 = thaw($data); - zip(sub { - my ($x, $y) = @_; - is($x->tojson, $y->tojson); - }, $mlist, $mlist2); -} - -test_symbol_storable(); - -sub test_symbol_saveload -{ - my $sym = mlp2(); - my $fname = 'tmp_sym.json'; - $sym->save($fname); - my $data2 = mx->symbol->load($fname); - # save because of order - is($sym->tojson, $data2->tojson); - unlink $fname; -} - -test_symbol_saveload(); - - -sub test_symbol_infer_shape -{ - my $num_hidden = 128; - my $num_dim = 64; - my $num_sample = 10; - - my $data = mx->symbol->Variable('data'); - my $prev = mx->symbol->Variable('prevstate'); - my $x2h = mx->symbol->FullyConnected(data=>$data, name=>'x2h', num_hidden=>$num_hidden); - my $h2h = mx->symbol->FullyConnected(data=>$prev, name=>'h2h', num_hidden=>$num_hidden); - - my $out = 
mx->symbol->Activation(data=>mx->sym->elemwise_add($x2h, $h2h), name=>'out', act_type=>'relu'); - - # shape inference will fail because information is not available for h2h - my @ret = $out->infer_shape(data=>[$num_sample, $num_dim]); - is_deeply(\@ret, [undef, undef, undef]); - - my ($arg_shapes, $out_shapes, $aux_shapes) = $out->infer_shape_partial(data=>[$num_sample, $num_dim]); - my %arg_shapes; - @arg_shapes{ @{ $out->list_arguments } } = @{ $arg_shapes }; - is_deeply($arg_shapes{data}, [$num_sample, $num_dim]); - is_deeply($arg_shapes{x2h_weight}, [$num_hidden, $num_dim]); - is_deeply($arg_shapes{h2h_weight}, []); - - # now we can do full shape inference - my $state_shape = $out_shapes->[0]; - ($arg_shapes, $out_shapes, $aux_shapes) = $out->infer_shape(data=>[$num_sample, $num_dim], prevstate=>$state_shape); - @arg_shapes{ @{ $out->list_arguments } } = @{ $arg_shapes }; - is_deeply($arg_shapes{data}, [$num_sample, $num_dim]); - is_deeply($arg_shapes{x2h_weight}, [$num_hidden, $num_dim]); - is_deeply($arg_shapes{h2h_weight}, [$num_hidden, $num_hidden]); -} - -test_symbol_infer_shape(); - -sub test_symbol_infer_shape_var -{ - #Test specifying shape information when constructing a variable - my $shape = [2, 3]; - my $a = mx->symbol->Variable('a', shape=>$shape); - my $b = mx->symbol->Variable('b'); - my $c = mx->symbol->elemwise_add($a, $b); - my ($arg_shapes, $out_shapes, $aux_shapes) = $c->infer_shape(); - is_deeply($arg_shapes->[0], $shape); - is_deeply($arg_shapes->[1], $shape); - is_deeply($out_shapes->[0], $shape); - - $shape = [5, 6]; - ($arg_shapes, $out_shapes, $aux_shapes) = $c->infer_shape(a=>$shape); - is_deeply($arg_shapes->[0], $shape); - is_deeply($arg_shapes->[1], $shape); - is_deeply($out_shapes->[0], $shape); -} - -test_symbol_infer_shape_var(); - -sub check_symbol_consistency -{ - my ($sym1, $sym2, $ctx) = @_; - is_deeply($sym1->list_arguments(), $sym2->list_arguments()); - is_deeply($sym1->list_auxiliary_states(), $sym2->list_auxiliary_states()); - is_deeply($sym1->list_outputs(), $sym2->list_outputs()); - check_consistency(sym => [$sym1, $sym2], ctx_list => [$ctx, $ctx]); -} - - -sub test_linalg_gemm2 -{ - # Single matrix multiply - my $sym_gemm2 = mx->sym->linalg->gemm2( - mx->sym->var('A'), - mx->sym->var('B'), - transpose_b => 1, - alpha => 2.0 - ); - my $A = mx->nd->array([[1.0, 1.0], [1.0, 1.0]]); - my $B = mx->nd->array([[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]); - ok(almost_equal( - $sym_gemm2->eval(args => { A => $A, B => $B })->[0]->aspdl, - pdl([[4.0, 4.0, 4.0], [4.0, 4.0, 4.0]]) - )); - - # Batch matrix multiply - $A = mx->nd->array([[[1.0, 1.0]], [[0.1, 0.1]]]); - $B = mx->nd->array([[[1.0, 1.0]], [[0.1, 0.1]]]); - ok(almost_equal( - $sym_gemm2->eval(args => { A => $A, B => $B })->[0]->aspdl, - pdl([[[4.0]], [[0.04]]]) - )); -} - -test_linalg_gemm2(); - -sub test_image_to_tensor -{ - my $sym_to_tensor = mx->sym->image->to_tensor( - mx->sym->var('A') - ); - my $A = mx->nd->zeros([28, 28, 3]); - ok(same( - $sym_to_tensor->eval(args => { A => $A })->[0]->aspdl, - zeros(28, 28, 3) - )); -} - -test_image_to_tensor(); - -sub test_histogram -{ - my $z = mx->nd->array([0..99]); - my $b = mx->nd->array([10, 20, 30, 60]); - my ($hist, $bins) = @{ mx->sym->histogram(mx->sym->var("z"), bins => mx->sym->var("bins"))->eval(args => { z => $z, bins => $b }) }; - ok(same($hist->aspdl, pdl([10, 10, 31]))); - ok(same($bins->aspdl, pdl([10, 20, 30, 60]))); -} - -test_histogram(); - -__DATA__ -{ - "nodes": [ - { - "op": "null", - "param": {}, - "name": "data", - "inputs": [], - 
"backward_source_id": -1, - "attr": { - "ctx_group": "stage1", - "lr_mult": "0.2" - } - }, - { - "op": "null", - "param": {}, - "name": "fc1_weight", - "inputs": [], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage1", - "wd_mult": "0.3", - "weight_lr_mult": "1.2" - } - }, - { - "op": "null", - "param": {}, - "name": "fc1_bias", - "inputs": [], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage1", - "wd_mult": "0.3", - "weight_lr_mult": "1.2" - } - }, - { - "op": "FullyConnected", - "param": { - "no_bias": "False", - "num_hidden": "128" - }, - "name": "fc1", - "inputs": [[0, 0], [1, 0], [2, 0]], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage1", - "wd_mult": "0.3", - "weight_lr_mult": "1.2" - } - }, - { - "op": "Activation", - "param": {"act_type": "relu"}, - "name": "relu1", - "inputs": [[3, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage1"} - }, - { - "op": "null", - "param": {}, - "name": "fc2_weight", - "inputs": [], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage2", - "lr_mult": "0.01" - } - }, - { - "op": "null", - "param": {}, - "name": "fc2_bias", - "inputs": [], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage2", - "lr_mult": "0.01" - } - }, - { - "op": "FullyConnected", - "param": { - "no_bias": "False", - "num_hidden": "64" - }, - "name": "fc2", - "inputs": [[4, 0], [5, 0], [6, 0]], - "backward_source_id": -1, - "attr": { - "ctx_group": "stage2", - "lr_mult": "0.01" - } - }, - { - "op": "Activation", - "param": {"act_type": "relu"}, - "name": "relu2", - "inputs": [[7, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "fc3_weight", - "inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "fc3_bias", - "inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "FullyConnected", - "param": { - "no_bias": "False", - "num_hidden": "10" - }, - "name": "fc3", - "inputs": [[8, 0], [9, 0], [10, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "batchnorm0_gamma", - "inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "batchnorm0_beta", - "inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "BatchNorm", - "param": { - "eps": "0.001", - "fix_gamma": "True", - "momentum": "0.9", - "use_global_stats": "False" - }, - "name": "batchnorm0", - "inputs": [[11, 0], [12, 0], [13, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "null", - "param": {}, - "name": "softmax_label", - "inputs": [], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - }, - { - "op": "SoftmaxOutput", - "param": { - "grad_scale": "1", - "ignore_label": "-1", - "multi_output": "False", - "normalization": "null", - "out_grad": "False", - "preserve_shape": "False", - "use_ignore": "False" - }, - "name": "softmax", - "inputs": [[14, 0], [15, 0]], - "backward_source_id": -1, - "attr": {"ctx_group": "stage2"} - } - ], - "arg_nodes": [0, 1, 2, 5, 6, 9, 10, 12, 13, 15], - "heads": [[16, 0]] -} diff --git a/perl-package/AI-MXNet/t/test_viz.t b/perl-package/AI-MXNet/t/test_viz.t deleted file mode 100644 index 2cbfd1ff7091..000000000000 --- a/perl-package/AI-MXNet/t/test_viz.t +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under 
one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use AI::MXNet qw(mx); -use Test::More tests => 1; - -sub test_print_summary -{ - my $data = mx->sym->Variable('data'); - my $bias = mx->sym->Variable('fc1_bias', lr_mult => 1.0); - my $conv1= mx->sym->Convolution(data => $data, name => 'conv1', num_filter => 32, kernel => [3,3], stride => [2,2]); - my $bn1 = mx->sym->BatchNorm(data => $conv1, name => "bn1"); - my $act1 = mx->sym->Activation(data => $bn1, name => 'relu1', act_type => "relu"); - my $mp1 = mx->sym->Pooling(data => $act1, name => 'mp1', kernel => [2,2], stride => [2,2], pool_type => 'max'); - my $fc1 = mx->sym->FullyConnected(data => $mp1, bias => $bias, name => 'fc1', num_hidden => 10, lr_mult => 0); - my $fc2 = mx->sym->FullyConnected(data => $fc1, name => 'fc2', num_hidden => 10, wd_mult => 0.5); - mx->viz->print_summary($fc2); - my $shape = { data => [1,3,28,28] }; - mx->viz->print_summary($fc2, $shape); -} - -test_print_summary(); -ok(1); diff --git a/perl-package/AI-MXNetCAPI/Changes b/perl-package/AI-MXNetCAPI/Changes deleted file mode 100644 index 9028f0fc0ec1..000000000000 --- a/perl-package/AI-MXNetCAPI/Changes +++ /dev/null @@ -1,55 +0,0 @@ -Revision history for Perl extension AI::MXNetCAPI -1.5 Sun Feb 16 19:56:17 PST 2020 - - Runtime features - - INT64 Tensor support - -1.4 Mon Feb 18 11:54:07 PST 2019 - - Support for 64bit integers - -1.33 Thu Oct 4 13:25:56 PDT 2018 - - Gluon: Better sparse support for KVStore. - - Gpu memory info via mxnet api call. - -1.32 Sun Aug 5 14:25:31 PDT 2018 - - Bugfixes. - -1.3 Tue Jun 26 20:57:40 PDT 2018 - - Major update, Gluon interface updated to parity with Python's API - -1.2 Sun Mar 4 16:29:19 PST 2018 - - Support for sparse tensors - -1.1 Sun Oct 1 10:19:08 PDT 2017 - - support for perl 5.14, Gluon, cuda kernels - -1.0102 Sun Aug 6 16:55:08 PDT 2017 - - updated autograd calls. - -1.0101 Sun Jul 2 17:16:01 PDT 2017 - - refactored CachedOp, using strings to index the kvstore. - -1.01 Sat Jun 10 23:57:27 PDT 2017 - - sync with python. - -0.9507 Thu May 11 17:04:44 PDT 2017 - - Added Autograd. - -0.9506 Sat Apr 29 20:26:50 PDT 2017 - - removed compiled swig file. - -0.9504 Wed Apr 19 19:07:02 PDT 2017 - - callbacks bugfix. - -0.95 Sun Mar 26 17:42:02 PDT 2017 - - visible on https://mxnet.io - -0.03 Sat Feb 25 13:21:07 PST 2017 - - sync up with the Python interface. - -0.02 Tue Feb 14 07:08:37 PST 2017 - - prepared for inclusion to the mxnet code repository. 
- - -0.01 Fri Jan 6 19:40:53 2017 - - original version - diff --git a/perl-package/AI-MXNetCAPI/MANIFEST b/perl-package/AI-MXNetCAPI/MANIFEST deleted file mode 100644 index cea95def1d70..000000000000 --- a/perl-package/AI-MXNetCAPI/MANIFEST +++ /dev/null @@ -1,10 +0,0 @@ -Changes -Makefile.PL -MANIFEST -README -META.json -META.yml -t/AI-MXNetCAPI.t -lib/AI/MXNetCAPI.pm -mxnet.i -mxnet_typemaps.i diff --git a/perl-package/AI-MXNetCAPI/META.json b/perl-package/AI-MXNetCAPI/META.json deleted file mode 100644 index 4c9520336d7c..000000000000 --- a/perl-package/AI-MXNetCAPI/META.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "abstract" : "Swig interface to mxnet c api", - "author" : [ - "Sergey Kolychev " - ], - "dynamic_config" : 0, - "generated_by" : "ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240", - "license" : [ - "apache_2_0" - ], - "meta-spec" : { - "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", - "version" : "2" - }, - "name" : "AI-MXNetCAPI", - "no_index" : { - "directory" : [ - "t", - "inc" - ] - }, - "prereqs" : { - "build" : { - "requires" : { - "ExtUtils::MakeMaker" : "0" - } - }, - "configure" : { - "requires" : { - "ExtUtils::MakeMaker" : "0" - } - }, - "runtime" : { - "requires" : { - "Test::More" : "0" - } - } - }, - "release_status" : "stable", - "version" : "1.5" -} diff --git a/perl-package/AI-MXNetCAPI/META.yml b/perl-package/AI-MXNetCAPI/META.yml deleted file mode 100644 index 61e7a17385ce..000000000000 --- a/perl-package/AI-MXNetCAPI/META.yml +++ /dev/null @@ -1,39 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - ---- -abstract: 'Swig interface to mxnet c api' -author: - - 'Sergey Kolychev ' -build_requires: - ExtUtils::MakeMaker: '0' -configure_requires: - ExtUtils::MakeMaker: '0' -dynamic_config: 0 -generated_by: 'ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240' -license: apache -meta-spec: - url: http://module-build.sourceforge.net/META-spec-v1.4.html - version: '1.4' -name: AI-MXNetCAPI -no_index: - directory: - - t - - inc -requires: - Test::More: '0' -version: '1.5' diff --git a/perl-package/AI-MXNetCAPI/Makefile.PL b/perl-package/AI-MXNetCAPI/Makefile.PL deleted file mode 100644 index 8018cdfdddf9..000000000000 --- a/perl-package/AI-MXNetCAPI/Makefile.PL +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use ExtUtils::MakeMaker; -use DynaLoader; -`swig -noproxy -c++ -perl mxnet.i`; -unlink "MXNetCAPI.pm"; -my @tmp = split(/ /, $ExtUtils::MakeMaker::Config{lddlflags}); -my @lddlflags; -while(my $flag = shift(@tmp)) -{ - if($flag eq '-arch') - { - my $arch = shift(@tmp); - if($arch eq 'i386') - { - next; - } - else - { - push @lddlflags, ($flag, $arch); - } - } - else - { - push @lddlflags, $flag; - } -} -WriteMakefile( - NAME => 'AI::MXNetCAPI', - LICENSE => 'apache_2_0', - AUTHOR => 'Sergey Kolychev ', - VERSION_FROM => 'lib/AI/MXNetCAPI.pm', - ABSTRACT_FROM => 'lib/AI/MXNetCAPI.pm', - LIBS => ['-L../../lib -lmxnet'], - INC => '-I../../include/mxnet', - OBJECT => 'mxnet_wrap.o', - LDDLFLAGS => join(' ', @lddlflags), - PREREQ_PM => { - # prereqs - # build/test prereqs - 'Test::More' => 0, - }, - PL_FILES => {}, -); diff --git a/perl-package/AI-MXNetCAPI/README b/perl-package/AI-MXNetCAPI/README deleted file mode 100644 index a19771c914f7..000000000000 --- a/perl-package/AI-MXNetCAPI/README +++ /dev/null @@ -1,23 +0,0 @@ -AI-MXNetCAPI version 1.5 -===================== - -Swig interface to MXNet c api. - -INSTALLATION - -To install this module type the following: - - perl Makefile.PL - make - make test - make install - -DEPENDENCIES - -This module requires mxnet https://mxnet.io -It's used by AI::MXNet - -COPYRIGHT AND LICENCE - -This library is licensed under Apache 2.0 license https://www.apache.org/licenses/LICENSE-2.0 - diff --git a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm b/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm deleted file mode 100644 index c75ee336acf1..000000000000 --- a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::MXNetCAPI; -use strict; -use base qw(DynaLoader); -bootstrap AI::MXNetCAPI; -our $VERSION = '1.5'; -1; -__END__ - -=head1 NAME - -AI::MXNetCAPI - Swig interface to mxnet c api - -=head1 SYNOPSIS - - use AI::MXNetCAPI; - -=head1 DESCRIPTION - -This module provides interface to mxnet -via its api. - -=head1 SEE ALSO - -L - -=head1 AUTHOR - -Sergey Kolychev, - -=head1 COPYRIGHT & LICENSE - -This library is licensed under Apache 2.0 license. - -See https://www.apache.org/licenses/LICENSE-2.0 for more information. 
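For reference, the calling convention produced by the bindings looks roughly
like this — a sketch only, assuming the OUTPUT typemaps in mxnet_typemaps.i
return the C status code first, followed by any output parameters:

    use AI::MXNetCAPI;
    # MX-prefixed C functions are exposed with the prefix stripped, so
    # MXGetVersion() is reached as AI::MXNetCAPI::GetVersion()
    my ($err, $version) = AI::MXNetCAPI::GetVersion();
    die AI::MXNetCAPI::GetLastError() if $err;  # calls return 0 on success, -1 on error
    print "libmxnet version $version\n";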
- -=cut diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i deleted file mode 100644 index 9602b08d675e..000000000000 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ /dev/null @@ -1,2486 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -%module "AI::MXNetCAPI" -%rename("%(strip:[MX])s") ""; -%include typemaps.i -%include mxnet_typemaps.i -%inline %{ -#include - -// Taken as is from http://cpansearch.perl.org/src/COLEMINOR/Games-EternalLands-Binary-Float16-0.01/Float16.xs -/* This method is faster than the OpenEXR implementation (very often - * used, eg. in Ogre), with the additional benefit of rounding, inspired - * by James Tursa's half-precision code. */ -static inline uint16_t _float_to_half(uint32_t x) { - uint16_t bits = (x >> 16) & 0x8000; - uint16_t m = (x >> 12) & 0x07ff; - unsigned int e = (x >> 23) & 0xff; - if (e < 103) - return bits; - if (e > 142) { - bits |= 0x7c00u; - bits |= e == 255 && (x & 0x007fffffu); - return bits; - } - if (e < 113) { - m |= 0x0800u; - bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); - return bits; - } - bits |= ((e - 112) << 10) | (m >> 1); - bits += m & 1; - return bits; -} - -static int const shifttable[32] = { - 23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0, - 15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0, -}; -static uint32_t const shiftmagic = 0x07c4acddu; - -/* This algorithm is similar to the OpenEXR implementation, except it - * uses branchless code in the denormal path. This is slower than a - * table version, but will be more friendly to the cache for occasional - * uses. 
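- * A quick round-trip sanity check of the pair: _half_to_float(0x3c00)
- * returns 0x3f800000 (1.0f), and _float_to_half(0x3f800000) maps it
- * back to 0x3c00.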
*/ -static inline uint32_t _half_to_float(uint16_t x) { - uint32_t s = (x & 0x8000u) << 16; - if ((x & 0x7fffu) == 0) - return (uint32_t)x << 16; - uint32_t e = x & 0x7c00u; - uint32_t m = x & 0x03ffu; - if (e == 0) { - uint32_t v = m | (m >> 1); - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - e = shifttable[(v * shiftmagic) >> 27]; - return s | (((125 - e) << 23) + (m << e)); - } - if (e == 0x7c00u) { - if (m == 0) - return s | 0x7f800000u; - return s | 0x7fc00000u; - } - return s | (((e >> 10) + 112) << 23) | (m << 13); -} - -union fbits { - float f; - uint32_t x; -}; - -static void KVStore_callback(int index, NDArrayHandle recv, NDArrayHandle local, void* callback) -{ - { - dSP; - PUSHMARK(SP); - XPUSHs(sv_2mortal(newSViv(index))); - XPUSHs(SWIG_NewPointerObj(SWIG_as_voidptr(recv), SWIGTYPE_p_MXNDArray, 0)); - XPUSHs(SWIG_NewPointerObj(SWIG_as_voidptr(local), SWIGTYPE_p_MXNDArray, 0)); - PUTBACK; - call_sv((SV*)callback, G_DISCARD); - } -} - -static void KVStoreStr_callback(const char *key, NDArrayHandle recv, NDArrayHandle local, void* callback) -{ - { - dSP; - PUSHMARK(SP); - XPUSHs(sv_2mortal(newSVpv(key, 0))); - XPUSHs(SWIG_NewPointerObj(SWIG_as_voidptr(recv), SWIGTYPE_p_MXNDArray, 0)); - XPUSHs(SWIG_NewPointerObj(SWIG_as_voidptr(local), SWIGTYPE_p_MXNDArray, 0)); - PUTBACK; - call_sv((SV*)callback, G_DISCARD); - } -} - -static void KVStoreServer_callback(int head, const char *body, void* callback) -{ - { - dSP; - PUSHMARK(SP); - XPUSHs(sv_2mortal(newSViv(head))); - XPUSHs(sv_2mortal(newSVpv(body, 0))); - PUTBACK; - call_sv((SV*)callback, G_DISCARD); - } -} - -static void ExecutorMonitor_callback(const char* name, NDArrayHandle handle, void* callback) -{ - { - dSP; - PUSHMARK(SP); - XPUSHs(sv_2mortal(newSVpv(name, 0))); - XPUSHs(SWIG_NewPointerObj(SWIG_as_voidptr(handle), SWIGTYPE_p_MXNDArray, 0)); - PUTBACK; - call_sv((SV*)callback, G_DISCARD); - } -} - -%} - -%{ - -/* this is an adaptation of Python/bltinmodule.c's builtin_zip() */ -XS(py_zip) { - dXSARGS; - I32 i; - I32 len = -1; - AV *l[items]; - - for(i = 0; i < items; i++) { - AV *av = (AV *)SvRV(ST(i)); - I32 thislen; - - if(SvTYPE(av) != SVt_PVAV) - croak("zip argument#%d must be an array", i); - thislen = av_len(av) + 1; - if(len < 0 || thislen < len) - len = thislen; - l[i] = av; - } - EXTEND(SP, len); - for(i = 0; i < len; i++) { - I32 j; - SV *next[items]; - - for(j = 0; j < items; j++) { - SV **sv = av_fetch(l[j], i, 0); - next[j] = sv ? *sv : &PL_sv_undef; - } - ST(i) = sv_2mortal(newRV_noinc((SV *)av_make(items, next))); - } - XSRETURN(len); -} - -%} - -%init %{ - newXS(SWIG_prefix "py_zip", py_zip, (char *)__FILE__); - /* These SWIG_TypeClientData() calls might break in the future, but - * %rename should work on these types before that happens. 
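- * The string client data mirrors the handle typedefs declared further
- * down (NDArrayHandle, SymbolHandle, ...), one name per opaque handle type.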
*/ - SWIG_TypeClientData(SWIGTYPE_p_MXNDArray, (void *)"NDArrayHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXFunction, (void *)"FunctionHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXAtomicSymbolCreator, (void *)"AtomicSymbolCreator"); - SWIG_TypeClientData(SWIGTYPE_p_MXSymbol, (void *)"SymbolHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXExecutor, (void *)"ExecutorHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXDataIterCreator, (void *)"DataIterCreator"); - SWIG_TypeClientData(SWIGTYPE_p_MXDataIter, (void *)"DataIterHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXKVStore, (void *)"KVStoreHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXRecordIO, (void *)"RecordIOHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXRtc, (void *)"RtcHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXCachedOp, (void *)"CachedOpHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXCudaModuleHandle, (void *)"CudaModuleHandle"); - SWIG_TypeClientData(SWIGTYPE_p_MXCudaKernelHandle, (void *)"CudaKernelHandle"); -%} - -/*! \brief manually define unsigned int */ -typedef uint32_t mx_uint; -/*! \brief manually define float */ -typedef float mx_float; -// all the handles are simply void * -// will be casted internally to specific pointers types -// these typedefs are mainly used for readablity reasons -/*! \brief handle to NDArray */ -typedef MXNDArray *NDArrayHandle; -/*! \brief handle to a mxnet ndarray function that changes NDArray */ -typedef MXFunction *FunctionHandle; -/*! \brief handle to a function that takes param and creates symbol */ -typedef MXAtomicSymbolCreator *AtomicSymbolCreator; -/*! \brief handle to a symbol that can be bind as operator */ -typedef MXSymbol *SymbolHandle; -/*! \brief handle to a AtomicSymbol */ -typedef MXAtomicSymbol *AtomicSymbolHandle; -/*! \brief handle to an Executor */ -typedef MXExecutor *ExecutorHandle; -/*! \brief handle a dataiter creator */ -typedef MXDataIterCreator *DataIterCreator; -/*! \brief handle to a DataIterator */ -typedef MXDataIter *DataIterHandle; -/*! \brief handle to KVStore */ -typedef MXKVStore *KVStoreHandle; -/*! \brief handle to RecordIO */ -typedef MXRecordIO *RecordIOHandle; -/*! \brief handle to MXRtc*/ -typedef MXRtc *RtcHandle; -/*! \brief handle to cached operator */ -typedef MXCachedOp *CachedOpHandle; -/*! \brief handle to rtc cuda module*/ -typedef MXCudaModuleHandle *CudaModuleHandle; -/*! \brief handle to rtc cuda kernel*/ -typedef MXCudaKernelHandle *CudaKernelHandle; - -typedef void (*ExecutorMonitorCallback)(const char*, - NDArrayHandle, - void *); -struct NativeOpInfo { - void (*forward)(int, float**, int*, unsigned**, int*, void*); - void (*backward)(int, float**, int*, unsigned**, int*, void*); - void (*infer_shape)(int, int*, unsigned**, void*); - void (*list_outputs)(char***, void*); - void (*list_arguments)(char***, void*); - // all functions also pass a payload void* pointer - void* p_forward; - void* p_backward; - void* p_infer_shape; - void* p_list_outputs; - void* p_list_arguments; -}; - -struct NDArrayOpInfo { - bool (*forward)(int, void**, int*, void*); - bool (*backward)(int, void**, int*, void*); - bool (*infer_shape)(int, int*, unsigned**, void*); - bool (*list_outputs)(char***, void*); - bool (*list_arguments)(char***, void*); - bool (*declare_backward_dependency)(const int*, const int*, const int*, - int*, int**, void*); - // all functions also pass a payload void* pointer - void* p_forward; - void* p_backward; - void* p_infer_shape; - void* p_list_outputs; - void* p_list_arguments; - void* p_declare_backward_dependency; -}; - -/*! 
- * \brief return str message of the last error - * all function in this file will return 0 when success - * and -1 when an error occured, - * MXGetLastError can be called to retrieve the error - * - * this function is threadsafe and can be called by different thread - * \return error info - */ -const char *MXGetLastError(); - -/*! - * \brief Get list of features supported on the runtime - * \param libFeature pointer to array of LibFeature - * \param size of the array - * \return 0 when success, -1 when failure happens. - */ -int MXLibInfoFeatures(const LibFeature **libFeature, size_t *size); - -//------------------------------------- -// Part 0: Global State setups -//------------------------------------- -/*! - * \brief Seed the global random number generators in mxnet. - * \param seed the random number seed. - * \return 0 when success, -1 when failure happens. - */ -int MXRandomSeed(int seed); -/*! - * \brief Seed the global random number generator of the given device. - * \param seed the random number seed. - * \return 0 when success, -1 when failure happens. - */ -int MXRandomSeedContext(int seed, int dev_type, int dev_id); -/*! - * \brief Notify the engine about a shutdown, - * This can help engine to print less messages into display. - * - * User do not have to call this function. - * \return 0 when success, -1 when failure happens. - */ -int MXNotifyShutdown(); -/*! - * \brief Set up configuration of profiler - * \param num_params Number of parameters - * \param keys array of parameter keys - * \param vals array of parameter values - * \return 0 when success, -1 when failure happens. - */ -int MXSetProfilerConfig(int num_params, const char* const* keys, const char* const* vals); - -/*! - * \brief Set up state of profiler - * \param state indicate the working state of profiler, - * profiler not running when state == 0, - * profiler running when state == 1 - * \return 0 when success, -1 when failure happens. - */ -int MXSetProfilerState(int state); - -/*! \brief Save profile and stop profiler */ -int MXDumpProfile(int finished); - -/*! \brief Set the number of OMP threads to use */ -int MXSetNumOMPThreads(int thread_num); - -/*! - * \brief get the MXNet library version as an integer - * \param pointer to the integer holding the version number - * \return 0 when success, -1 when failure happens - */ -int MXGetVersion(int *out); - -/*! - * \brief set bulk execution limit - * \param bulk_size new bulk_size - * \param prev_bulk_size previous bulk_size - */ -int MXEngineSetBulkSize(int bulk_size, int* out); - -/*! - * \brief Get the number of GPUs. - * \param pointer to int that will hold the number of GPUs available. - * \return 0 when success, -1 when failure happens. - */ -int MXGetGPUCount(int* out); - -/*! - * \brief get the free and total available memory on a GPU - * Note: deprecated, use MXGetGPUMemoryInformation64(). - * \param dev the GPU number to query - * \param free_mem pointer to the integer holding free GPU memory - * \param total_mem pointer to the integer holding total GPU memory - * \return 0 when success, -1 when failure happens - */ -int MXGetGPUMemoryInformation(int dev, int *out, int *out); - -/*! 
- * \brief get the free and total available memory on a GPU - * \param dev the GPU number to query - * \param free_mem pointer to the uint64_t holding free GPU memory - * \param total_mem pointer to the uint64_t holding total GPU memory - * \return 0 when success, -1 when failure happens - */ -int MXGetGPUMemoryInformation64(int dev, uint64_t *out, uint64_t *out); - - -//------------------------------------- -// Part 1: NDArray creation and deletion -//------------------------------------- -/*! - * \brief create a NDArray handle that is not initialized - * can be used to pass in as mutate variables - * to hold the result of NDArray - * \param out the returning handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayCreateNone(NDArrayHandle *out); -/*! - * \brief create a NDArray with specified shape - * \param shape the pointer to the shape - * \param ndim the dimension of the shape - * \param dev_type device type, specify device we want to take - * \param dev_id the device id of the specific device - * \param delay_alloc whether to delay allocation until - * the ndarray is first mutated - * \param out the returning handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayCreate(const uint32_t *in, - uint32_t in, - int dev_type, - int dev_id, - int delay_alloc, - NDArrayHandle *out); - -/*! - * \brief create a NDArray with specified shape and data type - * \param shape the pointer to the shape - * \param ndim the dimension of the shape - * \param dev_type device type, specify device we want to take - * \param dev_id the device id of the specific device - * \param delay_alloc whether to delay allocation until - * the ndarray is first mutated - * \param dtype data type of created array - * \param out the returning handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayCreateEx(const uint32_t *in, - uint32_t in, - int dev_type, - int dev_id, - int delay_alloc, - int dtype, - NDArrayHandle *out); -/*! - * \brief create a NDArray with specified shape and data type - * This api is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support - * \param shape the pointer to int64_t shape - * \param ndim the dimension of the shape - * \param dev_type device type, specify device we want to take - * \param dev_id the device id of the specific device - * \param delay_alloc whether to delay allocation until - * the narray is first mutated - * \param dtype data type of created array - * \param out the returning handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayCreateEx64(const int64_t *in, - int ndim, - int dev_type, - int dev_id, - int delay_alloc, - int dtype, - NDArrayHandle *out); -/*! 
- * \brief create an empty sparse NDArray with specified shape and data type - * \param storage_type the storage type of the ndarray - * \param shape the pointer to the shape - * \param ndim the dimension of the shape - * \param dev_type device type, specify device we want to take - * \param dev_id the device id of the specific device - * \param delay_alloc whether to delay allocation until - * the ndarray is first mutated - * \param dtype data type of created array - * \param num_aux the number of aux data to support this ndarray - * \param aux_type data type of the aux data for the created array - * \param aux_ndims the dimension of the shapes of aux data - * \param aux_shape the shapes of aux data - * \param out the returning handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayCreateSparseEx(int storage_type, - const uint32_t *in, - uint32_t in, - int dev_type, - int dev_id, - int delay_alloc, - int dtype, - uint32_t in, - int *in, - uint32_t *in, - const uint32_t *in, - NDArrayHandle *out); -/*! - * \brief create an empty sparse NDArray with specified shape and data type - * This API is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support - * \param storage_type the storage type of the ndarray - * \param shape the pointer to the shape - * \param ndim the dimension of the shape - * \param dev_type device type, specify device we want to take - * \param dev_id the device id of the specific device - * \param delay_alloc whether to delay allocation until - * the ndarray is first mutated - * \param dtype data type of created array - * \param num_aux the number of aux data to support this ndarray - * \param aux_type data type of the aux data for the created array - * \param aux_ndims the dimension of the shapes of aux data - * \param aux_shape the shapes of aux data - * \param out the returning handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayCreateSparseEx64(int storage_type, - const int64_t *in, - int ndim, - int dev_type, - int dev_id, - int delay_alloc, - int dtype, - uint32_t in, - int *in, - int *in, - const int64_t *in, - NDArrayHandle *out); -/*! - * \brief create an NDArray handle that is loaded from raw bytes. - * \param buf the head of the raw bytes - * \param size size of the raw bytes - * \param out the returning handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayLoadFromRawBytes(const void *in, - size_t size, - NDArrayHandle *out); -/*! - * \brief save the NDArray into raw bytes. - * \param handle the NDArray handle - * \param out_size size of the raw bytes - * \param out_buf the head of returning memory bytes. - * \return 0 when success, -1 when failure happens - */ -int MXNDArraySaveRawBytes(NDArrayHandle handle, - size_t *out_size, - const char **out_array); -/*! - * \brief Save list of ndarray into the file. - * \param fname name of the file. - * \param num_args number of arguments to save. - * \param args the array of NDArrayHandles to be saved. - * \param keys the name of the NDArray, optional, can be NULL - * \return 0 when success, -1 when failure happens - */ -int MXNDArraySave(const char* fname, - uint32_t in, - NDArrayHandle* in, - const char** in); -/*! - * \brief Load list of ndarrays from the file. - * \param fname name of the file. - * \param out_size number of ndarrays loaded. - * \param out_arr head of the returning ndarray handles. - * \param out_name_size size of output name array.
- * \param out_names the names of returning NDArrays, can be NULL - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayLoad(const char* fname, - uint32_t *out_size, - NDArrayHandle** out_array, - uint32_t *out_size, - const char*** out_array); - -/*! - * \brief Load a list / dictionary of ndarrays from file content loaded into memory. - * This will load a list of ndarrays in a similar - * manner to MXNDArrayLoad, however, it loads from - * a buffer containing the contents of a file, rather than - * from a specified file. - * \param ndarray_buffer pointer to the start of the ndarray file content - * \param size size of the file - * \param out_size number of ndarrays loaded. - * \param out_arr head of the returning ndarray handles. - * \param out_name_size size of output name array. - * \param out_names the names of returning NDArrays, can be NULL - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayLoadFromBuffer(const void *in, - size_t size, - uint32_t *out_size, - NDArrayHandle** out_array, - uint32_t *out_size, - const char*** out_array); - -/*! - * \brief Perform a synchronized copy from a contiguous CPU memory region. - * - * This function will call WaitToWrite before the copy is performed. - * This is useful to copy data from an existing memory region that is - * not wrapped by an NDArray (thus dependencies are not tracked). - * - * \param handle the NDArray handle - * \param data the data source to copy from. - * \param size the memory size we want to copy from. - */ -int MXNDArraySyncCopyFromCPU(NDArrayHandle handle, - const void *in, - size_t size); -/*! - * \brief Perform a synchronized copy to a contiguous CPU memory region. - * - * This function will call WaitToRead before the copy is performed. - * This is useful to copy data into an existing memory region that is - * not wrapped by an NDArray (thus dependencies are not tracked). - * - * \param handle the NDArray handle - * \param data the destination to copy into. - * \param size the memory size we want to copy into. - */ -int MXNDArraySyncCopyToCPU(NDArrayHandle handle, - void *in, - size_t size); -/*! - * \brief Copy src.data() to dst.data() if i = -1, else dst.aux_data(i) if i >= 0 - * This function blocks. Do not use it in performance critical code. - * \param handle_dst handle of a dst ndarray whose data/aux_data has been allocated - * \param handle_src handle of a src ndarray which has default storage type - * \param i dst data blob indicator - */ -int MXNDArraySyncCopyFromNDArray(NDArrayHandle handle_dst, - const NDArrayHandle handle_src, - const int i); - -/*! - * \brief check whether the NDArray format is valid - * \param full_check if `True`, rigorous check, O(N) operations - * Otherwise basic check, O(1) operations - */ -int MXNDArraySyncCheckFormat(NDArrayHandle handle, const bool full_check); -/*! - * \brief Wait until all the pending writes with respect to the NDArray are finished. - * Always call this before reading data out synchronously. - * \param handle the NDArray handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayWaitToRead(NDArrayHandle handle); -/*! - * \brief Wait until all the pending reads/writes with respect to the NDArray are finished. - * Always call this before writing data into the NDArray synchronously. - * \param handle the NDArray handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayWaitToWrite(NDArrayHandle handle); -/*!
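A short sketch of the synchronized copies, continuing the example array from above; note that `size` here counts elements rather than bytes, which is how the Python binding invokes these calls (an assumption worth verifying against the real header):

    /* Sketch: round-trip host data through the 2x3 float32 array `arr`. */
    float src[6] = {1, 2, 3, 4, 5, 6}, dst[6] = {0};
    MX_CHECK(MXNDArraySyncCopyFromCPU(arr, src, 6)); /* WaitToWrite first */
    MX_CHECK(MXNDArraySyncCopyToCPU(arr, dst, 6));   /* WaitToRead first  */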
- * \brief wait until all delayed operations in - * the system are completed - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayWaitAll(); -/*! - * \brief free the ndarray handle - * \param handle the handle to be freed - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayFree(NDArrayHandle handle); -/*! - * \brief Slice the NDArray along axis 0. - * \param handle the handle to the NDArray - * \param slice_begin The beginning index of slice - * \param slice_end The ending index of slice - * \param out The NDArrayHandle of sliced NDArray - * \return 0 when success, -1 when failure happens - */ -int MXNDArraySlice(NDArrayHandle handle, - uint32_t in, - uint32_t in, - NDArrayHandle *out); -/*! - * \brief Slice the NDArray along axis 0. - * This API is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support - * \param handle the handle to the NDArray - * \param slice_begin The beginning index of slice - * \param slice_end The ending index of slice - * \param out The NDArrayHandle of sliced NDArray - * \return 0 when success, -1 when failure happens - */ -int MXNDArraySlice64(NDArrayHandle handle, - int64_t in, - int64_t in, - NDArrayHandle *out); - -/*! - * \brief Index the NDArray along axis 0. - * \param handle the handle to the NDArray - * \param idx the index - * \param out The NDArrayHandle of output NDArray - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayAt(NDArrayHandle handle, - uint32_t in, - NDArrayHandle *out); -/*! - * \brief Index the NDArray along axis 0. - * This API is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support - * \param handle the handle to the NDArray - * \param idx the index - * \param out The NDArrayHandle of output NDArray - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayAt64(NDArrayHandle handle, - int64_t in, - NDArrayHandle *out); -/*! - * \brief get the storage type of the array - */ -int MXNDArrayGetStorageType(NDArrayHandle handle, - int *out); -/*! - * \brief Reshape the NDArray. - * \param handle the handle to the ndarray - * \param ndim number of dimensions of new shape - * \param dims new shape - * \param out the NDArrayHandle of reshaped NDArray - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayReshape(NDArrayHandle handle, - int ndim, - int *in, - NDArrayHandle *out); -/*! - * \brief Reshape the NDArray. - * \param handle the handle to the ndarray - * \param ndim number of dimensions of new shape - * \param dims new shape - * \param out the NDArrayHandle of reshaped NDArray - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayReshape64(NDArrayHandle handle, - int ndim, - dim_t *in, - bool reverse, - NDArrayHandle *out); - -/*! - * \brief get the shape of the array - * \param handle the handle to the ndarray - * \param out_dim the output dimension - * \param out_pdata pointer holder to get data pointer of the shape - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayGetShapeEx(NDArrayHandle handle, - int *out_dim, - const int **out_pdata); -/*! - * \brief get the shape of the array - * This API is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e.
Large Tensor Support - * \param handle the handle to the ndarray - * \param out_dim the output dimension - * \param out_pdata pointer holder to get data pointer of the shape - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayGetShapeEx64(NDArrayHandle handle, - int *out_dim, - const int64_t **out_pdata); -/*! - * \brief get the content of the data in NDArray - * \param handle the handle to the ndarray - * \param out_pdata pointer holder to get pointer of data - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayGetData(NDArrayHandle handle, - void **out_pdata); -/*! - * \brief get the type of the data in NDArray - * \param handle the handle to the ndarray - * \param out_dtype pointer holder to get type of data - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayGetDType(NDArrayHandle handle, - int *out); -/*! - * \brief get the type of the ith aux data in NDArray - * \param handle the handle to the ndarray - * \param i the index of the aux data - * \param out_type pointer holder to get type of aux data - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayGetAuxType(NDArrayHandle handle, - uint32_t in, - int *out); -/*! - * \brief get the type of the ith aux data in NDArray - * This API is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support - * \param handle the handle to the ndarray - * \param i the index of the aux data - * \param out_type pointer holder to get type of aux data - * \return 0 when success, -1 when failure happens - */ -// int MXNDArrayGetAuxType64(NDArrayHandle handle, -// int64_t i, -// int *out); - -/*! - * \brief Get a deep copy of the ith aux data blob - * in the form of an NDArray of default storage type. - * This function blocks. Do not use it in performance critical code. - */ -int MXNDArrayGetAuxNDArray(NDArrayHandle handle, - uint32_t in, - NDArrayHandle *out); -/*! - * \brief Get a deep copy of the ith aux data blob - * This API is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support - * in the form of an NDArray of default storage type. - * This function blocks. Do not use it in performance critical code. - */ -// int MXNDArrayGetAuxNDArray64(NDArrayHandle handle, -// int64_t i, -// NDArrayHandle *out); - -/*! - * \brief Get a deep copy of the data blob - * in the form of an NDArray of default storage type. - * This function blocks. Do not use it in performance critical code. - */ -int MXNDArrayGetDataNDArray(NDArrayHandle handle, - NDArrayHandle *out); - -/*! - * \brief get the context of the NDArray - * \param handle the handle to the ndarray - * \param out_dev_type the output device type - * \param out_dev_id the output device id - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayGetContext(NDArrayHandle handle, - int *out, - int *out); -/*! - * \brief return gradient buffer attached to this NDArray - * \param handle NDArray handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayGetGrad(NDArrayHandle handle, NDArrayHandle *out); - -/*! - * \brief detach an ndarray from the computation graph by clearing entry_ - * \param handle NDArray handle - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayDetach(NDArrayHandle handle, NDArrayHandle *out); - -/*! - * \brief set the flag for gradient array state. - * \param handle NDArray handle - * \param state the new state.
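The slicing, reshaping, and query calls above combine into a typical introspection sequence. A sketch, again using `arr` and MX_CHECK from the earlier fragments; the note about pointer lifetime reflects the thread-local return buffers used by the C API:

    /* Sketch: create views, then inspect one of them. */
    NDArrayHandle row = NULL, flat = NULL;
    int new_shape[1] = {6};
    int ndim = 0, dtype = 0, dev_type = 0, dev_id = 0;
    const int *dims = NULL;
    MX_CHECK(MXNDArraySlice(arr, 0, 1, &row));            /* rows [0, 1) */
    MX_CHECK(MXNDArrayReshape(arr, 1, new_shape, &flat)); /* view as (6,) */
    MX_CHECK(MXNDArrayGetShapeEx(flat, &ndim, &dims));    /* dims is library-owned */
    MX_CHECK(MXNDArrayGetDType(flat, &dtype));
    MX_CHECK(MXNDArrayGetContext(flat, &dev_type, &dev_id));
    printf("ndim=%d dim0=%d dtype=%d ctx=%d:%d\n",
           ndim, ndim ? dims[0] : -1, dtype, dev_type, dev_id);
    MX_CHECK(MXNDArrayWaitAll());  /* drain any pending engine work */
    MX_CHECK(MXNDArrayFree(row));
    MX_CHECK(MXNDArrayFree(flat));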
- * \return 0 when success, -1 when failure happens - */ -int MXNDArraySetGradState(NDArrayHandle handle, int state); - -/*! - * \brief get the flag for gradient array state. - * \param handle NDArray handle - * \param state the new state. - * \return 0 when success, -1 when failure happens - */ -int MXNDArrayGetGradState(NDArrayHandle handle, int *out); - -//-------------------------------- -// Part 2: functions on NDArray -//-------------------------------- -/*! - * \brief list all the available function handles - * most users can use it to list all the needed functions - * \param out_size the size of returned array - * \param out_array the output function array - * \return 0 when success, -1 when failure happens - */ -int MXListFunctions(uint32_t *out_size, - FunctionHandle **out_array); -/*! - * \brief get the function handle by name - * \param name the name of the function - * \param out the corresponding function handle - * \return 0 when success, -1 when failure happens - */ -int MXGetFunction(const char *name, - FunctionHandle *out); -/*! - * \brief Get the information of the function handle. - * \param fun The function handle. - * \param name The returned name of the function. - * \param description The returned description of the function. - * \param num_args Number of arguments. - * \param arg_names Name of the arguments. - * \param arg_type_infos Type information about the arguments. - * \param arg_descriptions Description information about the arguments. - * \param return_type Return type of the function. - * \return 0 when success, -1 when failure happens - */ -int MXFuncGetInfo(FunctionHandle fun, - const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions - ); -/*! - * \brief get the argument requirements of the function - * \param fun input function handle - * \param num_use_vars how many NDArrays to be passed in as used_vars - * \param num_scalars how many scalar arguments are needed - * \param num_mutate_vars how many NDArrays to be passed in as mutate_vars - * \param type_mask the type mask of this function - * \return 0 when success, -1 when failure happens - * \sa MXFuncInvoke - */ -int MXFuncDescribe(FunctionHandle fun, - uint32_t *out, - uint32_t *out, - uint32_t *out, - int *out); -/*! - * \brief invoke a function, the array size of passed in arguments - * must match the values reported by MXFuncDescribe - * \param fun the function - * \param use_vars the normal arguments passed to function - * \param scalar_args the scalar arguments - * \param mutate_vars the mutate arguments - * \return 0 when success, -1 when failure happens - * \sa MXFuncDescribeArgs - */ -int MXFuncInvoke(FunctionHandle fun, - NDArrayHandle *in, - float *in, - NDArrayHandle *in); -/*! - * \brief invoke a function, the array size of passed in arguments - * must match the values reported by MXFuncDescribe - * \param fun the function - * \param use_vars the normal arguments passed to function - * \param scalar_args the scalar arguments - * \param mutate_vars the mutate arguments - * \param num_params number of keyword parameters - * \param param_keys keys for keyword parameters - * \param param_vals values for keyword parameters - * \return 0 when success, -1 when failure happens - * \sa MXFuncDescribeArgs - */ -int MXFuncInvokeEx(FunctionHandle fun, - NDArrayHandle *in, - float *in, - NDArrayHandle *in, - int num_params, - char **keys, - char **vals); -/*!
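A sketch of enumerating the legacy function registry; the lookup name is purely illustrative, and the returned array is owned by the library:

    /* Sketch: list legacy FunctionHandles and look one up by name. */
    uint32_t n_funcs = 0;
    FunctionHandle *funcs = NULL, fn = NULL;
    MX_CHECK(MXListFunctions(&n_funcs, &funcs));
    printf("%u legacy functions registered\n", n_funcs);
    MX_CHECK(MXGetFunction("_set_value", &fn));  /* illustrative name */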
- * \brief invoke an nnvm op and imperative function - * \param creator the op - * \param num_inputs number of input NDArrays - * \param inputs input NDArrays - * \param num_outputs number of output NDArrays - * \param outputs output NDArrays - * \param num_params number of keyword parameters - * \param param_keys keys for keyword parameters - * \param param_vals values for keyword parameters - * \return 0 when success, -1 when failure happens - */ -int MXImperativeInvoke(AtomicSymbolCreator in, - int num_inputs, - NDArrayHandle *in, - int *out_size, - NDArrayHandle **out_array, - int num_params, - const char **keys, - const char **vals); -/*! - * \brief invoke an nnvm op and imperative function - * \param creator the op - * \param num_inputs number of input NDArrays - * \param inputs input NDArrays - * \param num_outputs number of output NDArrays - * \param outputs output NDArrays - * \param num_params number of keyword parameters - * \param param_keys keys for keyword parameters - * \param param_vals values for keyword parameters - * \param out_stypes output ndarrays' stypes - * \return 0 when success, -1 when failure happens - */ -int MXImperativeInvokeEx(AtomicSymbolCreator in, - int num_inputs, - NDArrayHandle *in, - int *out_size, - NDArrayHandle **out_array, - int num_params, - const char **keys, - const char **vals, - const int **out_stypes); - -/*! - * \brief set whether to record operator for autograd - * \param is_recording 1 when recording, 0 when not recording. - * \param prev returns the previous status before this set. - * \return 0 when success, -1 when failure happens - */ -int MXAutogradSetIsRecording(int is_recording, int* out); - -/*! - * \brief set whether training mode is on for autograd - * \param is_train 1 when training, 0 when testing - * \param prev returns the previous status before this set. - * \return 0 when success, -1 when failure happens - */ -int MXAutogradSetIsTraining(int is_training, int* out); - -/*! - * \brief get whether autograd recording is on - * \param curr returns the current status. - * \return 0 when success, -1 when failure happens - */ -int MXAutogradIsRecording(bool* out); - -/*! - * \brief get whether training mode is on - * \param curr returns the current status. - * \return 0 when success, -1 when failure happens - */ -int MXAutogradIsTraining(bool* out); - -/*! - * \brief mark NDArrays as variables to compute gradient for autograd - * \param num_var number of variable NDArrays - * \param var_handles variable NDArrays - * \return 0 when success, -1 when failure happens - */ -int MXAutogradMarkVariables(uint32_t in, - NDArrayHandle *in, - uint32_t *in, - NDArrayHandle *in); -/*! - * \brief compute the gradient of outputs w.r.t variables - * \param num_output number of output NDArray - * \param output_handles output NDArrays - * \return 0 when success, -1 when failure happens - */ -int MXAutogradComputeGradient(uint32_t in, - NDArrayHandle* in); -/*! - * \brief compute the gradient of outputs w.r.t variables - * \param num_output number of output NDArray - * \param output_handles output NDArrays - * \param ograd_handles head gradient for NDArrays - * \param retain_graph whether to keep the graph after backward - * \return 0 when success, -1 when failure happens - */ -int MXAutogradBackward(uint32_t in, - NDArrayHandle* in, - NDArrayHandle* in, - int retain_graph); - -/*!
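A sketch of the autograd state machine these calls expose: enable recording, mark a variable together with a gradient buffer, and restore the previous state. The grad_req value 1 ('write') is an assumption taken from how the Python binding encodes gradient requests:

    /* Sketch: record into autograd using the array `arr` from above. */
    int prev = 0;
    uint32_t grad_req = 1;  /* 'write' */
    NDArrayHandle grad = NULL;
    MX_CHECK(MXNDArrayCreateEx(shape, 2, 1, 0, 0, 0, &grad));
    MX_CHECK(MXAutogradSetIsRecording(1, &prev));
    MX_CHECK(MXAutogradMarkVariables(1, &arr, &grad_req, &grad));
    /* ... run recorded operators, then MXAutogradBackward(...) ... */
    MX_CHECK(MXAutogradSetIsRecording(prev, &prev));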
- * \brief compute the gradient of outputs w.r.t variables - * \param num_output number of output NDArray - * \param output_handles output NDArrays - * \param ograd_handles head gradient for NDArrays - * \param retain_graph whether to keep the graph after backward - * \param is_train whether to do backward for training or inference - * \return 0 when success, -1 when failure happens - */ -int MXAutogradBackwardEx(uint32_t in, - NDArrayHandle *in, - NDArrayHandle *in, - uint32_t in, - NDArrayHandle *in, - int retain_graph, - int create_graph, - int is_train, - NDArrayHandle **out_grad, - int **out_stype); - -/* - * \brief get the graph constructed by autograd. - * \param handle ndarray handle - * \param out output symbol handle - */ -int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle *out); - - /*! - * \brief create cached operator - */ -int MXCreateCachedOp(SymbolHandle handle, - CachedOpHandle *out); -/*! - * \brief create cached operator - */ -int MXCreateCachedOpEx(SymbolHandle handle, - int num_flags, - const char** keys, - const char** vals, - CachedOpHandle *out); - /*! - * \brief free cached operator - */ -int MXFreeCachedOp(CachedOpHandle handle); - /*! - * \brief invoke cached operator - */ -int MXInvokeCachedOp(CachedOpHandle handle, - int num_inputs, - NDArrayHandle *in, - int *out_size, - NDArrayHandle **out_array); -/*! - * \brief invoke a cached op - * \param handle the handle to the cached op - * \param num_inputs number of input NDArrays - * \param inputs input NDArrays - * \param num_outputs number of output NDArrays - * \param outputs output NDArrays - * \param out_stypes output ndarrays' stypes - * \return 0 when success, -1 when failure happens - */ -int MXInvokeCachedOpEx(CachedOpHandle handle, - int num_inputs, - NDArrayHandle *in, - int *out_size, - NDArrayHandle **out_array, - const int** out_stypes); -//-------------------------------------------- -// Part 3: symbolic configuration generation -//-------------------------------------------- -/*! - * \brief list all the available operator names, including entries. - * \param out_size the size of returned array - * \param out_array the output operator name array. - * \return 0 when success, -1 when failure happens - */ -int MXListAllOpNames(uint32_t *out_size, - const char ***out_array); -/*! - * \brief list all the available AtomicSymbolEntry - * \param out_size the size of returned array - * \param out_array the output AtomicSymbolCreator array - * \return 0 when success, -1 when failure happens - */ -int MXSymbolListAtomicSymbolCreators(uint32_t *out_size, - AtomicSymbolCreator **out_array); - -/*! - * \brief Get the name of an atomic symbol. - * \param creator the AtomicSymbolCreator. - * \param name The returned name of the creator. - */ -int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator in, - const char **out); -/*! - * \brief Get the detailed information about atomic symbol. - * \param creator the AtomicSymbolCreator. - * \param name The returned name of the creator. - * \param description The returned description of the symbol. - * \param num_args Number of arguments. - * \param arg_names Name of the arguments. - * \param arg_type_infos Type information about the arguments. - * \param arg_descriptions Description information about the arguments. - * \param key_var_num_args The keyword argument for specifying variable number of arguments.
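A sketch of the cached-op lifecycle (create, invoke, free); `sym` is assumed to be a one-input SymbolHandle built with the symbol API below, and "static_alloc" is assumed to be one of the CachedOp flags accepted by the frontend:

    /* Sketch: wrap a symbol in a CachedOp and run it once. Passing a
     * NULL *outputs lets the library allocate the outputs. */
    CachedOpHandle op = NULL;
    const char *flag_keys[] = {"static_alloc"};
    const char *flag_vals[] = {"true"};
    int n_out = 0;
    NDArrayHandle *outputs = NULL;
    MX_CHECK(MXCreateCachedOpEx(sym, 1, flag_keys, flag_vals, &op));
    MX_CHECK(MXInvokeCachedOp(op, 1, &arr, &n_out, &outputs));
    MX_CHECK(MXFreeCachedOp(op));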
- * When this parameter has non-zero length, the function allows a variable number - * of positional arguments, and will need the caller to pass it - * in MXSymbolCreateAtomicSymbol, - * with key = key_var_num_args, and value = number of positional arguments. - * \param return_type Return type of the function, can be Symbol or Symbol[] - * \return 0 when success, -1 when failure happens - */ -int MXSymbolGetAtomicSymbolInfo(AtomicSymbolCreator in, - const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions, - const char **key_var_num_args - ); -/*! - * \brief Create an AtomicSymbol. - * \param creator the AtomicSymbolCreator - * \param num_param the number of parameters - * \param keys the keys to the params - * \param vals the vals of the params - * \param out pointer to the created symbol handle - * \return 0 when success, -1 when failure happens - */ -int MXSymbolCreateAtomicSymbol(AtomicSymbolCreator in, - uint32_t in, - const char **keys, - const char **vals, - SymbolHandle *out); -/*! - * \brief Create a Variable Symbol. - * \param name name of the variable - * \param out pointer to the created symbol handle - * \return 0 when success, -1 when failure happens - */ -int MXSymbolCreateVariable(const char *name, SymbolHandle *out); -/*! - * \brief Create a Symbol by grouping list of symbols together - * \param num_symbols number of symbols to be grouped - * \param symbols array of symbol handles - * \param out pointer to the created symbol handle - * \return 0 when success, -1 when failure happens - */ -int MXSymbolCreateGroup(uint32_t in, - SymbolHandle *in, - SymbolHandle *out); -/*! - * \brief Load a symbol from a json file. - * \param fname the file name. - * \param out the output symbol. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolCreateFromFile(const char *fname, SymbolHandle *out); -/*! - * \brief Load a symbol from a json string. - * \param json the json string. - * \param out the output symbol. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolCreateFromJSON(const char *json, SymbolHandle *out); -/*! - * \brief Save a symbol into a json file. - * \param symbol the input symbol. - * \param fname the file name. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolSaveToFile(SymbolHandle symbol, const char *fname); -/*! - * \brief Save a symbol into a json string - * \param symbol the input symbol. - * \param out_json output json string. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolSaveToJSON(SymbolHandle symbol, const char **out); -/*! - * \brief Free the symbol handle. - * \param symbol the symbol - * \return 0 when success, -1 when failure happens - */ -int MXSymbolFree(SymbolHandle symbol); -/*! - * \brief Copy the symbol to another handle - * \param symbol the source symbol - * \param out used to hold the result of copy - * \return 0 when success, -1 when failure happens - */ -int MXSymbolCopy(SymbolHandle symbol, SymbolHandle *out); -/*! - * \brief Print the content of symbol, used for debug. - * \param symbol the symbol - * \param out_str pointer to hold the output string of the printing. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolPrint(SymbolHandle symbol, const char **out); -/*! - * \brief Get string name from symbol - * \param symbol the source symbol - * \param out The result name. - * \param success Whether the result is contained in out.
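A sketch of building `exp(data)` with these creation calls; the op name "exp" and the scan over the creator table are assumptions based on how the language bindings locate operators (requires <string.h>):

    /* Sketch: find the "exp" creator, instantiate it, and compose it
     * onto a variable. */
    uint32_t n_creators = 0, i;
    AtomicSymbolCreator *creators = NULL;
    SymbolHandle data = NULL, sym = NULL;
    MX_CHECK(MXSymbolListAtomicSymbolCreators(&n_creators, &creators));
    MX_CHECK(MXSymbolCreateVariable("data", &data));
    for (i = 0; i < n_creators; ++i) {
      const char *name = NULL;
      MX_CHECK(MXSymbolGetAtomicSymbolName(creators[i], &name));
      if (strcmp(name, "exp") == 0) {
        MX_CHECK(MXSymbolCreateAtomicSymbol(creators[i], 0, NULL, NULL, &sym));
        MX_CHECK(MXSymbolCompose(sym, "exp0", 1, NULL, &data));
        break;
      }
    }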
- * \return 0 when success, -1 when failure happens - */ -int MXSymbolGetName(SymbolHandle symbol, - const char** out, - int *out); -/*! - * \brief Get string attribute from symbol - * \param symbol the source symbol - * \param key The key of the symbol. - * \param out The result attribute, can be NULL if the attribute does not exist. - * \param success Whether the result is contained in out. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolGetAttr(SymbolHandle symbol, - const char* key, - const char** out, - int *out); -/*! - * \brief Set string attribute from symbol. - * NOTE: Setting an attribute on a symbol can affect the semantics (mutable/immutable) of the symbolic graph. - * - * Safe recommendation: use an immutable graph - * - Only allow set attributes during creation of new symbol as optional parameter - * - * Mutable graph (be careful about the semantics): - * - Allow set attr at any point. - * - Mutating an attribute of a node shared by two graphs can confuse users. - * - * \param symbol the source symbol - * \param key The key of the symbol. - * \param value The value to be saved. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolSetAttr(SymbolHandle symbol, - const char* in, - const char* in); -/*! - * \brief Get all attributes from symbol, including all descendants. - * \param symbol the source symbol - * \param out_size The number of output attributes - * \param out 2*out_size strings representing key value pairs. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolListAttr(SymbolHandle symbol, - uint32_t *out_size, - const char*** out_array2); -/*! - * \brief Get all attributes from symbol, excluding descendants. - * \param symbol the source symbol - * \param out_size The number of output attributes - * \param out 2*out_size strings representing key value pairs. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolListAttrShallow(SymbolHandle symbol, - uint32_t *out_size, - const char*** out_array2); -/*! - * \brief List arguments in the symbol. - * \param symbol the symbol - * \param out_size output size - * \param out_str_array pointer to hold the output string array - * \return 0 when success, -1 when failure happens - */ -int MXSymbolListArguments(SymbolHandle symbol, - uint32_t *out_size, - const char ***out_array); -/*! - * \brief List outputs in the symbol. - * \param symbol the symbol - * \param out_size output size - * \param out_str_array pointer to hold the output string array - * \return 0 when success, -1 when failure happens - */ -int MXSymbolListOutputs(SymbolHandle symbol, - uint32_t *out_size, - const char ***out_array); -/*! - * \brief Get a symbol that contains all the internals. - * \param symbol The symbol - * \param out The output symbol whose outputs are all the internals. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolGetInternals(SymbolHandle symbol, - SymbolHandle *out); -/*! - * \brief Get a symbol that contains only direct children. - * \param symbol The symbol - * \param out The output symbol whose outputs are the direct children. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolGetChildren(SymbolHandle symbol, - SymbolHandle *out); -/*! - * \brief Get index-th outputs of the symbol. - * \param symbol The symbol - * \param index the Index of the output. - * \param out The output symbol whose outputs are the index-th symbol.
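For completeness, a sketch of the listing calls; the returned string arrays are library-owned and must not be freed by the caller:

    /* Sketch: inspect the composed symbol `sym` from the previous
     * fragment. */
    uint32_t n_args = 0, n_outs = 0, j;
    const char **arg_names = NULL, **out_names = NULL;
    MX_CHECK(MXSymbolListArguments(sym, &n_args, &arg_names));
    MX_CHECK(MXSymbolListOutputs(sym, &n_outs, &out_names));
    for (j = 0; j < n_args; ++j) printf("arg[%u] = %s\n", j, arg_names[j]);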
- * \return 0 when success, -1 when failure happens - */ -int MXSymbolGetOutput(SymbolHandle symbol, - mx_uint in, - SymbolHandle *out); -int MXSymbolGetNumOutputs(SymbolHandle symbol, - uint32_t *out); -/*! - * \brief List auxiliary states in the symbol. - * \param symbol the symbol - * \param out_size output size - * \param out_str_array pointer to hold the output string array - * \return 0 when success, -1 when failure happens - */ -int MXSymbolListAuxiliaryStates(SymbolHandle symbol, - uint32_t *out_size, - const char ***out_array); -/*! - * \brief Compose the symbol on other symbols. - * - * This function will change the sym handle. - * To achieve function apply behavior, copy the symbol first - * before applying. - * - * \param sym the symbol to apply - * \param name the name of symbol - * \param num_args number of arguments - * \param keys the key of keyword args (optional) - * \param args arguments to sym - * \return 0 when success, -1 when failure happens - */ -int MXSymbolCompose(SymbolHandle sym, - const char *name, - uint32_t in, - const char** in, - SymbolHandle* in); -/*! - * \brief Get the gradient graph of the symbol - * - * \param sym the symbol to get gradient - * \param num_wrt number of arguments to get gradient - * \param wrt the name of the arguments to get gradient - * \param out the returned symbol that has gradient - * \return 0 when success, -1 when failure happens - */ -int MXSymbolGrad(SymbolHandle sym, - uint32_t in, - const char** in, - SymbolHandle* out); -/*! - * \brief infer shape of unknown input shapes given the known one. - * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data - * The call will be treated as a kwargs call if key != nullptr or num_args==0, otherwise it is positional. - * - * \param sym symbol handle - * \param num_args number of input arguments. - * \param keys the key of keyword args (optional) - * \param arg_ind_ptr the head pointer of the rows in CSR - * \param arg_shape_data the content of the CSR - * \param in_shape_size size of the returning array of in_shapes - * \param in_shape_ndim returning array of shape dimensions of each input shape. - * \param in_shape_data returning array of pointers to head of the input shape. - * \param out_shape_size size of the returning array of out_shapes - * \param out_shape_ndim returning array of shape dimensions of each output shape. - * \param out_shape_data returning array of pointers to head of the output shape. - * \param aux_shape_size size of the returning array of aux_shapes - * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape. - * \param aux_shape_data returning array of pointers to head of the auxiliary shape. - * \param complete whether infer shape completes or more information is needed. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolInferShapeEx(SymbolHandle sym, - uint32_t in, - const char** in, - const uint32_t *in, - const int *in, - uint32_t *in_shape_size, - const int **in_shape_ndim, - const int ***in_shape_data, - uint32_t *out_shape_size, - const int **out_shape_ndim, - const int ***out_shape_data, - uint32_t *aux_shape_size, - const int **aux_shape_ndim, - const int ***aux_shape_data, - int *out); -/*! - * \brief infer shape of unknown input shapes given the known one. - * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data - * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
- * This API is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support - * \param sym symbol handle - * \param num_args number of input arguments. - * \param keys the key of keyword args (optional) - * \param arg_ind_ptr the head pointer of the rows in CSR - * \param arg_shape_data the content of the CSR - * \param in_shape_size size of the returning array of in_shapes - * \param in_shape_ndim returning array of shape dimensions of each input shape. - * \param in_shape_data returning array of pointers to head of the input shape. - * \param out_shape_size size of the returning array of out_shapes - * \param out_shape_ndim returning array of shape dimensions of each output shape. - * \param out_shape_data returning array of pointers to head of the output shape. - * \param aux_shape_size size of the returning array of aux_shapes - * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape. - * \param aux_shape_data returning array of pointers to head of the auxiliary shape. - * \param complete whether infer shape completes or more information is needed. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolInferShapeEx64(SymbolHandle sym, - uint32_t in, - const char** in, - const int64_t *in, - const int64_t *in, - size_t *in_shape_size, - const int **in_shape_ndim, - const int64_t ***in_shape_data, - size_t *out_shape_size, - const int **out_shape_ndim, - const int64_t ***out_shape_data, - size_t *aux_shape_size, - const int **aux_shape_ndim, - const int64_t ***aux_shape_data, - int *out); - -/*! - * \brief partially infer shape of unknown input shapes given the known one. - * - * Return partially inferred results if not all shapes could be inferred. - * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data - * The call will be treated as a kwargs call if key != nullptr or num_args==0, otherwise it is positional. - * - * \param sym symbol handle - * \param num_args number of input arguments. - * \param keys the key of keyword args (optional) - * \param arg_ind_ptr the head pointer of the rows in CSR - * \param arg_shape_data the content of the CSR - * \param in_shape_size size of the returning array of in_shapes - * \param in_shape_ndim returning array of shape dimensions of each input shape. - * \param in_shape_data returning array of pointers to head of the input shape. - * \param out_shape_size size of the returning array of out_shapes - * \param out_shape_ndim returning array of shape dimensions of each output shape. - * \param out_shape_data returning array of pointers to head of the output shape. - * \param aux_shape_size size of the returning array of aux_shapes - * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape. - * \param aux_shape_data returning array of pointers to head of the auxiliary shape. - * \param complete whether infer shape completes or more information is needed. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolInferShapePartialEx(SymbolHandle sym, - uint32_t in, - const char** in, - const uint32_t *in, - const int *in, - uint32_t *in_shape_size, - const int **in_shape_ndim, - const int ***in_shape_data, - uint32_t *out_shape_size, - const int **out_shape_ndim, - const int ***out_shape_data, - uint32_t *aux_shape_size, - const int **aux_shape_ndim, - const int ***aux_shape_data, - int *out); -/*! - * \brief partially infer shape of unknown input shapes given the known one.
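The CSR packing described above is easiest to see in code. A sketch for the one-argument symbol from the earlier fragments: keys[k] names argument k, and its dims occupy arg_shape_data[arg_ind_ptr[k] .. arg_ind_ptr[k+1]):

    /* Sketch: tell shape inference that "data" has shape (2, 3). */
    const char *keys[] = {"data"};
    const uint32_t arg_ind_ptr[] = {0, 2};  /* one arg spanning two dims */
    const int arg_shape_data[] = {2, 3};
    uint32_t in_size = 0, out_size = 0, aux_size = 0;
    const int *in_ndim = NULL, *out_ndim = NULL, *aux_ndim = NULL;
    const int **in_data = NULL, **out_data = NULL, **aux_data = NULL;
    int complete = 0;
    MX_CHECK(MXSymbolInferShapeEx(sym, 1, keys, arg_ind_ptr, arg_shape_data,
                                  &in_size, &in_ndim, &in_data,
                                  &out_size, &out_ndim, &out_data,
                                  &aux_size, &aux_ndim, &aux_data,
                                  &complete));
    printf("complete=%d, out[0] has ndim=%d\n", complete, out_ndim[0]);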
- * - * Return partially inferred results if not all shapes could be inferred. - * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data - * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional. - * This API is available when MXNet is built with flag - * USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support - * - * \param sym symbol handle - * \param num_args number of input arguments. - * \param keys the key of keyword args (optional) - * \param arg_ind_ptr the head pointer of the rows in CSR - * \param arg_shape_data the content of the CSR - * \param in_shape_size size of the returning array of in_shapes - * \param in_shape_ndim returning array of shape dimensions of each input shape. - * \param in_shape_data returning array of pointers to head of the input shape. - * \param out_shape_size size of the returning array of out_shapes - * \param out_shape_ndim returning array of shape dimensions of each output shape. - * \param out_shape_data returning array of pointers to head of the output shape. - * \param aux_shape_size size of the returning array of aux_shapes - * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape. - * \param aux_shape_data returning array of pointers to head of the auxiliary shape. - * \param complete whether infer shape completes or more information is needed. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolInferShapePartialEx64(SymbolHandle sym, - uint32_t in, - const char** in, - const int64_t *in, - const int64_t *in, - size_t *in_shape_size, - const int **in_shape_ndim, - const int64_t ***in_shape_data, - size_t *out_shape_size, - const int **out_shape_ndim, - const int64_t ***out_shape_data, - size_t *aux_shape_size, - const int **aux_shape_ndim, - const int64_t ***aux_shape_data, - int *out); - -/*! - * \brief infer type of unknown input types given the known one. - * The types are packed into a CSR matrix represented by arg_ind_ptr and arg_type_data - * The call will be treated as a kwargs call if key != nullptr or num_args==0, otherwise it is positional. - * - * \param sym symbol handle - * \param num_args number of input arguments. - * \param keys the key of keyword args (optional) - * \param arg_type_data the content of the CSR - * \param in_type_size size of the returning array of in_types - * \param in_type_data returning array of pointers to head of the input type. - * \param out_type_size size of the returning array of out_types - * \param out_type_data returning array of pointers to head of the output type. - * \param aux_type_size size of the returning array of aux_types - * \param aux_type_data returning array of pointers to head of the auxiliary type. - * \param complete whether infer type completes or more information is needed. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolInferType(SymbolHandle sym, - uint32_t in, - const char** in, - const int *in, - uint32_t *in_type_size, - const int **in_type_data, - uint32_t *out_type_size, - const int **out_type_data, - uint32_t *aux_type_size, - const int **aux_type_data, - int *out); -/*! - * \brief partially infer type of unknown input types given the known one. - * - * Return partially inferred results if not all types could be inferred. - * The types are packed into a CSR matrix represented by arg_ind_ptr and arg_type_data - * The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
- * - * \param sym symbol handle - * \param num_args number of input arguments. - * \param keys the key of keyword args (optional) - * \param arg_type_data the content of the CSR - * \param in_type_size size of the returning array of in_types - * \param in_type_data returning array of pointers to head of the input type. - * \param out_type_size size of the returning array of out_types - * \param out_type_data returning array of pointers to head of the output type. - * \param aux_type_size size of the returning array of aux_types - * \param aux_type_data returning array of pointers to head of the auxiliary type. - * \param complete whether infer type completes or more information is needed. - * \return 0 when success, -1 when failure happens - */ -int MXSymbolInferTypePartial(SymbolHandle sym, - uint32_t in, - const char** in, - const int *in, - uint32_t *in_type_size, - const int **in_type_data, - uint32_t *out_type_size, - const int **out_type_data, - uint32_t *aux_type_size, - const int **aux_type_data, - int *out); -/*! - * \brief Generate atomic symbol (able to be composed) from a source symbol - * \param sym_handle source symbol - * \param ret_sym_handle returned atomic symbol - */ -int MXGenAtomicSymbolFromSymbol(SymbolHandle sym_handle, SymbolHandle *out); - -/*! - * \brief Partitions symbol for given backend, potentially creating subgraphs - * \param sym_handle symbol to be partitioned - * \param dev_type context device type - * \param backend_name backend name - * \param ret_sym_handle partitioned symbol returned - * \param len number of args - * \param in_args_handle args array - * \param num_options number of key value pairs - * \param keys keys for options - * \param vals values corresponding to keys - */ -int MXOptimizeForBackend(SymbolHandle sym_handle, - const char* in, - const int dev_type, - SymbolHandle* in, - const mx_uint in, - NDArrayHandle* in, - const mx_uint in, - NDArrayHandle* in, - const mx_uint in, - const char** keys, - const char** vals, - int* new_args_cnt, - NDArrayHandle** new_args_handle, - char*** new_arg_names_handle, - int* new_aux_cnt, - NDArrayHandle** new_aux_handle, - char*** new_aux_names_handle); - -//-------------------------------------------- -// Part 4: Executor interface -//-------------------------------------------- -/*! - * \brief Delete the executor - * \param handle the executor. - * \return 0 when success, -1 when failure happens - */ -int MXExecutorFree(ExecutorHandle handle); -/*! - * \brief Print the content of execution plan, used for debug. - * \param handle the executor. - * \param out_str pointer to hold the output string of the printing. - * \return 0 when success, -1 when failure happens - */ -int MXExecutorPrint(ExecutorHandle handle, const char **out); -/*! - * \brief Executor forward method - * - * \param handle executor handle - * \param is_train bool value to indicate whether the forward pass is for evaluation - * \return 0 when success, -1 when failure happens - */ -int MXExecutorForward(ExecutorHandle handle, int is_train); -/*! - * \brief Executor run backward - * - * \param handle execute handle - * \param len length - * \param head_grads NDArray handle for heads' gradient - * - * \return 0 when success, -1 when failure happens - */ -int MXExecutorBackward(ExecutorHandle handle, - uint32_t in, - NDArrayHandle *in); - -/*!
- * \brief Executor run backward - * - * \param handle execute handle - * \param len length - * \param head_grads NDArray handle for heads' gradient - * \param is_train int value to indicate whether the backward pass is for evaluation - * - * \return 0 when success, -1 when failure happens - */ -int MXExecutorBackwardEx(ExecutorHandle handle, - uint32_t in, - NDArrayHandle *in, - int is_train); - -/*! - * \brief Get executor's head NDArray - * - * \param handle executor handle - * \param out_size output ndarray vector size - * \param out output ndarray handles - * \return 0 when success, -1 when failure happens - */ -int MXExecutorOutputs(ExecutorHandle handle, - uint32_t *out_size, - NDArrayHandle **out_array); - -/*! - * \brief Generate Executor from symbol - * - * \param symbol_handle symbol handle - * \param dev_type device type - * \param dev_id device id - * \param len length - * \param in_args in args array - * \param arg_grad_store arg grads handle array - * \param grad_req_type grad req array - * \param aux_states_len length of auxiliary states - * \param aux_states auxiliary states array - * \param out output executor handle - * \return 0 when success, -1 when failure happens - */ -int MXExecutorBind(SymbolHandle symbol_handle, - int dev_type, - int dev_id, - uint32_t in, - NDArrayHandle *in, - NDArrayHandle *in, - uint32_t *in, - uint32_t aux_states_len, - NDArrayHandle *in, - ExecutorHandle *out); -/*! - * \brief Generate Executor from symbol, - * This is an advanced function that allows specifying the group2ctx map. - * The user can annotate "ctx_group" attribute to name each group. - * - * \param symbol_handle symbol handle - * \param dev_type device type of default context - * \param dev_id device id of default context - * \param num_map_keys size of group2ctx map - * \param map_keys keys of group2ctx map - * \param map_dev_types device type of group2ctx map - * \param map_dev_ids device id of group2ctx map - * \param len length - * \param in_args in args array - * \param arg_grad_store arg grads handle array - * \param grad_req_type grad req array - * \param aux_states_len length of auxiliary states - * \param aux_states auxiliary states array - * \param out output executor handle - * \return 0 when success, -1 when failure happens - */ -int MXExecutorBindX(SymbolHandle symbol_handle, - int dev_type, - int dev_id, - uint32_t in, - const char** in, - const int* in, - const int* in, - uint32_t in, - NDArrayHandle *in, - NDArrayHandle *in, - uint32_t *in, - uint32_t in, - NDArrayHandle *in, - ExecutorHandle *out); -/*! - * \brief Generate Executor from symbol, - * This is an advanced function that allows specifying the group2ctx map. - * The user can annotate "ctx_group" attribute to name each group.
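A sketch of the inference loop these calls support; `exec` is assumed to be an ExecutorHandle already produced by one of the bind calls below:

    /* Sketch: one forward pass, then fetch the output handles. */
    uint32_t n_exec_out = 0;
    NDArrayHandle *exec_outs = NULL;
    MX_CHECK(MXExecutorForward(exec, /*is_train=*/0));
    MX_CHECK(MXExecutorOutputs(exec, &n_exec_out, &exec_outs));
    MX_CHECK(MXNDArrayWaitAll());  /* ensure results are ready to read */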
- * - * \param symbol_handle symbol handle - * \param dev_type device type of default context - * \param dev_id device id of default context - * \param num_map_keys size of group2ctx map - * \param map_keys keys of group2ctx map - * \param map_dev_types device type of group2ctx map - * \param map_dev_ids device id of group2ctx map - * \param len length - * \param in_args in args array - * \param arg_grad_store arg grads handle array - * \param grad_req_type grad req array - * \param aux_states_len length of auxiliary states - * \param aux_states auxiliary states array - * \param shared_exec input executor handle for memory sharing - * \param out output executor handle - * \return 0 when success, -1 when failure happens - */ -int MXExecutorBindEX(SymbolHandle symbol_handle, - int dev_type, - int dev_id, - uint32_t in, - const char** in, - const int* in, - const int* in, - uint32_t in, - NDArrayHandle *in, - NDArrayHandle *in, - uint32_t *in, - uint32_t in, - NDArrayHandle *in, - ExecutorHandle shared_exec, - ExecutorHandle *out); - -int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, - int dev_type, - int dev_id, - const uint32_t in, // num_g2c_keys, - const char** in, // g2c_keys, - const int* in, // g2c_dev_types, - const int* in, // g2c_dev_ids, - const uint32_t in, // provided_grad_req_list_len, - const char** in, // provided_grad_req_names, - const char** in, // provided_grad_req_types, - const uint32_t in, // num_provided_arg_shapes, - const char** in, // provided_arg_shape_names, - const int* in, // provided_arg_shape_data, - const uint32_t* in, // provided_arg_shape_idx, - const uint32_t in, // num_provided_arg_dtypes, - const char** in, // provided_arg_dtype_names, - const int* in, // provided_arg_dtypes, - const uint32_t in, // num_provided_arg_stypes, - const char** in, // provided_arg_stype_names, - const int* in, // provided_arg_stypes, - const uint32_t in, // num_shared_arg_names, - const char** in, // shared_arg_name_list, - int* shared_buffer_len, - const char** shared_buffer_name_list, - NDArrayHandle* shared_buffer_handle_list, - const char*** updated_shared_buffer_name_list, - NDArrayHandle** updated_shared_buffer_handle_list, - uint32_t* num_in_args, - NDArrayHandle** in_args, - NDArrayHandle** arg_grads, - uint32_t* num_aux_states, - NDArrayHandle** aux_states, - ExecutorHandle shared_exec_handle, - ExecutorHandle* out -); - -int MXExecutorSimpleBindEx64(SymbolHandle symbol_handle, - int dev_type, - int dev_id, - const uint32_t in, // num_g2c_keys, - const char** in, // g2c_keys, - const int* in, // g2c_dev_types, - const int* in, // g2c_dev_ids, - const uint32_t in, // provided_grad_req_list_len, - const char** in, // provided_grad_req_names, - const char** in, // provided_grad_req_types, - const uint32_t in, // num_provided_arg_shapes, - const char** in, // provided_arg_shape_names, - const int64_t* in, // provided_arg_shape_data, - const uint32_t* in, // provided_arg_shape_idx, - const uint32_t in, // num_provided_arg_dtypes, - const char** in, // provided_arg_dtype_names, - const int* in, // provided_arg_dtypes, - const uint32_t in, // num_provided_arg_stypes, - const char** in, // provided_arg_stype_names, - const int* in, // provided_arg_stypes, - const uint32_t in, // num_shared_arg_names, - const char** in, // shared_arg_name_list, - int* shared_buffer_len, - const char** shared_buffer_name_list, - NDArrayHandle* shared_buffer_handle_list, - const char*** updated_shared_buffer_name_list, - NDArrayHandle** updated_shared_buffer_handle_list, - uint32_t* 
num_in_args, - NDArrayHandle** in_args, - NDArrayHandle** arg_grads, - uint32_t* num_aux_states, - NDArrayHandle** aux_states, - ExecutorHandle shared_exec_handle, - ExecutorHandle* out); - -/*! - * \brief Return a new executor with the same symbol and shared memory, - * but different input/output shapes. - * - * \param partial_shaping Whether to allow changing the shape of unspecified arguments. - * \param allow_up_sizing Whether to allow allocating new ndarrays that are larger than the originals. - * \param dev_type device type of default context - * \param dev_id device id of default context - * \param num_map_keys size of group2ctx map - * \param map_keys keys of group2ctx map - * \param map_dev_types device type of group2ctx map - * \param map_dev_ids device id of group2ctx map - * \param num_in_args length of in_args - * \param in_args in args array - * \param arg_grads arg grads handle array - * \param num_aux_states length of auxiliary states - * \param aux_states auxiliary states array - * \param shared_exec input executor handle for memory sharing - * \param out output executor handle - * \return a new executor - */ -int MXExecutorReshapeEx(int partial_shaping, - int allow_up_sizing, - int dev_type, - int dev_id, - uint32_t in, - const char** in, - const int* in, - const int* in, - const uint32_t in, - const char** in, - const int* in, - const uint32_t* in, - uint32_t* couple_out_size, - NDArrayHandle** out_first_array, - NDArrayHandle** out_second_array, - uint32_t* out_size, - NDArrayHandle** out_array, - ExecutorHandle shared_exec, - ExecutorHandle *out); - /*! - * \brief get optimized graph from graph executor - */ -int MXExecutorGetOptimizedSymbol(ExecutorHandle handle, - SymbolHandle *out); - -/*! - * \brief set a call back to notify the completion of operation - */ -int MXExecutorSetMonitorCallback(ExecutorHandle handle, - ExecutorMonitorCallback callback, - void* callback_handle); - -//-------------------------------------------- -// Part 5: IO Interface -//-------------------------------------------- -/*! - * \brief List all the available iterator entries - * \param out_size the size of returned iterators - * \param out_array the output iterator entries - * \return 0 when success, -1 when failure happens - */ -int MXListDataIters(uint32_t *out_size, - DataIterCreator **out_array); -/*! - * \brief Init an iterator with the given parameters, - * the array size of passed in arguments must match num_param - * \param handle the handle of the iterator creator - * \param num_param number of parameter - * \param keys parameter keys - * \param vals parameter values - * \param out resulting iterator - * \return 0 when success, -1 when failure happens - */ -int MXDataIterCreateIter(DataIterCreator handle, - uint32_t in, - const char **keys, - const char **vals, - DataIterHandle *out); -/*! - * \brief Get the detailed information about data iterator. - * \param creator the DataIterCreator. - * \param name The returned name of the creator. - * \param description The returned description of the symbol. - * \param num_args Number of arguments. - * \param arg_names Name of the arguments. - * \param arg_type_infos Type information about the arguments. - * \param arg_descriptions Description information about the arguments. - * \return 0 when success, -1 when failure happens - */ -int MXDataIterGetIterInfo(DataIterCreator creator, - const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions); -/*!
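A sketch of the generic batch loop built from these calls; `iter` is assumed to have been created with MXDataIterCreateIter from one of the registered creators (whose names and parameters MXDataIterGetIterInfo can print):

    /* Sketch: reset, then walk the iterator to exhaustion. */
    int has_next = 0;
    NDArrayHandle batch_data = NULL, batch_label = NULL;
    MX_CHECK(MXDataIterBeforeFirst(iter));
    for (;;) {
      MX_CHECK(MXDataIterNext(iter, &has_next));
      if (!has_next) break;
      MX_CHECK(MXDataIterGetData(iter, &batch_data));
      MX_CHECK(MXDataIterGetLabel(iter, &batch_label));
      /* ... consume the batch ... */
    }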
- * \brief Free the handle to the IO module - * \param handle the handle pointer to the data iterator - * \return 0 when success, -1 when failure happens - */ -int MXDataIterFree(DataIterHandle handle); -/*! - * \brief Move iterator to next position - * \param handle the handle to iterator - * \param out return value of next - * \return 0 when success, -1 when failure happens - */ -int MXDataIterNext(DataIterHandle handle, - int *out); -/*! - * \brief Call iterator.Reset - * \param handle the handle to iterator - * \return 0 when success, -1 when failure happens - */ -int MXDataIterBeforeFirst(DataIterHandle handle); - -/*! - * \brief Get the handle to the NDArray of underlying data - * \param handle the handle pointer to the data iterator - * \param out handle to underlying data NDArray - * \return 0 when success, -1 when failure happens - */ -int MXDataIterGetData(DataIterHandle handle, - NDArrayHandle *out); -/*! - * \brief Get the image index by array. - * \param handle the handle pointer to the data iterator - * \param out_index output index of the array. - * \param out_size output size of the array. - * \return 0 when success, -1 when failure happens - */ -int MXDataIterGetIndex(DataIterHandle handle, - uint64_t **out_index, - uint64_t *out_size); -/*! - * \brief Get the padding number in current data batch - * \param handle the handle pointer to the data iterator - * \param pad pad number ptr - * \return 0 when success, -1 when failure happens - */ -int MXDataIterGetPadNum(DataIterHandle handle, - int *out); - -/*! - * \brief Get the handle to the NDArray of underlying label - * \param handle the handle pointer to the data iterator - * \param out the handle to underlying label NDArray - * \return 0 when success, -1 when failure happens - */ -int MXDataIterGetLabel(DataIterHandle handle, - NDArrayHandle *out); -//-------------------------------------------- -// Part 6: basic KVStore interface -//-------------------------------------------- -/*! - * \brief Initialize ps-lite environment variables - * \param num_vars number of variables to initialize - * \param keys environment keys - * \param vals environment values - */ -int MXInitPSEnv(uint32_t in, - const char **keys, - const char **vals); - -/*! - * \brief Create a kvstore - * \param type the type of KVStore - * \param out The output type of KVStore - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreCreate(const char *type, - KVStoreHandle *out); -/*! - * \brief Delete a KVStore handle. - * \param handle handle to the kvstore - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreFree(KVStoreHandle handle); - -/*! - * \brief Set parameters to use low-bit compressed gradients - * \param handle handle to the kvstore - * \param keys keys for compression parameters - * \param vals values for compression parameters - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreSetGradientCompression(KVStoreHandle handle, - uint32_t in, - const char** keys, - const char** vals); - - -/*! - * \brief Init a list of (key,value) pairs in kvstore, where each key is a string - * \param handle handle to the kvstore - * \param num the number of key-value pairs - * \param keys the list of keys - * \param vals the list of values - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreInitEx(KVStoreHandle handle, - uint32_t in, - const char** in, - NDArrayHandle* in); - /*!
- * \brief Push a list of (key,value) pairs to kvstore, where each key is a string - * \param handle handle to the kvstore - * \param num the number of key-value pairs - * \param keys the list of keys - * \param vals the list of values - * \param priority the priority of the action - * \return 0 when success, -1 when failure happens - */ -int MXKVStorePushEx(KVStoreHandle handle, - uint32_t in, - const char** in, - NDArrayHandle* in, - int priority); - /*! - * \brief pull a list of (key, value) pairs from the kvstore, where each key is a string - * \param handle handle to the kvstore - * \param num the number of key-value pairs - * \param keys the list of keys - * \param vals the list of values - * \param priority the priority of the action - * \return 0 when success, -1 when failure happens - */ -int MXKVStorePullEx(KVStoreHandle handle, - uint32_t in, - const char** in, - NDArrayHandle* in, - int priority); -/*! - * \brief pull a list of (key, value) pairs from the kvstore, where each key is an integer. - * The NDArray pulled back will be in row_sparse storage with only the specified - * row_ids present (other rows are zeros). - * \param handle handle to the kvstore - * \param num the number of key-value pairs - * \param keys the list of keys - * \param vals the list of values - * \param row_ids the list of row_id NDArrays - * \param priority the priority of the action - * \return 0 when success, -1 when failure happens - */ -int MXKVStorePullRowSparse(KVStoreHandle handle, - uint32_t in, - const int* in, - NDArrayHandle* in, - NDArrayHandle* in, - int priority); -/*! - * \brief pull a list of (key, value) pairs from the kvstore, where each key is a string. - * The NDArray pulled back will be in row_sparse storage with only the specified - * row_ids present (other rows are zeros). - * \param handle handle to the kvstore - * \param num the number of key-value pairs - * \param keys the list of keys - * \param vals the list of values - * \param row_ids the list of row_id NDArrays - * \param priority the priority of the action - * \return 0 when success, -1 when failure happens - */ -int MXKVStorePullRowSparseEx(KVStoreHandle handle, - uint32_t in, - const char** in, - NDArrayHandle* in, - NDArrayHandle* in, - int priority); - -/*! - * \brief pull a list of (key, value) pairs from the kvstore, where each key is a string - * \param handle handle to the kvstore - * \param num the number of key-value pairs - * \param keys the list of keys - * \param vals the list of values - * \param priority the priority of the action - * \param ignore_sparse whether to ignore sparse arrays in the request - * \return 0 when success, -1 when failure happens - */ -int MXKVStorePullWithSparseEx(KVStoreHandle handle, - uint32_t in, - const char** in, - NDArrayHandle* in, - int priority, - bool ignore_sparse); - -/*! - * \brief user-defined updater for the kvstore - * It's this updater's responsibility to delete \a recv and \a local - * \param key the key - * \param recv the pushed value on this key - * \param local the value stored on local on this key - * \param handle The additional handle to the updater - */ -typedef void (MXKVStoreUpdater)(int key, - NDArrayHandle recv, - NDArrayHandle local, - void *handle);
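A hedged sketch of the init/push/pull cycle over the string-keyed entry points above; "local" is a real kvstore type, while the key name "w0" and the NDArrayHandles grad and weight (created elsewhere) are assumptions for illustration.

    #include <mxnet/c_api.h>

    /* Initialize one key, push a gradient into it, and pull the result back. */
    int kv_roundtrip(NDArrayHandle grad, NDArrayHandle weight) {
      KVStoreHandle kv;
      if (MXKVStoreCreate("local", &kv) != 0) return -1;

      const char *keys[] = {"w0"};
      NDArrayHandle init_vals[] = {weight};
      MXKVStoreInitEx(kv, 1, keys, init_vals);     /* initialize key "w0" */

      NDArrayHandle push_vals[] = {grad};
      MXKVStorePushEx(kv, 1, keys, push_vals, 0);  /* aggregate the gradient */

      NDArrayHandle pull_vals[] = {weight};
      MXKVStorePullEx(kv, 1, keys, pull_vals, 0);  /* read the updated value */
      return MXKVStoreFree(kv);
    }

-/*!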
- * \brief user-defined updater for the kvstore with string keys - * It's this updater's responsibility to delete \a recv and \a local - * \param key the key - * \param recv the pushed value on this key - * \param local the value stored on local on this key - * \param handle The additional handle to the updater - */ -typedef void (MXKVStoreStrUpdater)(const char* key, - NDArrayHandle recv, - NDArrayHandle local, - void *handle); -/*! - * \brief register a push updater - * \param handle handle to the KVStore - * \param updater updater function - * \param updater_handle The additional handle used to invoke the updater - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreSetUpdater(KVStoreHandle handle, - MXKVStoreUpdater updater, - void *callback_handle); -/*! - * \brief register a push updater with int keys and one with string keys - * \param handle handle to the KVStore - * \param updater updater function with int keys - * \param str_updater updater function with string keys - * \param updater_handle The additional handle used to invoke the updater - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreSetUpdaterEx(KVStoreHandle handle, - MXKVStoreUpdater updater, - MXKVStoreStrUpdater updater, - void *callback_handle); -/*! - * \brief get the type of the kvstore - * \param handle handle to the KVStore - * \param type a string type - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreGetType(KVStoreHandle handle, - const char** out); -//-------------------------------------------- -// Part 6: advanced KVStore for multi-machines -//-------------------------------------------- - -/** - * \brief return the rank of this node in its group, which is in [0, GroupSize). - * - * \param handle handle to the KVStore - * \param ret the node rank - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreGetRank(KVStoreHandle handle, - int *out); - -/** - * \brief return the number of nodes in this group, which is - * - number of workers if `IsWorkerNode() == true`, - * - number of servers if `IsServerNode() == true`, - * - 1 if `IsSchedulerNode() == true`, - * \param handle handle to the KVStore - * \param ret the group size - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreGetGroupSize(KVStoreHandle handle, - int *out); - -/** - * \brief return whether or not this process is a worker node. - * \param ret 1 for yes, 0 for no - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreIsWorkerNode(int *out); - - -/** - * \brief return whether or not this process is a server node. - * \param ret 1 for yes, 0 for no - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreIsServerNode(int *out); - -
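A short sketch of registering a custom updater via MXKVStoreSetUpdater; the callback must match the MXKVStoreUpdater typedef above, and the SGD-style update in the comment is only an assumed example of what a real updater would do.

    #include <stdio.h>
    #include <mxnet/c_api.h>

    /* Invoked by the kvstore for each pushed key: apply `recv` (the pushed
     * gradient) to `local` (the stored value), e.g. local -= lr * recv. */
    static void my_updater(int key, NDArrayHandle recv,
                           NDArrayHandle local, void *handle) {
      printf("update for key %d\n", key);
      /* ... apply the update rule; omitted in this sketch ... */
    }

    void install_updater(KVStoreHandle kv) {
      /* the last argument is an opaque pointer handed back to the callback */
      MXKVStoreSetUpdater(kv, my_updater, NULL);
    }

-/** - * \brief return whether or not this process is a scheduler node.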
- * \param ret 1 for yes, 0 for no - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreIsSchedulerNode(int *out); - -/** - * \brief global barrier among all worker machines - * - * \param handle handle to the KVStore - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreBarrier(KVStoreHandle handle); - -/** - * \brief whether to do a barrier when the kvstore finalizes - * - * \param handle handle to the KVStore - * \param barrier_before_exit whether to do a barrier when the kvstore finalizes - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreSetBarrierBeforeExit(KVStoreHandle handle, - const int barrier_before_exit); - -/** - * \brief the prototype of a server controller - * \param head the head of the command - * \param body the body of the command - * \param controller_handle helper handle for implementing controller - */ -typedef void (MXKVStoreServerController)(int head, - const char *body, - void *controller_handle); - -/** - * \brief Run as server (or scheduler) - * - * \param handle handle to the KVStore - * \param controller the user-defined server controller - * \param controller_handle helper handle for implementing controller - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreRunServer(KVStoreHandle handle, - MXKVStoreServerController controller, - void *callback_handle); - -/** - * \brief Send a command to all server nodes - * - * \param handle handle to the KVStore - * \param cmd_id the head of the command - * \param cmd_body the body of the command - * \return 0 when success, -1 when failure happens - */ -int MXKVStoreSendCommmandToServers(KVStoreHandle handle, - int cmd_id, - const char* cmd_body); -
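The role-query and server functions above combine into the usual launcher pattern, sketched below under stated assumptions: "dist_sync" is one of the real distributed kvstore types, while the empty controller body and the placement of the training loop are illustrative only.

    #include <mxnet/c_api.h>

    /* Controller matching MXKVStoreServerController: reacts to commands
     * sent via MXKVStoreSendCommmandToServers. */
    static void controller(int head, const char *body, void *handle) {
      /* ... interpret (head, body) ... */
    }

    /* Servers and schedulers block in MXKVStoreRunServer; workers train. */
    int run_node(void) {
      KVStoreHandle kv;
      int is_worker = 0, rank = 0, size = 0;
      MXKVStoreCreate("dist_sync", &kv);
      MXKVStoreIsWorkerNode(&is_worker);
      if (!is_worker)                       /* server or scheduler process */
        return MXKVStoreRunServer(kv, controller, NULL);
      MXKVStoreGetRank(kv, &rank);          /* this worker's rank */
      MXKVStoreGetGroupSize(kv, &size);     /* total workers: drives sharding */
      /* ... worker training loop ... */
      MXKVStoreBarrier(kv);                 /* wait for all workers */
      return MXKVStoreFree(kv);
    }

-/** - * \brief Get the number of ps dead node(s) specified by {node_id} - * - * \param handle handle to the KVStore - * \param node_id Can be a node group or a single node.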
- kScheduler = 1, kServerGroup = 2, kWorkerGroup = 4 - * \param number Output number of dead nodes - * \param timeout_sec A node that fails to send a heartbeat in {timeout_sec} seconds - * will be presumed 'dead' - */ -int MXKVStoreGetNumDeadNode(KVStoreHandle handle, - const int node_id, - int *out, - const int timeout_sec = 60); - -/** - * \brief Create a RecordIO writer object - * \param uri path to file - * \param out handle pointer to the created object - * \return 0 when success, -1 when failure happens -*/ -int MXRecordIOWriterCreate(const char *uri, RecordIOHandle *out); - -/** - * \brief Delete a RecordIO writer object - * \param handle handle to RecordIO object - * \return 0 when success, -1 when failure happens -*/ -int MXRecordIOWriterFree(RecordIOHandle handle); - -/** - * \brief Write a record to a RecordIO object - * \param handle handle to RecordIO object - * \param buf buffer to write - * \param size size of buffer - * \return 0 when success, -1 when failure happens -*/ -int MXRecordIOWriterWriteRecord(RecordIOHandle handle, - const char *buf, size_t size); - -/** - * \brief Get the current writer pointer position - * \param handle handle to RecordIO object - * \param pos handle to output position - * \return 0 when success, -1 when failure happens -*/ -int MXRecordIOWriterTell(RecordIOHandle handle, size_t *out); - -/** - * \brief Create a RecordIO reader object - * \param uri path to file - * \param out handle pointer to the created object - * \return 0 when success, -1 when failure happens -*/ -int MXRecordIOReaderCreate(const char *uri, RecordIOHandle *out); - -/** - * \brief Delete a RecordIO reader object - * \param handle handle to RecordIO object - * \return 0 when success, -1 when failure happens -*/ -int MXRecordIOReaderFree(RecordIOHandle handle); - -/** - * \brief Read a record from a RecordIO object - * \param handle handle to RecordIO object - * \param buf pointer to return buffer - * \param size pointer to size of buffer - * \return 0 when success, -1 when failure happens -*/ -int MXRecordIOReaderReadRecord(RecordIOHandle handle, - char const **out_array, size_t *out_size); - -/** - * \brief Set the current reader pointer position - * \param handle handle to RecordIO object - * \param pos target position - * \return 0 when success, -1 when failure happens -*/ -int MXRecordIOReaderSeek(RecordIOHandle handle, size_t pos); - -/** - * \brief Create an MXRtc object -*/ -int MXRtcCreate(char* name, uint32_t in, uint32_t in, - char** in, char** in, - NDArrayHandle* in, NDArrayHandle* in, - char* kernel, RtcHandle *out); - -/** - * \brief Run cuda kernel -*/ -int MXRtcPush(RtcHandle handle, uint32_t in, uint32_t in, - NDArrayHandle* in, NDArrayHandle* in, - uint32_t in, // gridDimX, - uint32_t in, // gridDimY, - uint32_t in, // gridDimZ, - uint32_t in, // blockDimX, - uint32_t in, // blockDimY, - uint32_t in // blockDimZ -); - -/** - * \brief Delete an MXRtc object -*/ -int MXRtcFree(RtcHandle handle); - -/* - * \brief create cuda rtc module - * \param source cuda source code - * \param num_options number of compiler flags - * \param options compiler flags - * \param num_exports number of exported function names - * \param exports exported function names - * \param out handle to created module - */ -int MXRtcCudaModuleCreate(const char* source, int num_options, - const char** in, int num_exports, - const char** in, CudaModuleHandle *out); -/* - * \brief delete cuda rtc module - * \param handle handle to cuda module - */ -int MXRtcCudaModuleFree(CudaModuleHandle handle);
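A minimal sketch of a RecordIO round trip through the writer/reader functions above; the file path is hypothetical and error checks are elided.

    #include <string.h>
    #include <mxnet/c_api.h>

    /* Write one record to a RecordIO file, then read it back. */
    int recordio_roundtrip(void) {
      RecordIOHandle writer, reader;
      const char *payload = "hello";

      MXRecordIOWriterCreate("/tmp/example.rec", &writer);
      MXRecordIOWriterWriteRecord(writer, payload, strlen(payload));
      MXRecordIOWriterFree(writer);

      const char *buf = NULL;
      size_t size = 0;
      MXRecordIOReaderCreate("/tmp/example.rec", &reader);
      MXRecordIOReaderReadRecord(reader, &buf, &size);  /* buf -> the record */
      MXRecordIOReaderFree(reader);
      return (size == strlen(payload)) ? 0 : -1;
    }

-/* - *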
\brief get kernel from module - * \param handle handle to cuda module - * \param name name of kernel function - * \param num_args number of arguments - * \param is_ndarray whether argument is ndarray - * \param is_const whether argument is constant - * \param arg_types data type of arguments - * \param out created kernel - */ -int MXRtcCudaKernelCreate(CudaModuleHandle handle, const char* name, - int num_args, int* in, int* in, - int* in, CudaKernelHandle *out); -/* - * \brief delete kernel - * \param handle handle to previously created kernel - */ -int MXRtcCudaKernelFree(CudaKernelHandle handle); -/* - * \brief launch cuda kernel - * \param handle handle to kernel - * \param dev_id (GPU) device id - * \param args pointer to arguments - * \param grid_dim_x grid dimension x - * \param grid_dim_y grid dimension y - * \param grid_dim_z grid dimension z - * \param block_dim_x block dimension x - * \param block_dim_y block dimension y - * \param block_dim_z block dimension z - * \param shared_mem size of dynamically allocated shared memory - */ -int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** cuda_kernel_args, - uint32_t in, // grid_dim_x, - uint32_t in, // grid_dim_y, - uint32_t in, // grid_dim_z, - uint32_t in, // block_dim_x, - uint32_t in, // block_dim_y, - uint32_t in, // block_dim_z, - uint32_t in // shared_mem -); diff --git a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i deleted file mode 100644 index aad098750810..000000000000 --- a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i +++ /dev/null @@ -1,1390 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -%typemap(in) (const char** in), (char** in) -{ - AV *tempav; - I32 len; - int i; - SV **tv; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - len = av_len(tempav) + 1; - if(len!=0) - { - $1 = (char **) safemalloc((len)*sizeof(char *)); - for (i = 0; i < len; i++) { - tv = av_fetch(tempav, i, 0); - $1[i] = (char *) SvPV_nolen(*tv); - } - } - else - { - $1 = NULL; - } -} -%typemap(freearg) (const char** in), (char** in) { - Safefree($1); -} - -%typemap(in) (const char **keys, const char **vals), (char **keys, char **vals), (const char* const* keys, const char* const* vals) -{ - HV *temphv; - char *key; - SV *val; - I32 len; - int hash_len; - int i = 0; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVHV) - croak("Argument $argnum is not a hash."); - temphv = (HV*)SvRV($input); - hash_len = hv_iterinit(temphv); - if(hash_len) - { - $1 = (char **)safemalloc(hash_len*sizeof(char *)); - $2 = (char **)safemalloc(hash_len*sizeof(char *)); - while ((val = hv_iternextsv(temphv, &key, &len))) - { - $1[i] = key; - $2[i] = SvPV_nolen(val); - ++i; - } - } - else - { - $1 = NULL; - $2 = NULL; - } -} -%typemap(freearg) (const char **keys, const char **vals), (char **keys, char **vals) -{ - Safefree($1); - Safefree($2); -} - -%typemap(in,numinputs=0) (const char **out) (char *temp) -{ - temp = NULL; - $1 = &temp; -} - -%typemap(argout) (const char **out) -{ - if(!result) - { - $result = newSVpv(*$1, 0); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in) (void **out_pdata) (void *temp) -{ - temp = NULL; - $1 = &temp; -} - -%typemap(argout) (void **out_pdata) -{ - if(!result) - { - $result = newSVpvn((char*)(*$1), SvIV(ST(1))); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (int *out) (int temp), (bool *out) (bool temp), (uint64_t *out) (uint64_t temp), (int64_t *out) (int64_t temp) -{ - temp = 0; - $1 = &temp; -} - -%typemap(argout) (int *out), (bool *out) -{ - if(!result) - { - $result = newSViv(*$1); - sv_2mortal($result); - argvi++; - } -} - -%typemap(argout) (uint64_t *out), (int64_t *out) -{ - if(!result) - { - $result = newSVnv((double)(*$1)); - sv_2mortal($result); - argvi++; - } -} - - -%typemap(in,numinputs=0) (const int **out_stypes) (int* temp) -{ - temp = NULL; - $1 = &temp; -} - -%typemap(argout) (const int **out_stypes) -{ - if(av_len((AV*)SvRV(ST(3))) == -1 && !result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*arg4*sizeof(SV *)); - for (i = 0; i < *arg4 ; i++) { - svs[i] = newSViv((*$1)[i]); - sv_2mortal(svs[i]); - } - myav = av_make(*arg4, svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (nn_uint *out_size, const char ***out_array) (nn_uint temp_size, char** temp), - (mx_uint *out_size, const char ***out_array) (mx_uint temp_size, char** temp), - (uint32_t *out_size, const char ***out_array) (uint32_t temp_size, char** temp) -{ - $1 = &temp_size; - *$1 = 0; - $2 = &temp; -} - -%typemap(argout) (nn_uint *out_size, const char ***out_array), - (mx_uint *out_size, const char ***out_array), - (uint32_t *out_size, const char ***out_array) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = newSVpv((*$2)[i],0); - sv_2mortal(svs[i]); - } - myav = av_make(*$1,svs); - 
Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (const LibFeature **libFeature, size_t *size) (LibFeature *temp1, size_t temp2) -{ - $1 = &temp1; - $2 = &temp2; - *$2 = 0; -} - -%typemap(argout) (const LibFeature **libFeature, size_t *size) -{ - if(!result) - { - HV* hash = newHV(); - for(int i = 0; i < *$2; i++) - { - hv_store(hash, ((*$1)[i]).name, strlen(((*$1)[i]).name), newSViv(((*$1)[i]).enabled), 0); - } - $result = newRV_noinc((SV*)hash); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (mx_uint *out_size, const char ***out_array2) (mx_uint temp_size, char** temp), - (uint32_t *out_size, const char ***out_array2) (uint32_t temp_size, char** temp) -{ - $1 = &temp_size; - *$1 = 0; - $2 = &temp; -} - -%typemap(argout) (mx_uint *out_size, const char ***out_array2), - (uint32_t *out_size, const char ***out_array2) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)*2); - for (i = 0; i < *$1*2 ; i++) { - svs[i] = newSVpv((*$2)[i],0); - sv_2mortal(svs[i]); - } - myav = av_make(*$1*2,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in) (uint32_t in), (const uint32_t in), (mx_uint in), (const mx_uint in) -{ - $1 = (uint32_t)SvIV($input); -} - -%typemap(in) (uint64_t in), (const uint64_t in) -{ - $1 = (uint64_t)SvUV($input); -} - -%typemap(in) (int64_t in), (const int64_t in) -{ - $1 = (int64_t)SvUV($input); -} - -%typemap(in) (FunctionHandle in) -{ - int res; - void **void_ptrptr = const_cast< void** >(&$1); - res = SWIG_ConvertPtr($input,void_ptrptr, 0, 0); - if (!SWIG_IsOK(res)) { - SWIG_exception_fail(SWIG_ArgError(res), "in method '" "$symname" "', argument " "$argnum"" of type '" "FunctionHandle""'"); - } -} - -%typemap(in) (AtomicSymbolCreator in) -{ - int res = SWIG_ConvertPtr($input,&$1, 0, 0); - if (!SWIG_IsOK(res)) { - SWIG_exception_fail(SWIG_ArgError(res), "in method '" "$symname" "', argument " "$argnum"" of type '" "AtomicSymbolCreator""'"); - } -} - -%typemap(in) (const void *in), (void *in) -{ - $1 = (void *)SvPV_nolen($input); -} - -%typemap(in) (const char *in) -{ - $1 = SvPV_nolen($input); -} - -%typemap(in) (const mx_uint *in), (mx_uint *in), (const uint32_t *in), (uint32_t *in) -{ - AV *tempav; - int i; - SV **tv; - int av_len; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - av_len = av_len(tempav) + 1; - if(av_len) - { - $1 = (mx_uint *)safemalloc(av_len*sizeof(mx_uint)); - for (i = 0; i < av_len; i++) { - tv = av_fetch(tempav, i, 0); - $1[i] = (mx_uint)SvIV(*tv); - } - } - else - { - $1 = NULL; - } -} - -%typemap(in) (const uint64_t *in), (uint64_t *in) -{ - AV *tempav; - int i; - SV **tv; - int av_len; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - av_len = av_len(tempav) + 1; - if(av_len) - { - $1 = (uint64_t *)safemalloc(av_len*sizeof(uint64_t)); - for (i = 0; i < av_len; i++) { - tv = av_fetch(tempav, i, 0); - $1[i] = (uint64_t)SvUV(*tv); - } - } - else - { - $1 = NULL; - } -} - -%typemap(in) (const int64_t *in), (int64_t *in) -{ - AV *tempav; - int i; - SV **tv; - int av_len; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) 
- croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - av_len = av_len(tempav) + 1; - if(av_len) - { - $1 = (int64_t *)safemalloc(av_len*sizeof(int64_t)); - for (i = 0; i < av_len; i++) { - tv = av_fetch(tempav, i, 0); - $1[i] = (int64_t)SvUV(*tv); - } - } - else - { - $1 = NULL; - } -} - -%typemap(freearg) (const mx_uint *in), (mx_uint *in), (const uint32_t *in), - (uint32_t *in), (const uint64_t *in), (uint64_t *in), (const int64_t *in), (int64_t *in) -{ - Safefree($1); -} - -%typemap(in) (const int *in), (int *in) -{ - AV *tempav; - int i; - SV **tv; - int av_len; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - av_len = av_len(tempav) + 1; - if(av_len) - { - $1 = (int *)safemalloc(av_len*sizeof(int)); - for (i = 0; i < av_len; i++) { - tv = av_fetch(tempav, i, 0); - $1[i] = (int)SvIV(*tv); - } - } - else - { - $1 = NULL; - } - -} - -%typemap(freearg) (const int *in), (int *in) { - Safefree($1); -} - -%typemap(in) (dim_t *in) -{ - AV *tempav; - int i; - SV **tv; - int av_len; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - av_len = av_len(tempav) + 1; - if(av_len) - { - $1 = (dim_t *)safemalloc(av_len*sizeof(dim_t)); - for (i = 0; i < av_len; i++) { - tv = av_fetch(tempav, i, 0); - $1[i] = (dim_t)SvIV(*tv); - } - } - else - { - $1 = NULL; - } -} - -%typemap(freearg) (dim_t *in) { - Safefree($1); -} - -%typemap(in) (NDArrayHandle* in), (SymbolHandle* in) -{ - AV *tempav; - int i; - SV **tv; - int res; - int av_len; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - av_len = av_len(tempav) + 1; - if(av_len) - { - $1 = ($1_type)safemalloc(av_len*sizeof($*1_type)); - for (i = 0; i < av_len; i++) { - tv = av_fetch(tempav, i, 0); - res = SWIG_ConvertPtr(*tv,SWIG_as_voidptrptr(&$1[i]), $*1_descriptor, 0); - if (!SWIG_IsOK(res)) { - SWIG_exception_fail(SWIG_ArgError(res), "in method '" "$symname" "', argument " "$argnum"" of type '" "$*1_type""'"); - } - } - } - else - { - $1 = NULL; - } -} -%typemap(freearg) (NDArrayHandle* in), (SymbolHandle* in) { - Safefree($1); -} - -%typemap(in) (void** cuda_kernel_args) -{ - AV *tempav; - int i; - SV **tv; - int res; - int av_len; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - av_len = av_len(tempav) + 1; - if(av_len) - { - $1 = ($1_type)safemalloc(av_len*sizeof($*1_type)); - for (i = 0; i < av_len; i++) { - tv = av_fetch(tempav, i, 0); - res = SWIG_ConvertPtr(*tv,SWIG_as_voidptrptr(&$1[i]), SWIGTYPE_p_MXNDArray, 0); - if (!SWIG_IsOK(res)) { - $1[i] = (void*)SvPV_nolen(*tv); - } - } - } - else - { - $1 = NULL; - } -} -%typemap(freearg) (void** cuda_kernel_args) { - Safefree($1); -} - -%typemap(in) (mx_float *in), (float *in) -{ - AV *tempav; - int i, len; - SV **tv; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - len = av_len(tempav) + 1; - if(len) - { - $1 = (mx_float *)safemalloc(len*sizeof(mx_float)); - for (i = 0; i < len; i++) { - tv = 
av_fetch(tempav, i, 0); - $1[i] = (mx_float)SvNV(*tv); - } - } - else - { - $1 = NULL; - } -} - -%typemap(freearg) (mx_float *in), (float *in) { - Safefree($1); -} - -%typemap(in,numinputs=0) (NDArrayHandle *out) (NDArrayHandle temp), - (FunctionHandle* out) (FunctionHandle temp), - (SymbolHandle *out) (SymbolHandle temp), - (ExecutorHandle *out) (ExecutorHandle temp), - (DataIterHandle *out) (ExecutorHandle temp), - (KVStoreHandle *out) (KVStoreHandle temp), - (RecordIOHandle *out) (RecordIOHandle temp), - (RtcHandle *out) (RtcHandle temp), - (CachedOpHandle *out) (CachedOpHandle temp), - (CudaModuleHandle *out) (CudaModuleHandle temp), - (CudaKernelHandle *out) (CudaKernelHandle temp) -{ - $1 = &temp; -} -%typemap(argout) (NDArrayHandle *out), (FunctionHandle* out), (SymbolHandle *out), (ExecutorHandle *out), (DataIterHandle *out), - (KVStoreHandle *out), (RecordIOHandle *out), (RtcHandle *out) (RtcHandle temp), (CachedOpHandle *out) (CachedOpHandle temp), - (CudaModuleHandle *out) (CudaModuleHandle temp), (CudaKernelHandle *out) (CudaKernelHandle temp) - -{ - if(!result) - { - $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1), $*1_descriptor, 0); argvi++; - } -} - -%typemap(in) (mx_float **out_pdata) (mx_float *temp_pdata), (float **out_pdata) (float *temp_pdata) -{ - $1 = &temp_pdata; -} -%typemap(argout) (mx_float **out_pdata) -{ - if(!result) - { - AV *myav; - SV **svs; - int len; - int i = 0; - len = SvIV($input); - svs = (SV **)safemalloc(len*sizeof(SV *)); - for (i = 0; i < len ; i++) { - svs[i] = newSVnv((*$1)[i]); - sv_2mortal(svs[i]); - } - myav = av_make(len,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (char const **out_array, size_t *out_size) (char * temp, size_t temp_size) -{ - $2 = &temp_size; - *$2 = 0; - $1 = &temp; -} - -%typemap(argout) (char const **out_array, size_t *out_size) -{ - if(!result) - { - $result = newSVpvn(*$1, *$2); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (size_t *out_size, char const **out_array) (size_t temp_size, char *temp) -{ - $1 = &temp_size; - *$1 = 0; - $2 = &temp; -} - -%typemap(argout) (size_t *out_size, char const **out_array) -{ - if(!result) - { - $result = newSVpvn(*$2, *$1); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (int *out_dim, const int **out_pdata) (int temp_dim, int *temp_pdata), - (int *out_dim, const int64_t **out_pdata) (int temp_dim, int64_t *temp_pdata) -{ - $1 = &temp_dim; - $2 = &temp_pdata; -} - -%typemap(argout) (int *out_dim, const int64_t **out_pdata) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = newSVnv((double)((*$2)[i])); - sv_2mortal(svs[i]); - } - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(argout) (int *out_dim, const int **out_pdata) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = newSViv((*$2)[i]); - sv_2mortal(svs[i]); - } - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (uint64_t **out_index, uint64_t *out_size) (uint64_t *temp1, uint64_t temp2) -{ - $1 = &temp1; - $2 = &temp2; - *$2 = 0; -} - -%typemap(argout) (uint64_t **out_index, uint64_t *out_size) -{ - if(!result) - { - AV *myav; - SV **svs; - 
uint64_t i = 0; - svs = (SV **)safemalloc(*$2*sizeof(SV *)); - for (i = 0; i < *$2 ; i++) { - svs[i] = newSVnv((double)((*$1)[i])); - sv_2mortal(svs[i]); - } - myav = av_make(*$2,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (mx_uint *out_size, FunctionHandle** out_array) (mx_uint temp_size, FunctionHandle* temp), - (mx_uint *out_size, AtomicSymbolCreator** out_array) (mx_uint temp_size, AtomicSymbolCreator* temp), - (mx_uint *out_size, DataIterCreator **out_array) (mx_uint temp_size, DataIterCreator* temp), - (mx_uint *out_size, NDArrayHandle** out_array) (mx_uint temp_size, NDArrayHandle* temp), - (uint32_t *out_size, FunctionHandle** out_array) (uint32_t temp_size, FunctionHandle* temp), - (uint32_t *out_size, AtomicSymbolCreator** out_array) (uint32_t temp_size, AtomicSymbolCreator* temp), - (uint32_t *out_size, DataIterCreator **out_array) (uint32_t temp_size, DataIterCreator* temp), - (uint32_t *out_size, NDArrayHandle** out_array) (uint32_t temp_size, NDArrayHandle* temp) - -{ - $1 = &temp_size; - *$1 = 0; - $2 = &temp; -} - -// many argouts needed because SWIG can't $**2_mangle -%typemap(argout) (mx_uint *out_size, AtomicSymbolCreator** out_array), - (uint32_t *out_size, AtomicSymbolCreator** out_array) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_MXAtomicSymbolCreator, 0); - } - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(argout) (mx_uint *out_size, FunctionHandle** out_array), - (uint32_t *out_size, FunctionHandle** out_array) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_MXFunction, 0); - } - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(argout) (mx_uint *out_size, DataIterCreator **out_array), - (uint32_t *out_size, DataIterCreator **out_array) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_MXDataIterCreator, 0); - } - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(argout) (mx_uint *out_size, NDArrayHandle** out_array), - (uint32_t *out_size, NDArrayHandle** out_array) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_MXNDArray, 0); - } - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (mx_uint* couple_out_size, NDArrayHandle** out_first_array, NDArrayHandle** out_second_array) - (mx_uint t, NDArrayHandle* t1, NDArrayHandle* t2), - (uint32_t* couple_out_size, NDArrayHandle** out_first_array, NDArrayHandle** out_second_array) - (uint32_t t, NDArrayHandle* t1, NDArrayHandle* t2) -{ - $1 = &t; - *$1 = 0; - $2 = &t1; - $3 = &t2; -} - -%typemap(argout) (mx_uint* couple_out_size, NDArrayHandle** out_first_array, NDArrayHandle** out_second_array), 
- (uint32_t* couple_out_size, NDArrayHandle** out_first_array, NDArrayHandle** out_second_array) -{ - if(!result) - { - AV *container, *in_args, *arg_grads; - int i; - container = newAV(); - in_args = newAV(); - arg_grads = newAV(); - for (i = 0; i < *$1 ; i++) { - av_push(in_args, SvREFCNT_inc(SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_MXNDArray, 0))); - av_push(arg_grads, SvREFCNT_inc(SWIG_NewPointerObj(SWIG_as_voidptr((*$3)[i]), SWIGTYPE_p_MXNDArray, 0))); - } - av_push(container, newRV_noinc((SV*)in_args)); - av_push(container, newRV_noinc((SV*)arg_grads)); - $result = newRV_noinc((SV*)container); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (NDArrayHandle **out_grad) (NDArrayHandle* temp) -{ - int vars = SvIV(ST(3)); - if(vars) - { - $1 = &temp; - } - else - { - $1 = NULL; - } -} - - -%typemap(argout) (NDArrayHandle** out_grad) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - int len = SvIV(ST(3)); - svs = (SV **)safemalloc(len*sizeof(SV *)); - for (i = 0; i < len ; i++) { - svs[i] = SWIG_NewPointerObj(SWIG_as_voidptr((*$1)[i]), SWIGTYPE_p_MXNDArray, 0); - } - myav = av_make(len,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (int **out_stype) (int *temp) -{ - int vars = SvIV(ST(3)); - if(vars) - { - $1 = &temp; - } - else - { - $1 = NULL; - } -} - -%typemap(argout) (int** out_stype) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - int len = SvIV(ST(3)); - svs = (SV **)safemalloc(len*sizeof(SV *)); - for (i = 0; i < len ; i++) { - svs[i] = newSViv((*$1)[i]); - } - myav = av_make(len,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in) (int *out_size, NDArrayHandle** out_array) (int temp, NDArrayHandle* temp_array) -{ - AV *tempav; - int i; - SV **tv; - int res; - int av_len; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - av_len = av_len(tempav) + 1; - temp_array = NULL; - if(av_len) - { - temp_array = (void**)safemalloc(av_len*sizeof(void*)); - for (i = 0; i < av_len; i++) { - tv = av_fetch(tempav, i, 0); - res = SWIG_ConvertPtr(*tv,SWIG_as_voidptrptr(&(temp_array[i])), 0, 0); - if (!SWIG_IsOK(res)) { - SWIG_exception_fail(SWIG_ArgError(res), "in method '" "$symname" "', argument " "$argnum"" of type '" "NDArray""'"); - } - } - } - temp = av_len; - $1 = &temp; - $2 = &temp_array; -} - -%typemap(freearg) (int *out_size, NDArrayHandle** out_array) { - if(av_len((AV*)SvRV(ST(3))) > -1) - { - Safefree(*$2); - } -} - -%typemap(argout) (int *out_size, NDArrayHandle** out_array) -{ - SV **svs; - int i = 0; - if(av_len((AV*)SvRV(ST(3))) == -1) - { - if(!result) - { - AV *container = newAV(); - for (i = 0; i < *$1 ; i++) { - av_push(container, SvREFCNT_inc(SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_MXNDArray, 0))); - } - $result = newRV_noinc((SV*)container); - sv_2mortal($result); - argvi++; - } - } -} - -%typemap(in,numinputs=0) (const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions - ) - (char *name_temp, - char *desc_temp, - uint32_t num_args_temp, - char **names_temp, - char **types_temp, - char **descs_temp - ) -{ - $1 = &name_temp; - $2 = &desc_temp; - $3 = &num_args_temp; - *$3 = 0; - $4 = &names_temp; - $5 = &types_temp; - $6 = 
&descs_temp; -} - -%typemap(argout) (const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions - ) -{ - if(!result) - { - AV *container, *names, *types, *descs; - int i; - container = newAV(); - names = newAV(); - types = newAV(); - descs = newAV(); - if($1) av_push(container, newSVpv(*$1,0)); - if($2) av_push(container, newSVpv(*$2,0)); - if($3) - { - for (i = 0; i < *$3 ; i++) { - av_push(names, newSVpv((*$4)[i],0)); - av_push(types, newSVpv((*$5)[i],0)); - av_push(descs, newSVpv((*$6)[i],0)); - } - } - av_push(container, newRV_noinc((SV*)names)); - av_push(container, newRV_noinc((SV*)types)); - av_push(container, newRV_noinc((SV*)descs)); - $result = newRV_noinc((SV*)container); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions, - const char **key_var_num_args - ) - (char *name_temp, - char *desc_temp, - uint32_t num_args_temp, - char **names_temp, - char **types_temp, - char **descs_temp, - char *key_temp - ) -{ - $1 = &name_temp; - $2 = &desc_temp; - $3 = &num_args_temp; - *$3 = 0; - $4 = &names_temp; - $5 = &types_temp; - $6 = &descs_temp; - $7 = &key_temp; -} - -%typemap(argout) (const char **name, - const char **description, - uint32_t *num_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions, - const char **key_var_num_args - ) -{ - if(!result) - { - AV *container, *names, *types, *descs; - int i; - container = newAV(); - names = newAV(); - types = newAV(); - descs = newAV(); - if($1) av_push(container, newSVpv(*$1,0)); - if($2) av_push(container, newSVpv(*$2,0)); - if($3) - { - for (i = 0; i < *$3 ; i++) { - av_push(names, newSVpv((*$4)[i],0)); - av_push(types, newSVpv((*$5)[i],0)); - av_push(descs, newSVpv((*$6)[i],0)); - } - } - av_push(container, newRV_noinc((SV*)names)); - av_push(container, newRV_noinc((SV*)types)); - av_push(container, newRV_noinc((SV*)descs)); - if($7) av_push(container, newSVpv(*$7,0)); - $result = newRV_noinc((SV*)container); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (uint32_t *out) (uint32_t temp), (size_t *out) (size_t temp) -{ - $1 = &temp; - *$1 = 0; -} - -%typemap(argout) (uint32_t *out), (size_t *out) -{ - if(!result) - { - $result = newSViv(*$1); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (uint32_t *in_shape_size, const int **in_shape_ndim, const int ***in_shape_data) - (uint32_t temp1, int *temp2, int **temp3), - (uint32_t *out_shape_size, const int **out_shape_ndim, const int ***out_shape_data) - (uint32_t temp1, int *temp2, int **temp3), - (uint32_t *aux_shape_size, const int **aux_shape_ndim, const int ***aux_shape_data) - (uint32_t temp1, int *temp2, int **temp3), - (size_t *in_shape_size, const int **in_shape_ndim, const int64_t ***in_shape_data) - (size_t temp1, int *temp2, int64_t **temp3), - (size_t *out_shape_size, const int **out_shape_ndim, const int64_t ***out_shape_data) - (size_t temp1, int *temp2, int64_t **temp3), - (size_t *aux_shape_size, const int **aux_shape_ndim, const int64_t ***aux_shape_data) - (size_t temp1, int *temp2, int64_t **temp3) -{ - $1 = &temp1; - $2 = &temp2; - $3 = &temp3; - *$1 = 0; -} - -%typemap(argout) (uint32_t *in_shape_size, const int **in_shape_ndim, const int ***in_shape_data), - (uint32_t *out_shape_size, const int 
**out_shape_ndim, const int ***out_shape_data), - (uint32_t *aux_shape_size, const int **aux_shape_ndim, const int ***aux_shape_data) -{ - if(!result && *arg15) - { - AV *container; - AV *tmp; - int i, j; - container = newAV(); - for (i = 0; i < *$1 ; i++) - { - tmp = newAV(); - int len = (*$2)[i]; - for (j = 0; j < len ; j++) - { - av_push(tmp, newSViv((*$3)[i][j])); - } - av_push(container, newRV((SV*)tmp)); - } - $result = newRV_noinc((SV*)container); - sv_2mortal($result); - argvi++; - } -} - -%typemap(argout) (size_t *in_shape_size, const int **in_shape_ndim, const int64_t ***in_shape_data), - (size_t *out_shape_size, const int **out_shape_ndim, const int64_t ***out_shape_data), - (size_t *aux_shape_size, const int **aux_shape_ndim, const int64_t ***aux_shape_data) -{ - if(!result && *arg15) - { - AV *container; - AV *tmp; - size_t i; - int j; - container = newAV(); - for (i = 0; i < *$1 ; i++) - { - tmp = newAV(); - int len = (*$2)[i]; - for (j = 0; j < len ; j++) - { - av_push(tmp, newSVnv((double)((*$3)[i][j]))); - } - av_push(container, newRV((SV*)tmp)); - } - $result = newRV_noinc((SV*)container); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (uint32_t *in_type_size, const int **in_type_data) - (uint32_t temp1, int *temp2), - (uint32_t *out_type_size, const int **out_type_data) - (uint32_t temp1, int *temp2), - (uint32_t *aux_type_size, const int **aux_type_data) - (uint32_t temp1, int *temp2) -{ - $1 = &temp1; - $2 = &temp2; - *$1 = 0; -} - -%typemap(argout) (uint32_t *in_type_size, const int **in_type_data), - (uint32_t *out_type_size, const int **out_type_data), - (uint32_t *aux_type_size, const int **aux_type_data) - -{ - if(!result && *arg11) - { - AV *container; - int i; - container = newAV(); - for (i = 0; i < *$1 ; i++) - { - av_push(container, newSViv((*$2)[i])); - } - $result = newRV_noinc((SV*)container); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (uint32_t* num_in_args, - NDArrayHandle** in_args, - NDArrayHandle** arg_grads) - (uint32_t temp1, - NDArrayHandle* temp2, - NDArrayHandle* temp3) -{ - $1 = &temp1; - $2 = &temp2; - $3 = &temp3; - *$1 = 0; -} - -%typemap(argout) (uint32_t* num_in_args, - NDArrayHandle** in_args, - NDArrayHandle** arg_grads) -{ - if(!result) - { - AV *container1 = newAV(); - AV *container2 = newAV(); - for (int i = 0; i < *$1 ; i++) - { - av_push(container1, SvREFCNT_inc(SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_MXNDArray, 0))); - av_push(container2, (*$3)[i] ? 
SvREFCNT_inc(SWIG_NewPointerObj(SWIG_as_voidptr((*$3)[i]), SWIGTYPE_p_MXNDArray, 0)) : newSV(0)); - } - $result = newRV_noinc((SV*)container1); - sv_2mortal($result); - argvi++; - $result = newRV_noinc((SV*)container2); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (uint32_t* num_aux_states, - NDArrayHandle** aux_states) - (uint32_t temp1, - NDArrayHandle* temp2) -{ - $1 = &temp1; - $2 = &temp2; - *$1 = 0; -} - -%typemap(argout) (uint32_t* num_aux_states, - NDArrayHandle** aux_states) -{ - if(!result) - { - AV *container = newAV(); - for (int i = 0; i < *$1 ; i++) - { - av_push(container, SvREFCNT_inc(SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_MXNDArray, 0))); - } - $result = newRV_noinc((SV*)container); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in) (int* shared_buffer_len, - const char** shared_buffer_name_list, - NDArrayHandle* shared_buffer_handle_list, - const char*** updated_shared_buffer_name_list, - NDArrayHandle** updated_shared_buffer_handle_list) - (int temp1, - char* temp2, - NDArrayHandle temp3, - char** temp4, - NDArrayHandle* temp5) -{ - HV *temphv; - char *key; - SV *val; - I32 len; - int res; - int i = 0; - int hash_len; - $1 = &temp1; - $2 = &temp2; - $3 = &temp3; - $4 = &temp4; - $5 = &temp5; - if (!SvROK($input)) - { - *$1 = -1; - $2 = NULL; - $3 = NULL; - } - else - { - if (SvTYPE(SvRV($input)) != SVt_PVHV) - croak("Argument $argnum is not a hash."); - temphv = (HV*)SvRV($input); - *$1 = hv_iterinit(temphv); - if(*$1) - { - $2 = (char**)safemalloc((*$1)*sizeof(char*)); - $3 = (void**)safemalloc((*$1)*sizeof(void*)); - while ((val = hv_iternextsv(temphv, &key, &len))) - { - $2[i] = key; - res = SWIG_ConvertPtr(val,SWIG_as_voidptrptr(&($3[i])), 0, 0); - if (!SWIG_IsOK(res)) { - SWIG_exception_fail(SWIG_ArgError(res), "in method '" "$symname" "', argument " "$argnum"" of type '" "NDArray""'"); - } - i++; - } - } - else - { - $2 = NULL; - $3 = NULL; - } - } -} - -%typemap(freearg) (int* shared_buffer_len, - const char** shared_buffer_name_list, - NDArrayHandle* shared_buffer_handle_list, - const char*** updated_shared_buffer_name_list, - NDArrayHandle** updated_shared_buffer_handle_list) -{ - Safefree($2); - Safefree($3); -} - -%typemap(argout) (int* shared_buffer_len, - const char** shared_buffer_name_list, - NDArrayHandle* shared_buffer_handle_list, - const char*** updated_shared_buffer_name_list, - NDArrayHandle** updated_shared_buffer_handle_list) - -{ - if(!result) - { - HV* hash = newHV(); - for(int j = 0; j < *$1; j++) - { - hv_store(hash, (*$4)[j], strlen((*$4)[j]), SvREFCNT_inc(SWIG_NewPointerObj(SWIG_as_voidptr((*$5)[j]), SWIGTYPE_p_MXNDArray, 0)), 0); - } - $result = newRV_noinc((SV*)hash); - sv_2mortal($result); - argvi++; - } -} - - -%typemap(in) (uint32_t x) -{ - union fbits u; - u.f = SvNV($input); - $1 = u.x; -} - -%typemap(out) (uint16_t) -{ - $result = newSViv($1); - sv_2mortal($result); - argvi++; -} - -%typemap(in) (uint16_t x) -{ - $1 = SvIV($input); -} - -%typemap(out) (uint32_t) -{ - union fbits u; - u.x = $1; - $result = newSVnv(u.f); - sv_2mortal($result); - argvi++; -} - -%typemap(in,numinputs=0) (MXKVStoreUpdater* updater) -{ - $1 = KVStore_callback; -} - -%typemap(in,numinputs=0) (MXKVStoreStrUpdater* updater) -{ - $1 = KVStoreStr_callback; -} - -%typemap(in,numinputs=0) (MXKVStoreServerController* controller) -{ - $1 = KVStoreServer_callback; -} - -%typemap(in,numinputs=0) (ExecutorMonitorCallback callback) -{ - $1 = ExecutorMonitor_callback; -} - -%typemap(in) (void* callback_handle) -{ - $1 = 
(void*)newSVsv($input); -} diff --git a/perl-package/AI-MXNetCAPI/t/AI-MXNetCAPI.t b/perl-package/AI-MXNetCAPI/t/AI-MXNetCAPI.t deleted file mode 100644 index 02171bc9dd88..000000000000 --- a/perl-package/AI-MXNetCAPI/t/AI-MXNetCAPI.t +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 1; -BEGIN { use_ok('AI::MXNetCAPI') }; - diff --git a/perl-package/AI-NNVMCAPI/Changes b/perl-package/AI-NNVMCAPI/Changes deleted file mode 100644 index cf9628ab8d1c..000000000000 --- a/perl-package/AI-NNVMCAPI/Changes +++ /dev/null @@ -1,28 +0,0 @@ -Revision history for Perl extension AI::NNVMCAPI. -1.4 Sun Feb 16 19:56:17 PST 2020 - - use strict - -1.3 Tue Jun 26 20:57:40 PDT 2018 - - Major update, Gluon interface updated to parity with Python's API - -1.2 Sun Mar 4 16:29:19 PST 2018 - - Support for sparse tensors - -1.1 Sun Sep 24 10:26:54 PDT 2017 - - support for perl 5.14 - -1.01 Sat Jun 10 23:57:27 PDT 2017 - - sync with python. - -0.95 Sun Mar 26 17:42:02 PDT 2017 - - visible on https://mxnet.io - -0.03 Sat Feb 25 13:21:07 PST 2017 - - sync up with the Python interface. - -0.02 Tue Feb 14 07:28:11 PST 2017 - - prepared for inclusion to the mxnet code repository. 
- -0.01 Fri Jan 6 19:40:53 2017 - - original version - diff --git a/perl-package/AI-NNVMCAPI/MANIFEST b/perl-package/AI-NNVMCAPI/MANIFEST deleted file mode 100644 index 4cbdbafb3abc..000000000000 --- a/perl-package/AI-NNVMCAPI/MANIFEST +++ /dev/null @@ -1,10 +0,0 @@ -Changes -Makefile.PL -MANIFEST -META.json -META.yml -README -t/AI-NNVMCAPI.t -lib/AI/NNVMCAPI.pm -nnvm.i -nnvm_typemaps.i diff --git a/perl-package/AI-NNVMCAPI/META.json b/perl-package/AI-NNVMCAPI/META.json deleted file mode 100644 index 5f8364e91847..000000000000 --- a/perl-package/AI-NNVMCAPI/META.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "abstract" : "Swig interface to nnvm c api", - "author" : [ - "Sergey Kolychev " - ], - "dynamic_config" : 0, - "generated_by" : "ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240", - "license" : [ - "apache_2_0" - ], - "meta-spec" : { - "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", - "version" : "2" - }, - "name" : "AI-NNVMCAPI", - "no_index" : { - "directory" : [ - "t", - "inc" - ] - }, - "prereqs" : { - "build" : { - "requires" : { - "ExtUtils::MakeMaker" : "0" - } - }, - "configure" : { - "requires" : { - "ExtUtils::MakeMaker" : "0" - } - }, - "runtime" : { - "requires" : { - "Test::More" : "0" - } - } - }, - "release_status" : "stable", - "version" : "1.4" -} diff --git a/perl-package/AI-NNVMCAPI/META.yml b/perl-package/AI-NNVMCAPI/META.yml deleted file mode 100644 index e7b29f377a23..000000000000 --- a/perl-package/AI-NNVMCAPI/META.yml +++ /dev/null @@ -1,39 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - ---- -abstract: 'Swig interface to nnvm c api' -author: - - 'Sergey Kolychev ' -build_requires: - ExtUtils::MakeMaker: '0' -configure_requires: - ExtUtils::MakeMaker: '0' -dynamic_config: 0 -generated_by: 'ExtUtils::MakeMaker version 7.24, CPAN::Meta::Converter version 2.143240' -license: apache -meta-spec: - url: http://module-build.sourceforge.net/META-spec-v1.4.html - version: '1.4' -name: AI-NNVMCAPI -no_index: - directory: - - t - - inc -requires: - Test::More: '0' -version: '1.4' diff --git a/perl-package/AI-NNVMCAPI/Makefile.PL b/perl-package/AI-NNVMCAPI/Makefile.PL deleted file mode 100644 index 50bc1a9426b5..000000000000 --- a/perl-package/AI-NNVMCAPI/Makefile.PL +++ /dev/null @@ -1,59 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use ExtUtils::MakeMaker; -`swig -noproxy -c++ -perl nnvm.i`; -unlink "NNVMCAPI.pm"; -my @tmp = split(/ /, $ExtUtils::MakeMaker::Config{lddlflags}); -my @lddlflags; -while(my $flag = shift(@tmp)) -{ - if($flag eq '-arch') - { - my $arch = shift(@tmp); - if($arch eq 'i386') - { - next; - } - else - { - push @lddlflags, ($flag, $arch); - } - } - else - { - push @lddlflags, $flag; - } -} - -WriteMakefile( - NAME => 'AI::NNVMCAPI', - LICENSE => 'apache_2_0', - AUTHOR => 'Sergey Kolychev ', - VERSION_FROM => 'lib/AI/NNVMCAPI.pm', - ABSTRACT_FROM => 'lib/AI/NNVMCAPI.pm', - LIBS => ['-L../../lib -lmxnet'], - INC => '-I../../3rdparty/tvm/nnvm/include/nnvm', - OBJECT => 'nnvm_wrap.o', - LDDLFLAGS => join(' ', @lddlflags), - PREREQ_PM => { - # prereqs - # build/test prereqs - 'Test::More' => 0, - }, - PL_FILES => {}, -); diff --git a/perl-package/AI-NNVMCAPI/README b/perl-package/AI-NNVMCAPI/README deleted file mode 100644 index e190fefbde4a..000000000000 --- a/perl-package/AI-NNVMCAPI/README +++ /dev/null @@ -1,24 +0,0 @@ -AI-NNVMCAPI version 1.4 -===================== - -Swig interface to MXNet c api. - -INSTALLATION - -To install this module type the following: - - perl Makefile.PL - make - make test - make install - -DEPENDENCIES - -This module requires mxnet https://mxnet.io -It's used by AI::MXNet. - -COPYRIGHT AND LICENCE - -This library is licensed under Apache 2.0 license https://www.apache.org/licenses/LICENSE-2.0 - - diff --git a/perl-package/AI-NNVMCAPI/lib/AI/NNVMCAPI.pm b/perl-package/AI-NNVMCAPI/lib/AI/NNVMCAPI.pm deleted file mode 100644 index 40f0d35501ea..000000000000 --- a/perl-package/AI-NNVMCAPI/lib/AI/NNVMCAPI.pm +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -package AI::NNVMCAPI; -use strict; -use base qw(DynaLoader); -bootstrap AI::NNVMCAPI; -our $VERSION = '1.4'; -1; -__END__ - -=head1 NAME - -AI::NNVMCAPI - Swig interface to nnvm c api - -=head1 SYNOPSIS - - use AI::NNVMCAPI; - -=head1 DESCRIPTION - -This module provides interface to nnvm -via its api. - -=head1 SEE ALSO - -L - -=head1 AUTHOR - -Sergey Kolychev, - -=head1 COPYRIGHT & LICENSE - -This library is licensed under Apache 2.0 license. - -See https://www.apache.org/licenses/LICENSE-2.0 for more information. 
- -=cut diff --git a/perl-package/AI-NNVMCAPI/nnvm.i b/perl-package/AI-NNVMCAPI/nnvm.i deleted file mode 100644 index d40eb40ca930..000000000000 --- a/perl-package/AI-NNVMCAPI/nnvm.i +++ /dev/null @@ -1,365 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -%module "AI::NNVMCAPI" -%include typemaps.i -%rename("%(strip:[NN])s") ""; -%init %{ - /* These SWIG_TypeClientData() calls might break in the future, but - * %rename should work on these types before that happens. */ - SWIG_TypeClientData(SWIGTYPE_p_NNOp, (void *)"OpHandle"); - SWIG_TypeClientData(SWIGTYPE_p_NNSymbol, (void *)"SymbolHandle"); - SWIG_TypeClientData(SWIGTYPE_p_NNGraph, (void *)"GraphHandle"); -%} -%inline %{ -#include -%} -%include nnvm_typemaps.i - -/*! \brief manually define unsigned int */ -typedef unsigned int nn_uint; -/*! \brief handle to a function that takes param and creates symbol */ -typedef NNOp *OpHandle; -/*! \brief handle to a symbol that can be bound as an operator */ -typedef NNSymbol *SymbolHandle; -/*! \brief handle to Graph */ -typedef NNGraph *GraphHandle; -/*! - * \brief Set the last error message needed by C API - * \param msg The error message to set. - */ -void NNAPISetLastError(const char* msg); -/*! - * \brief return str message of the last error - * all functions in this file return 0 when successful - * and -1 when an error occurred, - * NNGetLastError can be called to retrieve the error - * - * this function is threadsafe and can be called by different threads - * \return error info - */ -const char *NNGetLastError(void); -/*! - * \brief list all the available operator names, include entries. - * \param out_size the size of returned array - * \param out_array the output operator name array. - * \return 0 when success, -1 when failure happens - */ -int NNListAllOpNames(nn_uint *out_size, const char*** out_array); -/*! - * \brief Get operator handle given name. - * \param op_name The name of the operator. - * \param op_out The returned op handle. - */ -int NNGetOpHandle(const char* op_name, - OpHandle* out); -/*! - * \brief list all the available operators. - * This won't include the alias, use ListAllNames - * instead to get all alias names. - * - * \param out_size the size of returned array - * \param out_array the output AtomicSymbolCreator array - * \return 0 when success, -1 when failure happens - */ -int NNListUniqueOps(nn_uint *out_size, - OpHandle **out_array);
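As a sketch of the operator-discovery functions this removed interface wrapped, the C fragment below lists every registered operator and fetches a handle for one of them; the include path is an assumption, and the error path relies on NNGetLastError as documented above.

    #include <stdio.h>
    #include <nnvm/c_api.h>   /* assumed include path for the NNVM C API */

    /* Print all registered operator names, then grab a handle for the first. */
    int list_ops(void) {
      nn_uint num_ops = 0;
      const char **names = NULL;
      if (NNListAllOpNames(&num_ops, &names) != 0) {
        fprintf(stderr, "%s\n", NNGetLastError());
        return -1;
      }
      for (nn_uint i = 0; i < num_ops; ++i)
        printf("%s\n", names[i]);

      OpHandle op;
      return NNGetOpHandle(names[0], &op);   /* handle for the first op */
    }

-/*! - * \brief Get the detailed information about atomic symbol. - * \param op The operator handle. - * \param real_name The returned name of the creator. - * This name is not the alias name of the atomic symbol. - * \param description The returned description of the symbol. - * \param num_doc_args Number of arguments that contain documents.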
- * \param arg_names Name of the arguments of doc args
- * \param arg_type_infos Type information about the arguments.
- * \param arg_descriptions Description information about the arguments.
- * \param return_type Return type of the function, if any.
- * \return 0 when success, -1 when failure happens
- */
-int NNGetOpInfo(OpHandle op,
-                const char **real_name,
-                const char **description,
-                nn_uint *num_doc_args,
-                const char ***arg_names,
-                const char ***arg_type_infos,
-                const char ***arg_descriptions,
-                const char **return_type);
-/*!
- * \brief Create an AtomicSymbol functor.
- * \param op The operator handle
- * \param num_param the number of parameters
- * \param keys the keys to the params
- * \param vals the vals of the params
- * \param out pointer to the created symbol handle
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolCreateAtomicSymbol(OpHandle op,
-                               nn_uint num_param,
-                               const char **keys,
-                               const char **vals,
-                               SymbolHandle *out);
-/*!
- * \brief Create a Variable Symbol.
- * \param name name of the variable
- * \param out pointer to the created symbol handle
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolCreateVariable(const char *name, SymbolHandle *out);
-/*!
- * \brief Create a Symbol by grouping a list of symbols together
- * \param num_symbols number of symbols to be grouped
- * \param symbols array of symbol handles
- * \param out pointer to the created symbol handle
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolCreateGroup(nn_uint num_symbols,
-                        SymbolHandle *in,
-                        SymbolHandle *out);
-/*!
- * \brief Add src_dep to the handle as a control dependency.
- * \param handle The symbol to add dependency edges on.
- * \param src_dep the source handles.
- */
-int NNAddControlDeps(SymbolHandle in,
-                     SymbolHandle in);
-/*!
- * \brief Free the symbol handle.
- * \param symbol the symbol
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolFree(SymbolHandle in);
-/*!
- * \brief Copy the symbol to another handle
- * \param symbol the source symbol
- * \param out used to hold the result of the copy
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolCopy(SymbolHandle in, SymbolHandle *out);
-/*!
- * \brief Print the content of the symbol, used for debugging.
- * \param symbol the symbol
- * \param out_str pointer to hold the output string of the printing.
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolPrint(SymbolHandle in, const char **out);
-/*!
- * \brief Get string attribute from symbol
- * \param symbol the source symbol
- * \param key The key of the symbol.
- * \param out The result attribute, which can be NULL if the attribute does not exist.
- * \param success Whether the result is contained in out.
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolGetAttr(SymbolHandle in,
-                    const char* key,
-                    const char** out,
-                    int *out);
-/*!
- * \brief Set string attribute from symbol.
- * NOTE: Setting an attribute on a symbol can affect the semantics (mutable/immutable) of the symbolic graph.
- *
- * Safe recommendation: use an immutable graph
- * - Only allow setting attributes during creation of a new symbol as an optional parameter
- *
- * Mutable graph (be careful about the semantics):
- * - Allow setting attr at any point.
- * - Mutating an attribute of a node shared by two graphs can cause confusion for users.
- *
- * \param symbol the source symbol
- * \param num_param Number of parameters to set.
- * \param keys The keys of the attribute
- * \param values The value to be set
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolSetAttrs(SymbolHandle in,
-                     nn_uint num_param,
-                     const char** keys,
-                     const char** vals);
-/*!
- * \brief Get all attributes from the symbol, including all descendants.
- * \param symbol the source symbol
- * \param recursive_option 0 for recursive, 1 for shallow.
- * \param out_size The number of output attributes
- * \param out 2*out_size strings representing key-value pairs.
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolListAttrs(SymbolHandle in,
-                      int recursive_option,
-                      nn_uint *half_of_out_size,
-                      const char*** out_array);
-/*!
- * \brief List input variables in the symbol.
- * \param symbol the symbol
- * \param option The option to list the inputs
- *               option=0 means list all arguments.
- *               option=1 means list arguments that are read only by the graph.
- *               option=2 means list arguments that are mutated by the graph.
- * \param out_size output size
- * \param out_sym_array the output array.
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolListInputVariables(SymbolHandle in,
-                               int option,
-                               nn_uint *out_size,
-                               SymbolHandle** out_array);
-/*!
- * \brief List input names in the symbol.
- * \param symbol the symbol
- * \param option The option to list the inputs
- *               option=0 means list all arguments.
- *               option=1 means list arguments that are read only by the graph.
- *               option=2 means list arguments that are mutated by the graph.
- * \param out_size output size
- * \param out_str_array pointer to hold the output string array
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolListInputNames(SymbolHandle in,
-                           int option,
-                           nn_uint *out_size,
-                           const char ***out_array);
-/*!
- * \brief List output names in the symbol.
- * \param symbol the symbol
- * \param out_size output size
- * \param out_str_array pointer to hold the output string array
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolListOutputNames(SymbolHandle in,
-                            nn_uint *out_size,
-                            const char ***out_array);
-/*!
- * \brief Get a symbol that contains all the internals.
- * \param symbol The symbol
- * \param out The output symbol whose outputs are all the internals.
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolGetInternals(SymbolHandle in,
-                         SymbolHandle *out);
-/*!
- * \brief Get the index-th output of the symbol.
- * \param symbol The symbol
- * \param index the index of the output.
- * \param out The output symbol whose outputs are the index-th symbol.
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolGetOutput(SymbolHandle in,
-                      nn_uint index,
-                      SymbolHandle *out);
-
-/*!
- * \brief Compose the symbol on other symbols.
- *
- * This function will change the sym handle.
- * To achieve function apply behavior, copy the symbol first
- * before applying.
- *
- * \param sym the symbol to apply
- * \param name the name of the symbol
- * \param num_args number of arguments
- * \param keys the key of keyword args (optional)
- * \param symbols arguments to sym
- * \return 0 when success, -1 when failure happens
- */
-int NNSymbolCompose(SymbolHandle in,
-                    const char* name,
-                    nn_uint num_args,
-                    const char** in,
-                    SymbolHandle* in);
-
-// Graph IR API
-/*!
- * \brief create a graph handle from symbol
- * \param symbol The symbol representing the graph.
- * \param graph The graph handle created.
- * \return 0 when success, -1 when failure happens
- */
-int NNGraphCreate(SymbolHandle in, GraphHandle *out);
-/*!
- * \brief free the graph handle
- * \param handle The handle to be freed.
- */
-int NNGraphFree(GraphHandle handle);
-/*!
- * \brief Get a new symbol from the graph.
- * \param graph The graph handle.
- * \param symbol The corresponding symbol
- * \return 0 when success, -1 when failure happens
- */
-int NNGraphGetSymbol(GraphHandle graph, SymbolHandle *out);
-
-/*!
- * \brief Set an attribute in JSON format.
- * This feature allows passing graph attributes back and forth at reasonable speed.
- *
- * \param handle The graph handle.
- * \param key The key to the attribute.
- * \param json_value The value needs to be in the format [type_name, value],
- *  where type_name is a registered type string on the C++ side via DMLC_JSON_ENABLE_ANY.
- * \return 0 when success, -1 when failure happens
- */
-int NNGraphSetJSONAttr(GraphHandle handle,
-                       const char* key,
-                       const char* json_value);
-
-
-/*!
- * \brief Get a serialized attribute from the graph.
- * This feature allows passing graph attributes back and forth at reasonable speed.
- *
- * \param handle The graph handle.
- * \param key The key to the attribute.
- * \param json_out The result attribute, which can be NULL if the attribute does not exist.
- *  The json_out is an array of [type_name, value],
- *  where the type_name is a registered type string on the C++ side via DMLC_JSON_ENABLE_ANY.
- * \param success Whether the result is contained in out.
- * \return 0 when success, -1 when failure happens
- */
-int NNGraphGetJSONAttr(SymbolHandle in,
-                       const char* key,
-                       const char** out,
-                       int *out);
-
-/*!
- * \brief Set an attribute whose type is std::vector<NodeEntry> in C++.
- * This feature allows passing a list of symbolic variables for gradient requests.
- *
- * \note This is a beta feature, only used for test purposes.
- *
- * \param handle The graph handle.
- * \param key The key to the attribute.
- * \param list The symbol whose outputs represent the list of NodeEntry to be passed.
- * \return 0 when success, -1 when failure happens
- */
-int NNGraphSetNodeEntryListAttr_(GraphHandle handle,
-                                 const char* key,
-                                 SymbolHandle in);
-/*!
- * \brief Apply passes on the src graph.
- * \param src The source graph handle.
- * \param num_pass The number of passes to be applied.
- * \param pass_names The names of the passes.
- * \param dst The result graph.
- * \return 0 when success, -1 when failure happens
- */
-int NNGraphApplyPasses(GraphHandle src,
-                       nn_uint num_pass,
-                       const char** in,
-                       GraphHandle *out);
diff --git a/perl-package/AI-NNVMCAPI/nnvm_typemaps.i b/perl-package/AI-NNVMCAPI/nnvm_typemaps.i
deleted file mode 100644
index e64b3c9b7613..000000000000
--- a/perl-package/AI-NNVMCAPI/nnvm_typemaps.i
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -%typemap(in) (const char** in), (char** in) -{ - AV *tempav; - I32 len; - int i; - SV **tv; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - len = av_len(tempav) + 1; - if(len!=0) - { - $1 = (char **) safemalloc((len)*sizeof(char *)); - for (i = 0; i < len; i++) { - tv = av_fetch(tempav, i, 0); - $1[i] = (char *) SvPV_nolen(*tv); - } - } - else - { - $1 = NULL; - } -} -%typemap(freearg) (const char** in), (char** in) { - Safefree($1); -} - -%typemap(in) (const char **keys, const char **vals), (char **keys, char **vals) -{ - HV *temphv; - char *key; - SV *val; - I32 len; - int hash_len; - int i = 0; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVHV) - croak("Argument $argnum is not a hash."); - temphv = (HV*)SvRV($input); - hash_len = hv_iterinit(temphv); - if(hash_len) - { - $1 = (char **)safemalloc(hash_len*sizeof(char *)); - $2 = (char **)safemalloc(hash_len*sizeof(char *)); - while ((val = hv_iternextsv(temphv, &key, &len))) - { - $1[i] = key; - $2[i] = SvPV_nolen(val); - ++i; - } - } -} -%typemap(freearg) (const char **keys, const char **vals), (char **keys, char **vals) -{ - Safefree($1); - Safefree($2); -} - -%typemap(in,numinputs=0) (const char **out) (char *temp) -{ - $1 = &temp; -} - -%typemap(argout) (const char **out) -{ - if(!result) - { - $result = newSVpv(*$1, 0); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (int *out) (int temp) -{ - $1 = &temp; - *$1 = 0; -} - -%typemap(argout) (int *out) -{ - if(!result) - { - $result = newSViv(*$1); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (nn_uint *out_size, const char ***out_array) (nn_uint temp_size, char** temp), - (mx_uint *out_size, const char ***out_array) (mx_uint temp_size, char** temp) -{ - $1 = &temp_size; - *$1 = 0; - $2 = &temp; -} - -%typemap(argout) (nn_uint *out_size, const char ***out_array), - (mx_uint *out_size, const char ***out_array) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = newSVpv((*$2)[i],0); - sv_2mortal(svs[i]); - } - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (nn_uint *half_of_out_size, const char ***out_array) (nn_uint temp_size, char **temp) -{ - $1 = &temp_size; - *$1 = 0; - $2 = &temp; -} -%typemap(argout) (nn_uint *half_of_out_size, const char ***out_array) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < (*$1)*2 ; i++) { - svs[i] = newSVpv((*$2)[i],0); - sv_2mortal(svs[i]); - }; - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in) (SymbolHandle *in) -{ - AV *tempav; - int i; - SV **tv; - int res; - int len; - if (!SvROK($input)) - croak("Argument $argnum is not a reference."); - if (SvTYPE(SvRV($input)) != SVt_PVAV) - croak("Argument $argnum is not an array."); - tempav = (AV*)SvRV($input); - len = av_len(tempav) + 1; - if(len) - { - $1 = ($1_type)safemalloc(len*sizeof($*1_type)); - for (i = 0; i < len; i++) { - tv = av_fetch(tempav, i, 0); - res = 
SWIG_ConvertPtr(*tv,SWIG_as_voidptrptr(&$1[i]), 0, 0); - if (!SWIG_IsOK(res)) { - SWIG_exception_fail(SWIG_ArgError(res), "in method '" "$symname" "', argument " "$argnum"" of type '" "$*1_type""'"); - } - } - } -} - -%typemap(freearg) (SymbolHandle *in) { - Safefree($1); -} - - -%typemap(in,numinputs=0) (const char **real_name, - const char **description, - nn_uint *num_doc_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions, - const char **return_type - ) - (char *name_temp, - char *desc_temp, - nn_uint num_args_temp, - char **names_temp, - char **types_temp, - char **descs_temp, - char *return_temp - ) -{ - $1 = &name_temp; - $2 = &desc_temp; - $3 = &num_args_temp; - $4 = &names_temp; - $5 = &types_temp; - $6 = &descs_temp; - $7 = &return_temp; -} -%typemap(argout) (const char **real_name, - const char **description, - nn_uint *num_doc_args, - const char ***arg_names, - const char ***arg_type_infos, - const char ***arg_descriptions, - const char **return_type - ) -{ - if(!result) - { - AV *container, *names, *types, *descs; - int i; - container = newAV(); - names = newAV(); - types = newAV(); - descs = newAV(); - if($1) av_push(container, newSVpv(*$1,0)); - if($2) av_push(container, newSVpv(*$2,0)); - if($3) - { - for (i = 0; i < *$3 ; i++) { - av_push(names, newSVpv((*$4)[i],0)); - av_push(types, newSVpv((*$5)[i],0)); - av_push(descs, newSVpv((*$6)[i],0)); - } - } - av_push(container, newRV_noinc((SV*)names)); - av_push(container, newRV_noinc((SV*)types)); - av_push(container, newRV_noinc((SV*)descs)); - if($7) av_push(container, newSVpv(*$7,0)); - $result = newRV_noinc((SV*)container); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (OpHandle *out) (OpHandle temp), - (SymbolHandle *out) (SymbolHandle temp), - (GraphHandle *out) (GraphHandle temp) -{ - $1 = &temp; -} -%typemap(argout) (OpHandle *out) -{ - if(!result) - { - $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1), $*1_descriptor, 0); argvi++; - } -} - -%typemap(argout) (SymbolHandle *out), (GraphHandle *out) -{ - if(!result) - { - $result = SWIG_NewPointerObj(SWIG_as_voidptr(*$1), $*1_descriptor, 0); argvi++; - } -} - -%typemap(in,numinputs=0) (nn_uint *out_size, OpHandle** out_array) (nn_uint temp_num, OpHandle* temp) -{ - $1 = &temp_num; - *$1 = 0; - $2 = &temp; -} -%typemap(argout) (nn_uint *out_size, OpHandle** out_array) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_NNOp, 0); - } - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in,numinputs=0) (nn_uint *out_size, SymbolHandle** out_array) (nn_uint temp_num, SymbolHandle* temp) -{ - $1 = &temp_num; - *$1 = 0; - $2 = &temp; -} -%typemap(argout) (nn_uint *out_size, SymbolHandle** out_array) -{ - if(!result) - { - AV *myav; - SV **svs; - int i = 0; - svs = (SV **)safemalloc(*$1*sizeof(SV *)); - for (i = 0; i < *$1 ; i++) { - svs[i] = SWIG_NewPointerObj(SWIG_as_voidptr((*$2)[i]), SWIGTYPE_p_NNSymbol, 0); - } - myav = av_make(*$1,svs); - Safefree(svs); - $result = newRV_noinc((SV*)myav); - sv_2mortal($result); - argvi++; - } -} - -%typemap(in) (SymbolHandle in) -{ - int res = SWIG_ConvertPtr($input,&$1, 0, 0); - if (!SWIG_IsOK(res)) { - SWIG_exception_fail(SWIG_ArgError(res), "in method '" "$symname" "', argument " "$argnum"" of type '" "SymbolHandle""'"); - } -} diff --git 
a/perl-package/AI-NNVMCAPI/t/AI-NNVMCAPI.t b/perl-package/AI-NNVMCAPI/t/AI-NNVMCAPI.t deleted file mode 100644 index 8296165c0138..000000000000 --- a/perl-package/AI-NNVMCAPI/t/AI-NNVMCAPI.t +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -use strict; -use warnings; -use Test::More tests => 1; -BEGIN { use_ok('AI::NNVMCAPI') }; - diff --git a/perl-package/README.md b/perl-package/README.md deleted file mode 100644 index dc1824357181..000000000000 --- a/perl-package/README.md +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - - - - - - - - - - - - -[Perl API](https://mxnet.apache.org/api/perl/index.html) -[![GitHub license](http://dmlc.github.io/img/apache2.svg)](../LICENSE) - -You have found MXNet Perl Package! The MXNet Perl packages brings flexible and efficient GPU -computing and state-of-art deep learning to Perl. - -- It enables you to write seamless tensor/matrix computation with multiple GPUs in Perl. -- It also enables you to construct and customize state-of-art deep learning models in Perl, - and apply them to tasks such as image classification and data science challenges. - -Installation ---------- -* [MXNet Setup Document](https://mxnet.apache.org/get_started/ubuntu_setup#install-the-mxnet-package-for-perl) - - Check this out for detailed documents, examples and installation guides. - -License -------- -This library is licensed under Apache 2.0 license https://www.apache.org/licenses/LICENSE-2.0 diff --git a/perl-package/test.sh b/perl-package/test.sh deleted file mode 100755 index 87101a389480..000000000000 --- a/perl-package/test.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
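The script body that follows repeats the same ExtUtils::MakeMaker recipe once per package. A condensed Python sketch of that driver loop (paths and package names mirror the script; the loop structure itself is illustrative):

    import os
    import subprocess

    MXNET_HOME = os.getcwd()
    PACKAGES = ["AI-MXNetCAPI", "AI-NNVMCAPI", "AI-MXNet",
                "AI-MXNet-Gluon-Contrib", "AI-MXNet-Gluon-ModelZoo"]
    env = dict(os.environ,
               PERL5LIB=os.path.join(MXNET_HOME, "perl5", "lib", "perl5"))

    for pkg in PACKAGES:
        cwd = os.path.join(MXNET_HOME, "perl-package", pkg)
        subprocess.run(["perl", "Makefile.PL",
                        "INSTALL_BASE={}/perl5".format(MXNET_HOME)],
                       cwd=cwd, env=env, check=True)
        if pkg in ("AI-MXNet", "AI-MXNet-Gluon-ModelZoo"):
            # these two packages also run their test suites, verbosely
            subprocess.run(["make", "test", "TEST_VERBOSE=1"],
                           cwd=cwd, env=env, check=True)
        subprocess.run(["make", "install"], cwd=cwd, env=env, check=True)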
- -MXNET_HOME=${PWD} -export PERL5LIB=${MXNET_HOME}/perl5/lib/perl5 - -cd ${MXNET_HOME}/perl-package/AI-MXNetCAPI/ -perl Makefile.PL INSTALL_BASE=${MXNET_HOME}/perl5 -make install || exit -1 - -cd ${MXNET_HOME}/perl-package/AI-NNVMCAPI/ -perl Makefile.PL INSTALL_BASE=${MXNET_HOME}/perl5 -make install || exit -1 - -cd ${MXNET_HOME}/perl-package/AI-MXNet/ -perl Makefile.PL INSTALL_BASE=${MXNET_HOME}/perl5 -make test TEST_VERBOSE=1 || exit -1 # Add debug output to test log -make install || exit -1 - -cd ${MXNET_HOME}/perl-package/AI-MXNet-Gluon-Contrib/ -perl Makefile.PL INSTALL_BASE=${MXNET_HOME}/perl5 -make install || exit -1 - -cd ${MXNET_HOME}/perl-package/AI-MXNet-Gluon-ModelZoo/ -perl Makefile.PL INSTALL_BASE=${MXNET_HOME}/perl5 -make test TEST_VERBOSE=1 || exit -1 - diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 1d07b712d0b2..efdd02a3be6a 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -71,10 +71,6 @@ # Attribute scope to add attributes to symbolic graphs from .attribute import AttrScope -from . import monitor -# use mx.mon as short for mx.monitor -from . import monitor as mon - from . import torch # use mx.th as short for mx.torch from . import torch as th diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py index 56f62f3d5377..4b15b5653a97 100644 --- a/python/mxnet/_ctypes/ndarray.py +++ b/python/mxnet/_ctypes/ndarray.py @@ -24,7 +24,7 @@ from ..base import _LIB from ..base import c_str_array, c_handle_array -from ..base import NDArrayHandle, CachedOpHandle +from ..base import NDArrayHandle, CachedOpHandle, SymbolHandle from ..base import check_call from .. import _global_var @@ -122,9 +122,24 @@ def __init__(self, sym, flags=()): def __del__(self): check_call(_LIB.MXFreeCachedOp(self.handle)) + def get_optimized_symbol(self): + """Get an optimized version of the symbol from the cached op. + + Returns + ------- + symbol : Symbol + Optimized symbol from the executor. + """ + from ..symbol import Symbol + sym_handle = SymbolHandle() + check_call(_LIB.MXCachedOpGetOptimizedSymbol(self.handle, ctypes.byref(sym_handle))) + ret = Symbol(sym_handle) + return ret + def __call__(self, *args, **kwargs): """ctypes implementation of imperative invoke wrapper""" out = kwargs.pop('out', None) + default_ctx = kwargs.pop('default_ctx', None) if out is not None: original_output = out if isinstance(out, NDArrayBase): @@ -145,10 +160,19 @@ def __call__(self, *args, **kwargs): # a handle's stype in _ndarray_cls out_stypes = ctypes.POINTER(ctypes.c_int)() + # (None, ) -> [] + if len(args) == 1 and args[0] is None: + args = [] + assert default_ctx is not None, 'default_ctx is required if no input is provided' + else: + default_ctx = args[0].ctx if default_ctx is None else default_ctx + check_call(_LIB.MXInvokeCachedOpEx( self.handle, ctypes.c_int(len(args)), c_handle_array(args), + ctypes.c_int(default_ctx.device_typeid), + ctypes.c_int(default_ctx.device_id), ctypes.byref(num_output), ctypes.byref(output_vars), ctypes.byref(out_stypes))) diff --git a/python/mxnet/autograd.py b/python/mxnet/autograd.py index f41c304b2748..f968275a1390 100644 --- a/python/mxnet/autograd.py +++ b/python/mxnet/autograd.py @@ -233,8 +233,8 @@ def _parse_head(heads, head_grads): if head_grads is None: hgrad_handles = ctypes.c_void_p(0) else: - assert len(heads) == len(head_grads), \ - "heads and head_grads must be lists of the same length" + msg = "heads and head_grads must be lists of the same length: {} vs. 
{}" + assert len(heads) == len(head_grads), msg.format(len(heads), len(head_grads)) hgrad_handles = c_array(NDArrayHandle, [i.handle if i is not None else NDArrayHandle(0) for i in head_grads]) diff --git a/python/mxnet/contrib/__init__.py b/python/mxnet/contrib/__init__.py index 606bb0ada54f..ca4b9d4090db 100644 --- a/python/mxnet/contrib/__init__.py +++ b/python/mxnet/contrib/__init__.py @@ -30,6 +30,4 @@ from . import text from . import onnx from . import io -from . import quantization -from . import quantization as quant from . import tensorrt diff --git a/python/mxnet/contrib/amp/lists/symbol_bf16.py b/python/mxnet/contrib/amp/lists/symbol_bf16.py index 59311324dd2d..7ae799c896a9 100644 --- a/python/mxnet/contrib/amp/lists/symbol_bf16.py +++ b/python/mxnet/contrib/amp/lists/symbol_bf16.py @@ -108,7 +108,6 @@ '_cond', '_contrib_AdaptiveAvgPooling2D', '_contrib_BilinearResize2D', - '_contrib_SparseEmbedding', '_contrib_bipartite_matching', '_contrib_dequantize', '_contrib_div_sqrt_dim', @@ -116,7 +115,6 @@ '_contrib_getnnz', '_contrib_gradientmultiplier', '_contrib_group_adagrad_update', - '_contrib_ifft', '_contrib_index_array', '_contrib_index_copy', '_contrib_quadratic', diff --git a/python/mxnet/contrib/amp/lists/symbol_fp16.py b/python/mxnet/contrib/amp/lists/symbol_fp16.py index 676129f733db..506757307352 100644 --- a/python/mxnet/contrib/amp/lists/symbol_fp16.py +++ b/python/mxnet/contrib/amp/lists/symbol_fp16.py @@ -85,7 +85,6 @@ '_cond', '_contrib_AdaptiveAvgPooling2D', '_contrib_BilinearResize2D', - '_contrib_SparseEmbedding', '_contrib_bipartite_matching', '_contrib_dequantize', '_contrib_div_sqrt_dim', @@ -93,7 +92,6 @@ '_contrib_getnnz', '_contrib_gradientmultiplier', '_contrib_group_adagrad_update', - '_contrib_ifft', '_contrib_index_array', '_contrib_index_copy', '_contrib_quadratic', diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py deleted file mode 100644 index af3001235de0..000000000000 --- a/python/mxnet/contrib/quantization.py +++ /dev/null @@ -1,631 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Quantization module for generating quantized (INT8) models from FP32 models.""" - - -try: - from scipy import stats -except ImportError: - stats = None - -import ctypes -import logging -import os -import numpy as np -from ..base import _LIB, check_call, py_str -from ..base import c_array, c_str, mx_uint, c_str_array -from ..base import NDArrayHandle, SymbolHandle -from ..symbol import Symbol -from ..symbol import load as sym_load -from .. 
import ndarray -from ..ndarray import load as nd_load -from ..ndarray import NDArray -from ..io import DataIter, DataDesc, DataBatch -from ..context import cpu, Context - - -def _quantize_params(qsym, params, th_dict): - """Given a quantized symbol and a dict of params that have not been quantized, - generate quantized params. Currently only supports quantizing the arg_params - with names of `weight` or `bias`, not aux_params. If `qsym` contains symbols - that are excluded from being quantized, their corresponding params will - not be quantized, but saved together with quantized params of the symbols that - have been quantized. - - Parameters - ---------- - qsym : Symbol - Quantized symbol from FP32 symbol. - params : dict of str->NDArray - th_dict: dict of min/max pairs of layers' output - """ - inputs_name = qsym.list_arguments() - quantized_params = {} - for name in inputs_name: - if name.endswith(('weight_quantize', 'bias_quantize')): - original_name = name[:-len('_quantize')] - param = params[original_name] - # pylint: disable=unbalanced-tuple-unpacking - val, vmin, vmax = ndarray.contrib.quantize(data=param, - min_range=ndarray.min(param), - max_range=ndarray.max(param), - out_type='int8') - quantized_params[name] = val - quantized_params[name+'_min'] = vmin - quantized_params[name+'_max'] = vmax - elif name in params: - quantized_params[name] = params[name] - elif name.endswith(('_min')): - output = name[: - len('_min')] - if output in th_dict: - quantized_params[name] = ndarray.array([th_dict[output][0]]) - elif name.endswith(('_max')): - output = name[: - len('_min')] - if output in th_dict: - quantized_params[name] = ndarray.array([th_dict[output][1]]) - return quantized_params - -def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None, - offline_params=None, quantized_dtype='int8', quantize_mode='smart', - quantize_granularity='tensor-wise'): - """Given a symbol object representing a neural network of data type FP32, - quantize it into a INT8 network. - - Parameters - ---------- - sym : Symbol - FP32 neural network symbol. - ctx : Context - Defines the device that users want to run quantized symbol. - excluded_symbols : list of strings - A list of strings representing the names of the symbols that users want to excluding - from being quantized. - excluded_operators : list of strings - A list of strings representing the names of the operators that users want to excluding - from being quantized. - offline_params : list of strs - Names of the parameters that users want to quantize offline. It's always recommended to - quantize parameters offline so that quantizing parameters during the inference can be - avoided. - quantized_dtype: str - The quantized destination type for input data. - quantize_mode: str - The mode that quantization pass to apply. - quantize_granularity: str - The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise' - quantization. The default value is 'tensor-wise'. 
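A note on the parameter-naming contract that _quantize_params above relies on, with one subtlety: its `_max` branch strips `len('_min')` characters, which is only correct because the two suffixes happen to be the same length. A tiny illustrative helper (not part of the removed module) for a weight or bias argument:

    def quantized_param_names(name):
        # contract: an offline-quantized arg 'X' appears in the quantized
        # graph as 'X_quantize' plus the scalars 'X_quantize_min'/'X_quantize_max'
        q = name + "_quantize"
        return q, q + "_min", q + "_max"

    assert quantized_param_names("conv0_weight") == (
        "conv0_weight_quantize",
        "conv0_weight_quantize_min",
        "conv0_weight_quantize_max")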
- - """ - num_excluded_symbols = 0 - if excluded_symbols is not None: - assert isinstance(excluded_symbols, list) - num_excluded_symbols = len(excluded_symbols) - else: - excluded_symbols = [] - - num_excluded_ops = 0 - if excluded_operators is not None: - assert isinstance(excluded_operators, list) - num_excluded_ops = len(excluded_operators) - else: - excluded_operators = [] - - num_offline = 0 - offline = [] - if offline_params is not None: - num_offline = len(offline_params) - for k in offline_params: - offline.append(c_str(k)) - - out = SymbolHandle() - size = mx_uint() - calib_str = ctypes.POINTER(ctypes.c_char_p)() - check_call(_LIB.MXQuantizeSymbol(sym.handle, - ctypes.byref(out), - ctypes.byref(ctypes.c_int(ctx.device_typeid)), - mx_uint(num_excluded_symbols), - c_str_array(excluded_symbols), - mx_uint(num_excluded_ops), - c_str_array(excluded_operators), - mx_uint(num_offline), - c_array(ctypes.c_char_p, offline), - c_str(quantized_dtype), - ctypes.c_bool(True), - c_str(quantize_mode), - c_str(quantize_granularity), - ctypes.byref(size), - ctypes.byref(calib_str))) - calib_layer = [] - calib_layer = [py_str(calib_str[i]) for i in range(size.value)] - return Symbol(out), calib_layer - -def combine_histogram(old_hist, arr, new_min, new_max, new_th): - """ Collect layer histogram for arr and combine it with old histogram. - """ - (old_hist, old_hist_edges, old_min, old_max, old_th) = old_hist - if new_th <= old_th: - hist, _ = np.histogram(arr, bins=len(old_hist), range=(-old_th, old_th)) - return (old_hist + hist, old_hist_edges, min(old_min, new_min), max(old_max, new_max), old_th) - else: - # Need to generate new histogram with new_th - old_num_bins = len(old_hist) - old_step = 2 * old_th / old_num_bins - half_increased_bins = int((new_th - old_th) // old_step + 1) - new_num_bins = half_increased_bins * 2 + old_num_bins - new_th = half_increased_bins * old_step + old_th - hist, hist_edges = np.histogram(arr, bins=new_num_bins, range=(-new_th, new_th)) - hist[half_increased_bins:new_num_bins - half_increased_bins] += old_hist - return (hist, hist_edges, min(old_min, new_min), max(old_max, new_max), new_th) - -class _LayerHistogramCollector(object): - """Saves layer histogram in a dict with layer names as keys and lists of NDArrays as - values. The collected histogram will be used for calculating the optimal thresholds for - quantization using KL divergence. - """ - def __init__(self, num_bins=8001, include_layer=None, logger=None): - self.hist_dict = {} - self.num_bins = num_bins - self.include_layer = include_layer - self.logger = logger - - def collect(self, name, arr): - """Callback function for collecting layer output NDArrays.""" - name = py_str(name) - if name not in self.include_layer: - return - handle = ctypes.cast(arr, NDArrayHandle) - arr = NDArray(handle, writable=False).copyto(cpu()).asnumpy() - if self.logger: - self.logger.debug("Collecting layer %s histogram of shape %s" % (name, arr.shape)) - min_range = np.min(arr) - max_range = np.max(arr) - th = max(abs(min_range), abs(max_range)) - if name in self.hist_dict: - self.hist_dict[name] = combine_histogram(self.hist_dict[name], arr, min_range, max_range, th) - else: - hist, hist_edges = np.histogram(arr, bins=self.num_bins, range=(-th, th)) - self.hist_dict[name] = (hist, hist_edges, min_range, max_range, th) - -class _LayerOutputMinMaxCollector(object): - """Saves layer output min and max values in a dict with layer names as keys. 
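A worked example of the bin-expansion arithmetic in combine_histogram above, with illustrative numbers (the docstring resumes after this sketch):

    old_th, old_num_bins = 1.0, 8001        # existing histogram over (-1, 1)
    new_th_req = 1.5                        # a new batch needs a wider range
    old_step = 2 * old_th / old_num_bins    # keep the original bin width
    half = int((new_th_req - old_th) // old_step + 1)
    new_num_bins = half * 2 + old_num_bins  # grown symmetrically: 12003 bins
    new_th = half * old_step + old_th       # ~1.5002, snapped to the bin grid
    assert new_num_bins == 12003
    # the old counts are added into the centre slice
    # hist[half : new_num_bins - half]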
- The collected min and max values will be directly used as thresholds for quantization. - """ - def __init__(self, quantized_dtype, include_layer=None, logger=None): - self.min_max_dict = {} - self.quantized_dtype = quantized_dtype - self.include_layer = include_layer - self.logger = logger - - def collect(self, name, arr): - """Callback function for collecting min and max values from an NDArray.""" - name = py_str(name) - if name not in self.include_layer: - return - handle = ctypes.cast(arr, NDArrayHandle) - arr = NDArray(handle, writable=False) - min_range = ndarray.min(arr).asscalar() - max_range = ndarray.max(arr).asscalar() - if name in self.min_max_dict: - cur_min_max = self.min_max_dict[name] - self.min_max_dict[name] = (min(cur_min_max[0], min_range), - max(cur_min_max[1], max_range)) - else: - self.min_max_dict[name] = (min_range, max_range) - if self.logger: - self.logger.debug("Collecting layer %s min_range=%f, max_range=%f" - % (name, min_range, max_range)) - -def _calibrate_quantized_sym(qsym, th_dict): - """Given a dictionary containing the thresholds for quantizing the layers, - set the thresholds into the quantized symbol as the params of requantize operators. - """ - if th_dict is None or len(th_dict) == 0: - return qsym - num_layer_outputs = len(th_dict) - layer_output_names = [] - min_vals = [] - max_vals = [] - for k, v in th_dict.items(): - layer_output_names.append(k) - min_vals.append(v[0]) - max_vals.append(v[1]) - - calibrated_sym = SymbolHandle() - check_call(_LIB.MXSetCalibTableToQuantizedSymbol(qsym.handle, - mx_uint(num_layer_outputs), - c_str_array(layer_output_names), - c_array(ctypes.c_float, min_vals), - c_array(ctypes.c_float, max_vals), - ctypes.byref(calibrated_sym))) - return Symbol(calibrated_sym) - - -def _collect_layer_statistics(mod, data, collector, max_num_examples=None, logger=None): - if not isinstance(data, DataIter): - raise ValueError('Only supports data as a type of DataIter, while received type %s' - % str(type(data))) - mod._exec_group.execs[0].set_monitor_callback(collector.collect, monitor_all=True) - num_batches = 0 - num_examples = 0 - for batch in data: - mod.forward(data_batch=batch, is_train=False) - num_batches += 1 - num_examples += data.batch_size - if max_num_examples is not None and num_examples >= max_num_examples: - break - if logger is not None: - logger.info("Collected statistics from %d batches with batch_size=%d" - % (num_batches, data.batch_size)) - return num_examples - - -def _collect_layer_output_min_max(mod, data, quantized_dtype, include_layer=None, - max_num_examples=None, logger=None): - """Collect min and max values from layer outputs and save them in - a dictionary mapped by layer names. 
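A recap of the running min/max merge that _LayerOutputMinMaxCollector.collect performs, minus the NDArray plumbing (layer name and values are illustrative):

    min_max_dict = {}

    def merge(name, batch_min, batch_max):
        # same update rule as collect() above
        cur = min_max_dict.get(name)
        min_max_dict[name] = ((batch_min, batch_max) if cur is None else
                              (min(cur[0], batch_min), max(cur[1], batch_max)))

    merge("conv0_output", -1.2, 3.4)
    merge("conv0_output", -0.5, 4.1)
    assert min_max_dict["conv0_output"] == (-1.2, 4.1)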
- """ - collector = _LayerOutputMinMaxCollector(quantized_dtype=quantized_dtype, - include_layer=include_layer, logger=logger) - num_examples = _collect_layer_statistics(mod, data, collector, max_num_examples, logger) - return collector.min_max_dict, num_examples - - -def _collect_layer_histogram(mod, data, include_layer=None, - max_num_examples=None, logger=None): - """Collect layer outputs and save them in a dictionary mapped by layer names.""" - collector = _LayerHistogramCollector(include_layer=include_layer, logger=logger) - num_examples = _collect_layer_statistics(mod, data, collector, max_num_examples, logger) - return collector.hist_dict, num_examples - - -def _smooth_distribution(p, eps=0.0001): - """Given a discrete distribution (may have not been normalized to 1), - smooth it by replacing zeros with eps multiplied by a scaling factor and taking the - corresponding amount off the non-zero values. - Ref: http://web.engr.illinois.edu/~hanj/cs412/bk3/KL-divergence.pdf - """ - is_zeros = (p == 0).astype(np.float32) - is_nonzeros = (p != 0).astype(np.float32) - n_zeros = is_zeros.sum() - n_nonzeros = p.size - n_zeros - if not n_nonzeros: - raise ValueError('The discrete probability distribution is malformed. All entries are 0.') - eps1 = eps * float(n_zeros) / float(n_nonzeros) - assert eps1 < 1.0, 'n_zeros=%d, n_nonzeros=%d, eps1=%f' % (n_zeros, n_nonzeros, eps1) - hist = p.astype(np.float32) - hist += eps * is_zeros + (-eps1) * is_nonzeros - assert (hist <= 0).sum() == 0 - return hist - - -# pylint: disable=line-too-long -def _get_optimal_threshold(hist_data, quantized_dtype, num_quantized_bins=255): - """Given a dataset, find the optimal threshold for quantizing it. - The reference distribution is `q`, and the candidate distribution is `p`. - `q` is a truncated version of the original distribution. - - Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf - """ - (hist, hist_edges, min_val, max_val, _) = hist_data - num_bins = len(hist) - assert (num_bins % 2 == 1) - if min_val >= 0 and quantized_dtype in ['auto', 'uint8']: - # We need to move negative bins to positive bins to fit uint8 range. - num_quantized_bins = num_quantized_bins * 2 + 1 - hist = ndarray.array(hist, ctx=cpu()) - hist_edges = ndarray.array(hist_edges, ctx=cpu()) - threshold, divergence = ndarray.contrib.calibrate_entropy(hist=hist, - hist_edges=hist_edges, - num_quantized_bins=num_quantized_bins) - threshold = threshold.asnumpy() - divergence = divergence.asnumpy() - return min_val, max_val, threshold, divergence -# pylint: enable=line-too-long - -def _get_optimal_thresholds(hist_dict, quantized_dtype, num_quantized_bins=255, logger=None): - """Given a ndarray dict, find the optimal threshold for quantizing each value of the key.""" - if stats is None: - raise ImportError('scipy.stats is required for running entropy mode of calculating' - ' the optimal thresholds for quantizing FP32 ndarrays into int8.' 
- ' Please check if the scipy python bindings are installed.') - assert isinstance(hist_dict, dict) - if logger is not None: - logger.info('Calculating optimal thresholds for quantization using KL divergence' - ' with num_quantized_bins=%d' % num_quantized_bins) - th_dict = {} - # copy hist_dict keys since the keys() only returns a view in python3 - layer_names = list(hist_dict.keys()) - for name in layer_names: - assert name in hist_dict - min_val, max_val, th, divergence = \ - _get_optimal_threshold(hist_dict[name], quantized_dtype, - num_quantized_bins=num_quantized_bins) - if min_val >= 0 and quantized_dtype in ['auto', 'uint8']: - th_dict[name] = (0, th) - else: - th_dict[name] = (-th, th) - del hist_dict[name] # release the memory - if logger: - logger.debug('layer=%s, min_val=%f, max_val=%f, th=%f, divergence=%f' - % (name, min_val, max_val, th, divergence)) - return th_dict - - -def _load_sym(sym, logger=None): - """Given a str as a path the symbol .json file or a symbol, returns a Symbol object.""" - if isinstance(sym, str): # sym is a symbol file path - cur_path = os.path.dirname(os.path.realpath(__file__)) - symbol_file_path = os.path.join(cur_path, sym) - if logger: - logger.info('Loading symbol from file %s' % symbol_file_path) - return sym_load(symbol_file_path) - elif isinstance(sym, Symbol): - return sym - else: - raise ValueError('_load_sym only accepts Symbol or path to the symbol file,' - ' while received type %s' % str(type(sym))) - - -def _load_params(params, logger=None): - """Given a str as a path to the .params file or a pair of params, - returns two dictionaries representing arg_params and aux_params. - """ - if isinstance(params, str): - cur_path = os.path.dirname(os.path.realpath(__file__)) - param_file_path = os.path.join(cur_path, params) - if logger: - logger.info('Loading params from file %s' % param_file_path) - save_dict = nd_load(param_file_path) - arg_params = {} - aux_params = {} - for k, v in save_dict.items(): - tp, name = k.split(':', 1) - if tp == 'arg': - arg_params[name] = v - if tp == 'aux': - aux_params[name] = v - return arg_params, aux_params - elif isinstance(params, (tuple, list)) and len(params) == 2: - return params[0], params[1] - else: - raise ValueError('Unsupported params provided. Must be either a path to the param file or' - ' a pair of dictionaries representing arg_params and aux_params') - -# pylint: disable=super-init-not-called -class _DataIterWrapper(DataIter): - """DataIter wrapper for general iterator, e.g., gluon dataloader""" - def __init__(self, calib_data): - self._data = calib_data - try: - calib_iter = iter(calib_data) - except TypeError as e: - raise TypeError('calib_data is not a valid iterator. {}'.format(str(e))) - data_example = next(calib_iter) - if isinstance(data_example, (list, tuple)): - data_example = list(data_example) - else: - data_example = [data_example] - # suppose there must be one label in data_example - # TODO(xinyu-intel): little tricky here, need to refactor. 
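A short sketch of the 'arg:'/'aux:' key convention that _load_params above parses, the same prefix scheme checkpoint saving writes (values here are placeholder strings, not real NDArrays):

    save_dict = {"arg:conv0_weight": "w0", "aux:bn0_moving_mean": "m0"}
    arg_params, aux_params = {}, {}
    for k, v in save_dict.items():
        tp, name = k.split(":", 1)
        if tp == "arg":
            arg_params[name] = v
        elif tp == "aux":
            aux_params[name] = v
    assert arg_params == {"conv0_weight": "w0"}
    assert aux_params == {"bn0_moving_mean": "m0"}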
- num_data = len(data_example) - assert num_data > 0 - # here reshape is to handle the 5D/6D input data - if len(data_example[0].shape) > 4: - data_example[0] = data_example[0].reshape((-1,) + data_example[0].shape[2:]) - self.provide_data = [DataDesc(name='data', shape=(data_example[0].shape))] - self.provide_data += [DataDesc(name='data{}'.format(i), shape=x.shape) for i, x in enumerate(data_example[1:])] - # data0, data1, ..., label - if num_data >= 3: - self.provide_data = [DataDesc(name='data{}'.format(i), shape=x.shape) - for i, x in enumerate(data_example[0:])] - self.batch_size = data_example[0].shape[0] - self.reset() - - def reset(self): - self._iter = iter(self._data) - - def next(self): - next_data = next(self._iter) - # here reshape is to handle the 5D/6D input data - if len(next_data[0].shape) > 4: - next_data[0] = next_data[0].reshape((-1,) + next_data[0].shape[2:]) - return DataBatch(data=next_data) -# pylint: enable=super-init-not-called - -def _as_data_iter(calib_data): - """Convert normal iterator to mx.io.DataIter while parsing the data_shapes""" - if isinstance(calib_data, DataIter): - # already validated DataIter, just return - return calib_data, calib_data.provide_data - - calib_data = _DataIterWrapper(calib_data) - return calib_data, calib_data.provide_data - -def quantize_graph(sym, arg_params, aux_params, ctx=cpu(), - excluded_sym_names=None, excluded_op_names=None, - calib_mode='entropy', quantized_dtype='int8', - quantize_mode='full', quantize_granularity='tensor-wise', - LayerOutputCollector=None, logger=None): - """User-level API for generating a quantized model from a FP32 model w/o calibration - and a collector for naive or entropy calibration. - The backend quantized operators are only enabled for Linux systems. Please do not run - inference using the quantized models on Windows for now. - Parameters - ---------- - sym : str or Symbol - Defines the structure of a neural network for FP32 data types. - ctx : Context - Defines the device that users want to run forward propagation on the calibration - dataset for collecting layer output statistics. Currently, only supports single context. - arg_params : dict - Dictionary of name to `NDArray`. - aux_params : dict - Dictionary of name to `NDArray`. - excluded_sym_names : list of strings - A list of strings representing the names of the symbols that users want to excluding - from being quantized. - excluded_op_names : list of strings - A list of strings representing the names of the operators that users want to excluding - calib_mode : str - If calib_mode='none', no calibration will be used and the thresholds for - requantization after the corresponding layers will be calculated at runtime by - calling min and max operators. The quantized models generated in this - mode are normally 10-20% slower than those with calibrations during inference. - If calib_mode='naive', the min and max values of the layer outputs from a calibration - dataset will be directly taken as the thresholds for quantization. - If calib_mode='entropy' (default mode), the thresholds for quantization will be - derived such that the KL divergence between the distributions of FP32 layer outputs and - quantized layer outputs is minimized based upon the calibration dataset. - quantized_dtype : str - The quantized destination type for input data. Currently support 'int8' - , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. - Default value is 'int8'. 
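A hypothetical invocation mirroring the quantize_graph docstring below (the model symbol, parameter dicts, and excluded layer name are placeholders):

    qsym, qarg_params, qaux_params, collector = quantize_graph(
        sym, arg_params, aux_params, ctx=cpu(),
        excluded_sym_names=['softmax'],   # e.g. keep the output layer in FP32
        calib_mode='entropy', quantized_dtype='int8',
        quantize_mode='full', quantize_granularity='tensor-wise')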
- quantize_mode : str - The mode that quantization pass to apply. Support 'full' and 'smart'. - 'full' means quantize all operator if possible. - 'smart' means quantization pass will smartly choice which operator should be quantized. - quantize_granularity: str - The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise' - quantization. The default value is 'tensor-wise'. - LayerOutputCollector : class - For customize calibration method usage. - logger : Object - A logging object for printing information during the process of quantization. - Returns - ------- - tuple - A tuple of quantized symbol, quantized arg_params, aux_params and collector. - ------- - """ - if excluded_sym_names is None: - excluded_sym_names = [] - if not isinstance(excluded_sym_names, list): - raise ValueError('excluded_sym_names must be a list of strings representing' - ' the names of the symbols that will not be quantized,' - ' while received type %s' % str(type(excluded_sym_names))) - if not isinstance(ctx, Context): - raise ValueError('currently only supports single ctx, while received %s' % str(ctx)) - if logger: - os.environ['MXNET_QUANTIZATION_VERBOSE'] = '1' - logger.info('Quantizing graph') - if quantized_dtype not in ('int8', 'uint8', 'auto'): - raise ValueError('unknown quantized_dtype %s received,' - ' expected `int8`, `uint8` or `auto`' % quantized_dtype) - if quantize_granularity not in ('tensor-wise', 'channel-wise'): - raise ValueError('unkonwn quantize_granularity %s received,' - ' expected `tensor-wise` or `channel-wise`.' % quantize_granularity) - qsym, calib_layer = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names, - excluded_operators=excluded_op_names, - offline_params=list( - arg_params.keys()), - quantized_dtype=quantized_dtype, - quantize_mode=quantize_mode, - quantize_granularity=quantize_granularity) - - th_dict = {} - collector = None - if calib_mode is not None and calib_mode != 'none': - if calib_mode == 'entropy': - collector = _LayerHistogramCollector( - include_layer=calib_layer, logger=logger) - if logger: - logger.info( - 'Create a layer output collector for entropy calibration.') - elif calib_mode == 'naive': - collector = _LayerOutputMinMaxCollector(quantized_dtype=quantized_dtype, - include_layer=calib_layer, logger=logger) - if logger: - logger.info( - 'Create a layer output minmax collector for naive calibration') - elif calib_mode == 'customize' and LayerOutputCollector is not None: - collector = LayerOutputCollector - if logger: - logger.info( - 'Create a customize layer output minmax collector for calibration') - else: - raise ValueError('unknown calibration mode %s received,' - ' expected `none`, `naive`, `entropy` or `customize`' % calib_mode) - if logger: - logger.info('Collector created, please use set_monitor_callback' - ' to collect calibration information.') - - if logger: - logger.info('Quantizing parameters') - qarg_params = _quantize_params(qsym, arg_params, th_dict) - - return qsym, qarg_params, aux_params, collector - -def calib_graph(qsym, arg_params, aux_params, collector, - calib_mode='entropy', quantized_dtype='int8', logger=logging): - """User-level API for calibrating a quantized model using a filled collector. - The backend quantized operators are only enabled for Linux systems. Please do not run - inference using the quantized models on Windows for now. - Parameters - ---------- - qsym : str or Symbol - Defines the structure of a neural network for INT8 data types. 
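A sketch of the two-phase flow quantize_graph and calib_graph implement together: obtain a collector, fill it by running calibration batches with its collect method attached as a monitor callback, then bake the thresholds in (all names are placeholders):

    qsym, qargs, qauxs, collector = quantize_graph(
        sym, arg_params, aux_params, calib_mode='naive')
    # ... attach collector.collect via set_monitor_callback and run forward
    #     passes over the calibration dataset ...
    cqsym, cqargs, cqauxs = calib_graph(
        qsym, arg_params, aux_params, collector, calib_mode='naive')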
- arg_params : dict - Dictionary of name to `NDArray`. - aux_params : dict - Dictionary of name to `NDArray`. - collector : function - layer collector for naive or entropy calibration. - calib_mode : str - If calib_mode='none', no calibration will be used and the thresholds for - requantization after the corresponding layers will be calculated at runtime by - calling min and max operators. The quantized models generated in this - mode are normally 10-20% slower than those with calibrations during inference. - If calib_mode='naive', the min and max values of the layer outputs from a calibration - dataset will be directly taken as the thresholds for quantization. - If calib_mode='entropy' (default mode), the thresholds for quantization will be - derived such that the KL divergence between the distributions of FP32 layer outputs and - quantized layer outputs is minimized based upon the calibration dataset. - quantized_dtype : str - The quantized destination type for input data. Currently support 'int8' - , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. - Default value is 'int8'. - logger : Object - A logging object for printing information during the process of quantization. - Returns - ------- - tuple - A tuple of calibrated symbol, quantized arg_params, aux_params. - ------- - """ - th_dict = {} - if calib_mode is not None and calib_mode != 'none': - if calib_mode == 'entropy': - if logger: - logger.info('Calculating optimal thresholds for quantization') - th_dict = _get_optimal_thresholds( - collector.hist_dict, quantized_dtype, logger=logger) - elif calib_mode == 'naive': - th_dict = collector.min_max_dict - elif calib_mode == 'customize': - th_dict = collector.min_max_dict - else: - raise ValueError('unknown calibration mode %s received,' - ' expected `none`, `naive`, `entropy` or `customize`' % calib_mode) - qsym = _calibrate_quantized_sym(qsym, th_dict) - else: - raise ValueError('please set calibration mode to naive or entropy.') - - if logger: - logger.info('Quantizing parameters') - qarg_params = _quantize_params(qsym, arg_params, th_dict) - - return qsym, qarg_params, aux_params diff --git a/python/mxnet/cython/base.pyi b/python/mxnet/cython/base.pyi index ed4bbf76a25e..3a7606b893f2 100644 --- a/python/mxnet/cython/base.pyi +++ b/python/mxnet/cython/base.pyi @@ -116,9 +116,13 @@ cdef extern from "mxnet/c_api.h": int MXInvokeCachedOpEx(CachedOpHandle handle, int num_inputs, NDArrayHandle *inputs, + int default_ctx_type, + int default_ctx_dev_id, int *num_outputs, NDArrayHandle **outputs, const int **out_stypes); int MXCachedOpRegisterOpHook(NDArrayHandle handle, CachedOpMonitorCallback callback, _bool monitor_all); + int MXCachedOpGetOptimizedSymbol(CachedOpHandle handle, + SymbolHandle *out); diff --git a/python/mxnet/cython/ndarray.pyx b/python/mxnet/cython/ndarray.pyx index 9e0504d306de..f13e65824aec 100644 --- a/python/mxnet/cython/ndarray.pyx +++ b/python/mxnet/cython/ndarray.pyx @@ -126,15 +126,36 @@ cdef class CachedOp: def __del__(self): CALL(MXFreeCachedOp(self.chandle)) - def __call__(self, *args, out=None): + def get_optimized_symbol(self): + """Get an optimized version of the symbol from the cached op. + + Returns + ------- + symbol : Symbol + Optimized symbol from the executor. 
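Hypothetical usage of the get_optimized_symbol accessor being added here (the toy symbol is illustrative; mx.nd.CachedOp is the class this diff extends):

    import mxnet as mx

    sym = 2 * mx.sym.var('a') + mx.sym.var('b')
    op = mx.nd.CachedOp(sym)
    optimized = op.get_optimized_symbol()   # Symbol after backend rewrites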
+ """ + from ..symbol import Symbol + cdef SymbolHandle shandle + CALL(MXCachedOpGetOptimizedSymbol(self.chandle, &shandle)) + ret = Symbol(_ctypes.cast(shandle, _ctypes.c_void_p)) + return ret + + def __call__(self, *args, out=None, default_ctx=None): """ctypes implementation of imperative invoke wrapper""" cdef vector[NDArrayHandle] ndvars cdef vector[NDArrayHandle] output_vars cdef NDArrayHandle* p_output_vars cdef NDArrayHandle ret_handle + cdef int default_ctx_type + cdef int default_ctx_dev_id cdef int num_output cdef const int* p_output_stypes + if len(args) == 1 and args[0] is None: + args = [] + assert default_ctx is not None, 'default_ctx is required if no input is provided' + else: + default_ctx = args[0].ctx if default_ctx is None else default_ctx for i in args: ndvars.push_back((i).chandle) @@ -157,6 +178,8 @@ cdef class CachedOp: self.chandle, len(args), &ndvars[0] if ndvars.size() != 0 else NULL, + (default_ctx.device_typeid), + (default_ctx.device_id), &num_output, &p_output_vars, &p_output_stypes)) diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index b42cfa8dd64a..85dba4b58858 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -19,92 +19,120 @@ # pylint: disable=invalid-name, protected-access, too-many-locals, too-many-arguments """Symbolic Executor component of MXNet.""" -from array import array as py_array -import ctypes -import copy import numpy as np -from .base import _LIB -from .base import mx_uint, NDArrayHandle, SymbolHandle, ExecutorHandle, py_str, mx_int -from .base import check_call, c_handle_array, c_array_buf, c_str_array from . import ndarray -from .ndarray import NDArray -from .ndarray import _ndarray_cls -def _monitor_callback_wrapper(callback): - """A wrapper for the user-defined handle.""" - def callback_handle(name, array, _): - """ ctypes function """ - callback(name, array) - return callback_handle - -class Executor(object): - """Executor is the object providing efficient symbolic graph execution and optimization. +class Executor: + """Executor is the object providing efficient symbolic and imperative graph + execution and optimization. Examples -------- >>> # typical approach to create an executor is to bind symbol - >>> a = mx.sym.Variable('a') - >>> b = mx.sym.Variable('b') + >>> a = mx.sym.var('a') + >>> b = mx.sym.var('b') >>> c = 2 * a + b - >>> texec = c.bind(mx.cpu(), {'a': mx.nd.array([1,2]), 'b':mx.nd.array([2,3])}) + >>> texec = c._bind(mx.cpu(), {'a': mx.nd.array([1,2]), 'b':mx.nd.array([2,3])}) """ - def __init__(self, handle, symbol, ctx, grad_req, group2ctx): - """Constructor, used Symbol.bind and Symbol.simple_bind instead. 
+ def __init__(self, sym, ctx, args, args_grad, grad_req, aux_states): + self.outputs = None + self._input_names = sym.list_inputs() + self._aux_names = sym.list_auxiliary_states() + self._arg_names = sym.list_arguments() + self._output_names = sym.list_outputs() + self._ctx = ctx + self._grad_req = grad_req + # grad_req + self._requires_grad = False + if isinstance(grad_req, dict): + for k, v in grad_req.items(): + if k in self._input_names and v != 'null': + self._requires_grad = True + else: + assert isinstance(grad_req, str) + self._requires_grad = grad_req != 'null' + + # args grad + self._args_grad = args_grad + if not self._args_grad: + self._args_grad = None + + # args + self._args = [None] * len(self._input_names) + if isinstance(args, dict): + for k, v in args.items(): + try: + i = self._input_names.index(k) + self._args[i] = v.copyto(ctx) + # ignore provided arg which is not present in + # input_names + except ValueError: + pass + else: + assert isinstance(args, (list, tuple)) + for i, arg in enumerate(args): + name = self._arg_names[i] + index = self._input_names.index(name) + self._args[index] = arg.copyto(ctx) + + # aux states + if aux_states: + if isinstance(aux_states, dict): + for k, v in aux_states.items(): + if k in self._aux_names: + i = self._input_names.index(k) + self._args[i] = v.copyto(ctx) + else: + assert isinstance(aux_states, (list, tuple)) + for i, v in enumerate(aux_states): + index = self._input_names.index(self._aux_names[i]) + self._args[index] = v.copyto(ctx) + + # arg grad + if self._args_grad: + if isinstance(self._args_grad, dict): + for k, g in self._args_grad.items(): + try: + i = self._input_names.index(k) + # get req + if isinstance(grad_req, str): + req = grad_req + else: + assert isinstance(grad_req, dict) + req = grad_req[k] + if req != 'null': + with self._ctx: + self._args[i].attach_grad(req, stype=g.stype) + self._args[i].grad[:] = g + # ignore provided arg which is not present in + # input_names + except ValueError: + pass + else: + assert isinstance(self._args_grad, (list, tuple)) + for i, g in enumerate(self._args_grad): + # get req + if isinstance(grad_req, str): + req = grad_req + else: + assert isinstance(grad_req, dict) + req = grad_req[self._input_names[i]] + if req != 'null': + with self._ctx: + self._args[i].attach_grad(req, stype=g.stype) + self._args[i].grad[:] = g + self._cached_op = ndarray.CachedOp(sym) - Parameters - ---------- - handle: ExecutorHandle - ExecutorHandle generated by calling `bind`. - - See Also - -------- - Symbol.bind : to create executor. - """ - if not isinstance(handle, ExecutorHandle): - raise TypeError("Handle type error") - self.handle = handle - self.arg_arrays = [] - self.grad_arrays = [] - self.aux_arrays = [] - self.outputs = self._get_outputs() - self._symbol = copy.deepcopy(symbol) - self._optimized_symbol = None - self._arg_dict = None - self._grad_dict = None - self._aux_dict = None - self._output_dict = None - self._monitor_callback = None - self._ctx = copy.deepcopy(ctx) - self._grad_req = copy.deepcopy(grad_req) - self._group2ctx = copy.deepcopy(group2ctx) - - def __del__(self): - check_call(_LIB.MXExecutorFree(self.handle)) - - @staticmethod - def _get_dict(names, ndarrays): - """Get the dictionary given name and ndarray pairs.""" - nset = set() - for nm in names: - if nm in nset: - raise ValueError('Duplicate names detected, %s' % str(names)) - nset.add(nm) - return dict(zip(names, ndarrays)) - - def _get_outputs(self): - """List all the output NDArray. 
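The default_ctx calling convention added to CachedOp.__call__ earlier in this diff, sketched (op as in the previous note; zero_input_op is a hypothetical parameterless graph):

    out = op(mx.nd.array([1., 2.]), mx.nd.array([3., 4.]))  # ctx inferred from args[0]
    out = zero_input_op(None, default_ctx=mx.cpu())         # no inputs: ctx is mandatory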
+ def get_optimized_symbol(self): + """Get an optimized version of the symbol from the executor. Returns ------- - A list of ndarray bound to the heads of executor. + symbol : Symbol + Optimized symbol from the executor. """ - out_size = mx_uint() - handles = ctypes.POINTER(NDArrayHandle)() - check_call(_LIB.MXExecutorOutputs(self.handle, - ctypes.byref(out_size), ctypes.byref(handles))) - num_output = out_size.value - outputs = [_ndarray_cls(NDArrayHandle(handles[i])) for i in range(num_output)] - return outputs + return self._cached_op.get_optimized_symbol() + def forward(self, is_train=False, **kwargs): """Calculate the outputs specified by the bound symbol. @@ -129,25 +157,36 @@ def forward(self, is_train=False, **kwargs): >>> outputs = texec.forward(is_train=True, data=mydata) >>> print(outputs[0].asnumpy()) """ - if len(kwargs) != 0: - arg_dict = self.arg_dict + if kwargs: for name, array in kwargs.items(): - if not isinstance(array, (NDArray, np.ndarray)): - raise ValueError('only accept keyword argument of NDArrays and numpy.ndarray') - if name not in arg_dict: - raise TypeError('Unknown argument %s' % name) - if arg_dict[name].shape != array.shape: - raise ValueError('Shape not match! Argument %s, need: %s, received: %s' - %(name, str(arg_dict[name].shape), str(array.shape))) - arg_dict[name][:] = array - - check_call(_LIB.MXExecutorForward( - self.handle, - ctypes.c_int(int(is_train)))) - self.outputs = self._get_outputs() + if name in self._input_names: + index = self._input_names.index(name) + with self._ctx: + arr = ndarray.array(array, dtype=array.dtype) + if self._args[index] is None: + self._args[index] = arr + # get req + if isinstance(self._grad_req, str): + req = self._grad_req + else: + assert isinstance(self._grad_req, dict) + req = self._grad_req[name] + if req != 'null': + with self._ctx: + self._args[index].attach_grad(req) + else: + self._args[index][:] = arr + + from . import autograd + default_ctx = None if self._input_names else self._ctx + with autograd.record(train_mode=is_train): + self.outputs = self._cached_op(*self._args, + default_ctx=default_ctx) + if not isinstance(self.outputs, (list, tuple)): + self.outputs = [self.outputs] return self.outputs - def backward(self, out_grads=None, is_train=True): + def backward(self, out_grads=None): """Do backward pass to get the gradient of arguments. Parameters @@ -162,47 +201,72 @@ def backward(self, out_grads=None, is_train=True): during inference. """ - if out_grads is None: - out_grads = [] - elif isinstance(out_grads, NDArray): - out_grads = [out_grads] - elif isinstance(out_grads, dict): - out_grads = [out_grads[k] for k in self._symbol.list_outputs()] - - for obj in out_grads: - if not isinstance(obj, NDArray): - raise TypeError("inputs must be NDArray") - handle_array = c_handle_array(out_grads) - check_call(_LIB.MXExecutorBackwardEx( - self.handle, - mx_uint(len(out_grads)), - handle_array, - ctypes.c_int(is_train))) - - def set_monitor_callback(self, callback, monitor_all=False): - """Install callback for monitor. + from . 
import autograd + if out_grads is not None: + if not isinstance(out_grads, (list, tuple)): + out_grads = [out_grads] + out_grads = [o.copyto(self._ctx) for o in out_grads] + + if self._requires_grad: + if self.outputs is None: + self.forward() + autograd.backward(self.outputs, head_grads=out_grads) + + if isinstance(self._args_grad, dict): + for k, v in self._args_grad.items(): + try: + i = self._input_names.index(k) + if self._args[i].grad is not None: + v[:] = self._args[i].grad + # ignore provided arg grad which is not present in + # input_names + except ValueError: + pass + else: + assert isinstance(self._args_grad, (list, tuple)) + for arg, out in zip(self._args, self._args_grad): + if arg.grad is not None: + out[:] = arg.grad - Parameters - ---------- - callback : function - Takes a string and an NDArrayHandle. - monitor_all : bool, default False - If true, monitor both input and output, otherwise monitor output only. + @property + def aux_arrays(self): + """the auxiliary argument array""" + assert isinstance(self._args, list) + aux_array = [] + for name in self._aux_names: + index = self._input_names.index(name) + aux_array.append(self._args[index]) + return aux_array - Examples - -------- - >>> def mon_callback(*args, **kwargs): - >>> print("Do your stuff here.") - >>> - >>> texe.set_monitor_callback(mon_callback) - """ - cb_type = ctypes.CFUNCTYPE(None, ctypes.c_char_p, NDArrayHandle, ctypes.c_void_p) - self._monitor_callback = cb_type(_monitor_callback_wrapper(callback)) - check_call(_LIB.MXExecutorSetMonitorCallbackEX( - self.handle, - self._monitor_callback, - None, - ctypes.c_int(monitor_all))) + @property + def arg_arrays(self): + """the argument array""" + assert isinstance(self._args, list) + arg_array = [] + for name in self._arg_names: + index = self._input_names.index(name) + arg_array.append(self._args[index]) + return arg_array + + @property + def grad_arrays(self): + """the gradient array""" + if isinstance(self._args_grad, (list, tuple)): + return list(self._args_grad) + + arr = [None] * len(self._arg_names) + if self._args_grad: + assert isinstance(self._args_grad, dict) + for k, _ in self._args_grad.items(): + try: + i = self._input_names.index(k) + j = self._arg_names.index(k) + arr[j] = self._args[i].grad + # ignore provided arg grad which is not present in + # input_names + except ValueError: + pass + return arr @property def arg_dict(self): @@ -217,24 +281,11 @@ def arg_dict(self): ------ ValueError : if there are duplicated names in the arguments. """ - if self._arg_dict is None: - self._arg_dict = Executor._get_dict( - self._symbol.list_arguments(), self.arg_arrays) - return self._arg_dict - - @property - def grad_dict(self): - """Get dictionary representation of gradient arrays. - - Returns - ------- - grad_dict : dict of str to NDArray - The dictionary that maps name of arguments to gradient arrays. - """ - if self._grad_dict is None: - self._grad_dict = Executor._get_dict( - self._symbol.list_arguments(), self.grad_arrays) - return self._grad_dict + ret = {} + for k, v in zip(self._input_names, self._args): + if k in self._arg_names: + ret[k] = v + return ret @property def aux_dict(self): @@ -249,10 +300,26 @@ def aux_dict(self): ------ ValueError : if there are duplicated names in the auxiliary states.
""" - if self._aux_dict is None: - self._aux_dict = Executor._get_dict( - self._symbol.list_auxiliary_states(), self.aux_arrays) - return self._aux_dict + ret = {} + for k, v in zip(self._input_names, self._args): + if k in self._aux_names: + ret[k] = v + return ret + + @property + def grad_dict(self): + """Get dictionary representation of gradient arrays. + + Returns + ------- + grad_dict : dict of str to NDArray + The dictionary that maps name of arguments to gradient arrays. + """ + ret = {} + for k, v in zip(self._input_names, self._args): + if k in self._arg_names: + ret[k] = v.grad + return ret @property def output_dict(self): @@ -267,10 +334,10 @@ def output_dict(self): ------ ValueError : if there are duplicated names in the outputs. """ - if self._output_dict is None: - self._output_dict = Executor._get_dict( - self._symbol.list_outputs(), self.outputs) - return self._output_dict + ret = {} + for k, v in zip(self._output_names, self.outputs): + ret[k] = v + return ret def copy_params_from(self, arg_params, aux_params=None, allow_extra_params=False): """Copy parameters from arg_params, aux_params into executor's internal array. @@ -324,157 +391,3 @@ def copy_params_from(self, arg_params, aux_params=None, allow_extra_params=False array.astype(dst.dtype).copyto(dst) elif not allow_extra_params: raise ValueError('Find name %s that is not in the auxiliary states' % name) - - def reshape(self, partial_shaping=False, allow_up_sizing=False, **kwargs): - """Return a new executor with the same symbol and shared memory, - but different input/output shapes. - For runtime reshaping, variable length sequences, etc. - The returned executor shares state with the current one, - and cannot be used in parallel with it. - - Parameters - ---------- - partial_shaping : bool - Whether to allow changing the shape of unspecified arguments. - allow_up_sizing : bool - Whether to allow allocating new ndarrays that's larger than the original. - kwargs : dict of string to tuple of int - New shape for arguments. - - Returns - ------- - exec : Executor - A new executor that shares memory with self. - - Examples - -------- - >>> a = mx.sym.Variable('a') - >>> b = mx.sym.Variable('b') - >>> c = 2 * a + b - >>> texec = c.bind(mx.cpu(), {'a': mx.nd.zeros((2, 1)), 'b': mx.nd.ones((2,1))}) - >>> new_shape = {'a': (4, 2), 'b': (4, 2)} - >>> texec.reshape(allow_up_sizing=True, **new_shape) - """ - # pylint: disable=too-many-branches - provided_arg_shape_data = [] # shape data - # argument shape index in sdata, - # e.g. 
[sdata[indptr[0]], sdata[indptr[1]]) is the shape of the first arg - provided_arg_shape_idx = [0] - provided_arg_shape_names = [] # provided argument names - for k, v in kwargs.items(): - if isinstance(v, tuple): - provided_arg_shape_names.append(k) - provided_arg_shape_data.extend(v) - provided_arg_shape_idx.append(len(provided_arg_shape_data)) - - ctx_map_keys = [] - ctx_map_dev_types = [] - ctx_map_dev_ids = [] - - if self._group2ctx: - for key, val in self._group2ctx.items(): - ctx_map_keys.append(key) - ctx_map_dev_types.append(val.device_typeid) - ctx_map_dev_ids.append(val.device_id) - - handle = ExecutorHandle() - shared_handle = self.handle - - num_in_args = ctypes.c_uint() - in_arg_handles = ctypes.POINTER(NDArrayHandle)() - arg_grad_handles = ctypes.POINTER(NDArrayHandle)() - num_aux_states = ctypes.c_uint() - aux_state_handles = ctypes.POINTER(NDArrayHandle)() - - check_call(_LIB.MXExecutorReshapeEx(ctypes.c_int(int(partial_shaping)), - ctypes.c_int(int(allow_up_sizing)), - ctypes.c_int(self._ctx.device_typeid), - ctypes.c_int(self._ctx.device_id), - mx_uint(len(ctx_map_keys)), - c_str_array(ctx_map_keys), - c_array_buf(ctypes.c_int, - py_array('i', ctx_map_dev_types)), - c_array_buf(ctypes.c_int, - py_array('i', ctx_map_dev_ids)), - mx_uint(len(provided_arg_shape_names)), - c_str_array(provided_arg_shape_names), - c_array_buf(mx_int, - py_array('i', provided_arg_shape_data)), - c_array_buf(mx_uint, - py_array('I', provided_arg_shape_idx)), - ctypes.byref(num_in_args), - ctypes.byref(in_arg_handles), - ctypes.byref(arg_grad_handles), - ctypes.byref(num_aux_states), - ctypes.byref(aux_state_handles), - shared_handle, - ctypes.byref(handle))) - - arg_arrays = [_ndarray_cls(NDArrayHandle(in_arg_handles[i])) - for i in range(num_in_args.value)] - grad_arrays = [_ndarray_cls(NDArrayHandle(arg_grad_handles[i])) - if arg_grad_handles[i] is not None - else None for i in range(num_in_args.value)] - aux_arrays = [_ndarray_cls(NDArrayHandle(aux_state_handles[i])) - for i in range(num_aux_states.value)] - - executor = Executor(handle, self._symbol, self._ctx, self._grad_req, self._group2ctx) - executor.arg_arrays = arg_arrays - executor.grad_arrays = grad_arrays - executor.aux_arrays = aux_arrays - return executor - - def debug_str(self): - """Get a debug string about internal execution plan. - - Returns - ------- - debug_str : string - Debug string of the executor. - - Examples - -------- - >>> a = mx.sym.Variable('a') - >>> b = mx.sym.sin(a) - >>> c = 2 * a + b - >>> texec = c.bind(mx.cpu(), {'a': mx.nd.array([1,2]), 'b':mx.nd.array([2,3])}) - >>> print(texec.debug_str()) - Symbol Outputs: - output[0]=_plus0(0) - Variable:a - -------------------- - Op:_mul_scalar, Name=_mulscalar0 - Inputs: - arg[0]=a(0) version=0 - Attrs: - scalar=2 - -------------------- - Op:sin, Name=sin0 - Inputs: - arg[0]=a(0) version=0 - -------------------- - Op:elemwise_add, Name=_plus0 - Inputs: - arg[0]=_mulscalar0(0) - arg[1]=sin0(0) - Total 0 MB allocated - Total 11 TempSpace resource requested - """ - debug_str = ctypes.c_char_p() - check_call(_LIB.MXExecutorPrint( - self.handle, ctypes.byref(debug_str))) - return py_str(debug_str.value) - - def get_optimized_symbol(self): - """Get an optimized version of the symbol from the executor. - - Returns - ------- - symbol : Symbol - Optimized symbol from the executor. 
- """ - from .symbol import Symbol - sym_handle = SymbolHandle() - check_call(_LIB.MXExecutorGetOptimizedSymbol(self.handle, ctypes.byref(sym_handle))) - ret = Symbol(sym_handle) - return ret diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index 6bb2147f0d7c..d5bf41614749 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -19,14 +19,13 @@ # pylint: disable= arguments-differ """Custom neural network layers in model_zoo.""" -__all__ = ['Concurrent', 'HybridConcurrent', 'Identity', 'SparseEmbedding', +__all__ = ['Concurrent', 'HybridConcurrent', 'Identity', 'SyncBatchNorm', 'PixelShuffle1D', 'PixelShuffle2D', 'PixelShuffle3D'] import warnings from .... import ndarray as nd, context -from ...block import HybridBlock, Block -from ...parameter import Parameter +from ...block import HybridBlock from ...nn import Sequential, HybridSequential, BatchNorm class Concurrent(Sequential): @@ -110,53 +109,6 @@ def __init__(self): def hybrid_forward(self, F, x): return x -class SparseEmbedding(Block): - r"""Turns non-negative integers (indexes/tokens) into dense vectors - of fixed size. eg. [4, 20] -> [[0.25, 0.1], [0.6, -0.2]] - - This SparseBlock is designed for distributed training with extremely large - input dimension. Both weight and gradient w.r.t. weight are `RowSparseNDArray`. - - Note: if `sparse_grad` is set to True, the gradient w.r.t weight will be - sparse. Only a subset of optimizers support sparse gradients, including SGD, AdaGrad - and Adam. By default lazy updates is turned on, which may perform differently - from standard updates. For more details, please check the Optimization API at: - https://mxnet.incubator.apache.org/api/python/optimization/optimization.html - - Parameters - ---------- - input_dim : int - Size of the vocabulary, i.e. maximum integer index + 1. - output_dim : int - Dimension of the dense embedding. - dtype : str or np.dtype, default 'float32' - Data type of output embeddings. - weight_initializer : Initializer - Initializer for the `embeddings` matrix. - - Inputs: - - **data**: (N-1)-D tensor with shape: `(x1, x2, ..., xN-1)`. - Output: - - **out**: N-D tensor with shape: `(x1, x2, ..., xN-1, output_dim)`. - """ - def __init__(self, input_dim, output_dim, dtype='float32', - weight_initializer=None, **kwargs): - super(SparseEmbedding, self).__init__(**kwargs) - self._kwargs = {'input_dim': input_dim, 'output_dim': output_dim, - 'dtype': dtype, 'sparse_grad': True} - self.weight = Parameter('weight', shape=(input_dim, output_dim), - init=weight_initializer, dtype=dtype, - grad_stype='row_sparse', stype='row_sparse') - - def forward(self, x): - weight = self.weight.row_sparse_data(x) - return nd.Embedding(x, weight, name='fwd', **self._kwargs) - - def __repr__(self): - s = '{block_name}({input_dim} -> {output_dim}, {dtype})' - return s.format(block_name=self.__class__.__name__, - **self._kwargs) - class SyncBatchNorm(BatchNorm): """Cross-GPU Synchronized Batch normalization (SyncBN) diff --git a/python/mxnet/monitor.py b/python/mxnet/monitor.py deleted file mode 100644 index ce19ad0ca8ac..000000000000 --- a/python/mxnet/monitor.py +++ /dev/null @@ -1,145 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable=protected-access, logging-format-interpolation, invalid-name, no-member, too-many-branches -"""Monitor outputs, weights, and gradients for debugging.""" - -import re -import ctypes -import logging -from math import sqrt - -from .ndarray import NDArray -from .base import NDArrayHandle, py_str -from . import ndarray - - -class Monitor(object): - """Monitor inputs, outputs, weights, and gradients for debugging. - - Parameters - ---------- - interval : int - Number of batches between printing. - stat_func : function - A function that computes statistics of tensors. - Takes an `NDArray` and returns an `NDArray`. Defaults to mean - absolute value abs(x)/size(x). - pattern : str - A regular expression specifying which tensors to monitor. - Only tensors with names that match `name_pattern` will be included. - For example, '.*weight|.*output' will print all weights and outputs and - '.*backward.*' will print all gradients. - monitor_all : bool, default False - If true, monitor both input and output, otherwise monitor output only. - """ - def __init__(self, interval, stat_func=None, pattern='.*', sort=False, monitor_all=False): - if stat_func is None: - def asum_stat(x): - """returns |x|/size(x), async execution.""" - return ndarray.norm(x)/sqrt(x.size) - stat_func = asum_stat - self.stat_func = stat_func - self.interval = interval - self.activated = False - self.queue = [] - self.step = 0 - self.exes = [] - self.re_prog = re.compile(pattern) - self.sort = sort - self.monitor_all = monitor_all - def stat_helper(name, array): - """wrapper for executor callback""" - array = ctypes.cast(array, NDArrayHandle) - array = NDArray(array, writable=False) - if not self.activated or not self.re_prog.match(py_str(name)): - return - self.queue.append((self.step, py_str(name), self.stat_func(array))) - self.stat_helper = stat_helper - - def install(self, exe): - """install callback to executor. - Supports installing to multiple exes. - - Parameters - ---------- - exe : mx.executor.Executor - The Executor (returned by symbol.bind) to install to. - """ - exe.set_monitor_callback(self.stat_helper, self.monitor_all) - self.exes.append(exe) - - def tic(self): - """Start collecting stats for current batch. - Call before calling forward.""" - if self.step % self.interval == 0: - for exe in self.exes: - for array in exe.arg_arrays: - array.wait_to_read() - for array in exe.aux_arrays: - array.wait_to_read() - self.queue = [] - self.activated = True - self.step += 1 - - - def toc(self): - """End collecting for current batch and return results. - Call after computation of current batch. 
- - Returns - ------- - res : list of """ - if not self.activated: - return [] - for exe in self.exes: - for array in exe.arg_arrays: - array.wait_to_read() - for array in exe.aux_arrays: - array.wait_to_read() - for exe in self.exes: - for name, array in zip(exe._symbol.list_arguments(), exe.arg_arrays): - if self.re_prog.match(name): - self.queue.append((self.step, name, self.stat_func(array))) - for name, array in zip(exe._symbol.list_auxiliary_states(), exe.aux_arrays): - if self.re_prog.match(name): - self.queue.append((self.step, name, self.stat_func(array))) - self.activated = False - res = [] - if self.sort: - self.queue.sort(key=lambda x: x[1]) - for n, k, v_list in self.queue: - if isinstance(v_list, NDArray): - v_list = [v_list] - assert isinstance(v_list, list) - s = '' - for v in v_list: - assert isinstance(v, NDArray) - if v.shape == (1,): - s += str(v.asscalar()) + '\t' - else: - s += str(v.asnumpy()) + '\t' - res.append((n, k, s)) - self.queue = [] - return res - - def toc_print(self): - """End collecting and print results.""" - res = self.toc() - for n, k, v in res: - logging.info('Batch: {:7d} {:30s} {:s}'.format(n, k, v)) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 49a4406aa0ff..9cc8b8942c1d 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -154,13 +154,17 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): """ hdl = NDArrayHandle() if _int64_enabled(): + if np.dtype(dtype) == np.dtype([('bfloat16', np.uint16)]): + dtype_type = np.dtype(dtype) + else: + dtype_type = np.dtype(dtype).type check_call(_LIB.MXNDArrayCreateEx64( c_array_buf(mx_int64, native_array('q', shape)), ctypes.c_int(len(shape)), ctypes.c_int(ctx.device_typeid), ctypes.c_int(ctx.device_id), ctypes.c_int(int(delay_alloc)), - ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), + ctypes.c_int(int(_DTYPE_NP_TO_MX[dtype_type])), ctypes.byref(hdl))) else: # When shape is larger than unit32 then there is an overflow error at python end itself. @@ -2816,7 +2820,7 @@ def attach_grad(self, grad_req='write', stype=None): """ from . import zeros as _zeros if stype is not None: - grad = _zeros(self.shape, stype=stype) + grad = _zeros(self.shape, stype=stype, dtype=self.dtype) else: grad = op.zeros_like(self) # pylint: disable=undefined-variable grad_req = _GRAD_REQ_MAP[grad_req] diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 5fb1b8c4e700..39b8799ce155 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -33,12 +33,11 @@ from ..attribute import AttrScope from ..base import _LIB, numeric_types, c_array, c_array_buf, c_str, c_str_array, c_handle_array from ..base import mx_uint, py_str, string_types, integer_types, mx_int, mx_int64 -from ..base import NDArrayHandle, ExecutorHandle, SymbolHandle +from ..base import NDArrayHandle, SymbolHandle from ..base import check_call, MXNetError, NotImplementedForSymbol from ..context import Context, current_context -from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP, _GRAD_REQ_MAP +from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP from ..ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID, _int64_enabled, _SIGNED_INT32_UPPER_LIMIT -from ..ndarray import _ndarray_cls from ..executor import Executor from . import _internal from . 
import op @@ -1548,11 +1547,9 @@ def optimize_for(self, backend, args=None, aux=None, ctx=None, **kwargs): # return modified symbol return Symbol(out) - # pylint: disable=too-many-locals - def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None, - group2ctx=None, shared_arg_names=None, shared_exec=None, - shared_buffer=None, **kwargs): + def _simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None, + **kwargs): """Bind current symbol to get an executor, allocate all the arguments needed. Allows specifying data types. @@ -1597,23 +1594,6 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None, stype_dict : Dict of str->str Input storage type dictionary, name->storage_type - group2ctx : Dict of string to mx.Context - The dict mapping the `ctx_group` attribute to the context assignment. - - shared_arg_names : List of string - The argument names whose `NDArray` of shared_exec can be reused for initializing - the current executor. - - shared_exec : Executor - The executor whose arg_arrays, arg_arrays, grad_arrays, and aux_arrays can be - reused for initializing the current executor. - - shared_buffer : Dict of string to `NDArray` - The dict mapping argument names to the `NDArray` that can be reused for initializing - the current executor. This buffer will be checked for reuse if one argument name - of the current executor is not found in `shared_arg_names`. The `NDArray` s are - expected have default storage type. - kwargs : Dict of str->shape Input shape dictionary, name->shape @@ -1622,238 +1602,58 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None, executor : mxnet.Executor The generated executor """ - # data types - num_provided_arg_types = 0 - provided_arg_type_names = ctypes.POINTER(ctypes.c_char_p)() # provided type argument names - provided_arg_type_data = ctypes.POINTER(mx_uint)() # provided types - if type_dict is not None: - provided_arg_type_names = [] - provided_arg_type_data = [] - for k, v in type_dict.items(): - v = _numpy.dtype(v).type - if v in _DTYPE_NP_TO_MX: - provided_arg_type_names.append(k) - provided_arg_type_data.append(_DTYPE_NP_TO_MX[v]) - num_provided_arg_types = mx_uint(len(provided_arg_type_names)) - provided_arg_type_names = c_str_array(provided_arg_type_names) - provided_arg_type_data = c_array_buf(ctypes.c_int, array('i', provided_arg_type_data)) - - # storage types - num_provided_arg_stypes = 0 - # provided storage type argument names - provided_arg_stype_names = ctypes.POINTER(ctypes.c_char_p)() - provided_arg_stype_data = ctypes.POINTER(mx_uint)() # provided storage types - if stype_dict is not None: - provided_arg_stype_names = [] - provided_arg_stype_data = [] - for k, v in stype_dict.items(): - if v in _STORAGE_TYPE_STR_TO_ID: - provided_arg_stype_names.append(k) - provided_arg_stype_data.append(_STORAGE_TYPE_STR_TO_ID[v]) - num_provided_arg_stypes = mx_uint(len(provided_arg_stype_names)) - provided_arg_stype_names = c_str_array(provided_arg_stype_names) - provided_arg_stype_data = c_array_buf(ctypes.c_int, array('i', provided_arg_stype_data)) - - provided_arg_shape_data = [] # shape data - # argument shape index in sdata, - # e.g. 
[sdata[indptr[0]], sdata[indptr[1]]) is the shape of the first arg - provided_arg_shape_idx = [0] - provided_arg_shape_names = [] # provided argument names - for k, v in kwargs.items(): - # if k not in listed_arguments and k not in listed_aux_states: - # raise ValueError('arg name %s is not valid', k) - if isinstance(v, tuple): - provided_arg_shape_names.append(k) - provided_arg_shape_data.extend(v) - provided_arg_shape_idx.append(len(provided_arg_shape_data)) - - provided_req_type_list_len = 0 - provided_grad_req_types = ctypes.POINTER(ctypes.c_char_p)() - provided_grad_req_names = ctypes.POINTER(ctypes.c_char_p)() - if grad_req is not None: - if isinstance(grad_req, string_types): - # use provided_req_type_list_len = 0 to indicate this situation - provided_req_type_list_len = 0 - provided_grad_req_types = [grad_req] - elif isinstance(grad_req, list): - if len(grad_req) == 0: - raise RuntimeError('grad_req in simple_bind cannot be an empty list') - provided_grad_req_types = grad_req - provided_req_type_list_len = len(provided_grad_req_types) - elif isinstance(grad_req, dict): - if len(grad_req) == 0: - raise RuntimeError('grad_req in simple_bind cannot be an empty dict') - provided_grad_req_names = [] - provided_grad_req_types = [] - for k, v in grad_req.items(): - provided_grad_req_names.append(k) - provided_grad_req_types.append(v) - provided_grad_req_names = c_str_array(provided_grad_req_names) - provided_req_type_list_len = len(provided_grad_req_types) - provided_grad_req_types = c_str_array(provided_grad_req_types) - - num_ctx_map_keys = mx_uint(0) - ctx_map_keys = ctypes.POINTER(ctypes.c_char_p)() - ctx_map_dev_types = ctypes.POINTER(ctypes.c_int)() - ctx_map_dev_ids = ctypes.POINTER(ctypes.c_int)() - if group2ctx is not None: - ctx_map_keys = [] - ctx_map_dev_types = [] - ctx_map_dev_ids = [] - for key, val in group2ctx.items(): - ctx_map_keys.append(key) - ctx_map_dev_types.append(val.device_typeid) - ctx_map_dev_ids.append(val.device_id) - num_ctx_map_keys = mx_uint(len(ctx_map_keys)) - ctx_map_keys = c_str_array(ctx_map_keys) - ctx_map_dev_types = c_array(ctypes.c_int, array('i', ctx_map_dev_types)) - ctx_map_dev_ids = c_array(ctypes.c_int, array('i', ctx_map_dev_ids)) - - # prepare param names - shared_arg_name_list = [] - if shared_arg_names is not None: - if not isinstance(shared_arg_names, list): - raise ValueError('shared_arg_names in simple_bind must be a list or None') - shared_arg_name_list = shared_arg_names - - # prepare shared_buffer - if shared_buffer is None: - shared_buffer_len = ctypes.c_int(-1) - shared_buffer_names = ctypes.POINTER(ctypes.c_char_p)() - shared_buffer_handles = ctypes.POINTER(NDArrayHandle)() + assert isinstance(grad_req, (str, dict)) + # infer shape + arg_shapes, _, aux_shapes = self.infer_shape(**kwargs) + type_dict = {} if type_dict is None else type_dict + # pre-initialize both dtype lists so they are defined even if infer_type fails + arg_dtypes, aux_dtypes = None, None + try: + arg_dtypes, _, aux_dtypes = self.infer_type(**type_dict) + except Exception: # pylint: disable=broad-except + pass + args = [None] * len(arg_shapes) if arg_shapes else [] + aux_states = [None] * len(aux_shapes) if aux_shapes else [] + + arg_names = self.list_arguments() + aux_names = self.list_auxiliary_states() + + from ..ndarray import zeros as nd_zeros + if arg_shapes: + for i, shape in enumerate(arg_shapes): + if arg_dtypes: + args[i] = nd_zeros(shape, dtype=arg_dtypes[i]) + else: + args[i] = nd_zeros(shape) + if aux_shapes: + for i, shape in enumerate(aux_shapes): + if aux_dtypes: + aux_states[i] = nd_zeros(shape, dtype=aux_dtypes[i]) + else:
+ aux_states[i] = nd_zeros(shape) + + if stype_dict: + for name, stype in stype_dict.items(): + if name in arg_names: + index = arg_names.index(name) + args[index] = args[index].tostype(stype) + else: + assert name in aux_names + index = aux_names.index(name) + aux_states[index] = aux_states[index].tostype(stype) + + if grad_req == 'null': + args_grad = None + elif isinstance(grad_req, dict): + args_grad = {} + for i, name in enumerate(arg_names): + if grad_req[name] != 'null': + args_grad[name] = args[i].copy() else: - if not isinstance(shared_buffer, dict): - raise ValueError('shared_buffer in simple_bind must be dict or None') - buffer_names = shared_buffer.keys() - buffer_arrays = shared_buffer.values() - for v in buffer_arrays: - assert(v.stype == 'default'), \ - "shared_buffer is expected to only contain NDArrays with default storage" - shared_buffer_names = c_str_array(buffer_names) - shared_buffer_len = ctypes.c_int(len(buffer_arrays)) - shared_buffer_handles = c_handle_array(buffer_arrays) - updated_shared_buffer_names = ctypes.POINTER(ctypes.c_char_p)() - updated_shared_buffer_handles = ctypes.POINTER(NDArrayHandle)() - - # prepare shared_exec_handle - shared_exec_handle = shared_exec.handle if shared_exec is not None else ExecutorHandle() - - # prepare current executor handle - exe_handle = ExecutorHandle() - - # prepare current executor's in_args, arg_grads, and aux_states - num_in_args = ctypes.c_uint() - in_arg_handles = ctypes.POINTER(NDArrayHandle)() - arg_grad_handles = ctypes.POINTER(NDArrayHandle)() - num_aux_states = ctypes.c_uint() - aux_state_handles = ctypes.POINTER(NDArrayHandle)() + args_grad = [x.copy() for x in args] + return Executor(self, ctx, args, args_grad, grad_req, aux_states) - try: - if _int64_enabled(): - check_call(_LIB.MXExecutorSimpleBindEx64(self.handle, - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), - num_ctx_map_keys, - ctx_map_keys, - ctx_map_dev_types, - ctx_map_dev_ids, - mx_uint(provided_req_type_list_len), - provided_grad_req_names, - provided_grad_req_types, - mx_uint(len(provided_arg_shape_names)), - c_str_array(provided_arg_shape_names), - c_array_buf(mx_int64, - array('q', provided_arg_shape_data)), - c_array_buf(mx_uint, - array('i', provided_arg_shape_idx)), - num_provided_arg_types, - provided_arg_type_names, - provided_arg_type_data, - num_provided_arg_stypes, - provided_arg_stype_names, - provided_arg_stype_data, - mx_uint(len(shared_arg_name_list)), - c_str_array(shared_arg_name_list), - ctypes.byref(shared_buffer_len), - shared_buffer_names, - shared_buffer_handles, - ctypes.byref(updated_shared_buffer_names), - ctypes.byref(updated_shared_buffer_handles), - ctypes.byref(num_in_args), - ctypes.byref(in_arg_handles), - ctypes.byref(arg_grad_handles), - ctypes.byref(num_aux_states), - ctypes.byref(aux_state_handles), - shared_exec_handle, - ctypes.byref(exe_handle))) - else: - check_call(_LIB.MXExecutorSimpleBindEx(self.handle, - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), - num_ctx_map_keys, - ctx_map_keys, - ctx_map_dev_types, - ctx_map_dev_ids, - mx_uint(provided_req_type_list_len), - provided_grad_req_names, - provided_grad_req_types, - mx_uint(len(provided_arg_shape_names)), - c_str_array(provided_arg_shape_names), - c_array_buf(mx_int, - array('I', provided_arg_shape_data)), - c_array_buf(mx_uint, - array('i', provided_arg_shape_idx)), - num_provided_arg_types, - provided_arg_type_names, - provided_arg_type_data, - num_provided_arg_stypes, - provided_arg_stype_names, -
provided_arg_stype_data, - mx_uint(len(shared_arg_name_list)), - c_str_array(shared_arg_name_list), - ctypes.byref(shared_buffer_len), - shared_buffer_names, - shared_buffer_handles, - ctypes.byref(updated_shared_buffer_names), - ctypes.byref(updated_shared_buffer_handles), - ctypes.byref(num_in_args), - ctypes.byref(in_arg_handles), - ctypes.byref(arg_grad_handles), - ctypes.byref(num_aux_states), - ctypes.byref(aux_state_handles), - shared_exec_handle, - ctypes.byref(exe_handle))) - except MXNetError as e: - error_msg = "simple_bind error. Arguments:\n" - for k, v in kwargs.items(): - error_msg += "%s: %s\n" % (k, v) - error_msg += "%s" % e - raise RuntimeError(error_msg) - - # update shared_buffer - if shared_buffer is not None: - for i in range(shared_buffer_len.value): - k = py_str(updated_shared_buffer_names[i]) - v = NDArray(NDArrayHandle(updated_shared_buffer_handles[i])) - shared_buffer[k] = v - - # create in_args, arg_grads, and aux_states for the current executor - arg_arrays = [_ndarray_cls(NDArrayHandle(in_arg_handles[i])) - for i in range(num_in_args.value)] - grad_arrays = [_ndarray_cls(NDArrayHandle(arg_grad_handles[i])) - if arg_grad_handles[i] is not None - else None for i in range(num_in_args.value)] - aux_arrays = [_ndarray_cls(NDArrayHandle(aux_state_handles[i])) - for i in range(num_aux_states.value)] - - executor = Executor(exe_handle, self, ctx, grad_req, group2ctx) - executor.arg_arrays = arg_arrays - executor.grad_arrays = grad_arrays - executor.aux_arrays = aux_arrays - return executor - - def bind(self, ctx, args, args_grad=None, grad_req='write', - aux_states=None, group2ctx=None, shared_exec=None): + def _bind(self, ctx, args, args_grad=None, grad_req='write', + aux_states=None): """Binds the current symbol to an executor and returns it. We first declare the computation and then bind to the data to run. @@ -1866,7 +1666,7 @@ def bind(self, ctx, args, args_grad=None, grad_req='write', >>> b = mx.sym.Variable('b') >>> c = a + b - >>> ex = c.bind(ctx=mx.cpu(), args={'a' : mx.nd.ones([2,3]), 'b' : mx.nd.ones([2,3])}) + >>> ex = c._bind(ctx=mx.cpu(), args={'a' : mx.nd.ones([2,3]), 'b' : mx.nd.ones([2,3])}) >>> ex.forward() [] >>> ex.outputs[0].asnumpy() @@ -1916,14 +1716,6 @@ def bind(self, ctx, args, args_grad=None, grad_req='write', `auxiliary_states` to the corresponding `NDArray`, - In either case, all the auxiliary states need to be provided. - group2ctx : Dict of string to mx.Context - The dict mapping the `ctx_group` attribute to the context assignment. - - shared_exec : mx.executor.Executor - Executor to share memory with. This is intended for runtime reshaping, variable length - sequences, etc. The returned executor shares state with `shared_exec`, and should not be - used in parallel with it. - Returns ------- executor : Executor @@ -1941,74 +1733,8 @@ def bind(self, ctx, args, args_grad=None, grad_req='write', One can give up gradient by using a dict in `args_grad` and only specify gradient they interested in. 
""" - # pylint: disable=too-many-locals, too-many-branches - if not isinstance(ctx, Context): - raise TypeError("Context type error") - - listed_arguments = self.list_arguments() - args_handle, args = self._get_ndarray_inputs('args', args, listed_arguments, False) - # setup args gradient - if args_grad is None: - args_grad_handle = c_array(NDArrayHandle, [None] * len(args)) - else: - args_grad_handle, args_grad = self._get_ndarray_inputs( - 'args_grad', args_grad, listed_arguments, True) - - if aux_states is None: - aux_states = [] - aux_args_handle, aux_states = self._get_ndarray_inputs( - 'aux_states', aux_states, self.list_auxiliary_states(), False) - - # setup requirements - if isinstance(grad_req, string_types): - if grad_req not in _GRAD_REQ_MAP: - raise ValueError('grad_req must be in %s' % str(_GRAD_REQ_MAP)) - reqs_array = c_array_buf(mx_uint, - array('I', [_GRAD_REQ_MAP[grad_req]] * len(listed_arguments))) - elif isinstance(grad_req, list): - reqs_array = c_array_buf(mx_uint, - array('I', [_GRAD_REQ_MAP[item] for item in grad_req])) - elif isinstance(grad_req, dict): - req_array = [] - for name in listed_arguments: - if name in grad_req: - req_array.append(_GRAD_REQ_MAP[grad_req[name]]) - else: - req_array.append(0) - reqs_array = c_array_buf(mx_uint, array('I', req_array)) - - ctx_map_keys = [] - ctx_map_dev_types = [] - ctx_map_dev_ids = [] - - if group2ctx: - for key, val in group2ctx.items(): - ctx_map_keys.append(key) - ctx_map_dev_types.append(val.device_typeid) - ctx_map_dev_ids.append(val.device_id) - - handle = ExecutorHandle() - shared_handle = shared_exec.handle if shared_exec is not None else ExecutorHandle() - check_call(_LIB.MXExecutorBindEX(self.handle, - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), - mx_uint(len(ctx_map_keys)), - c_str_array(ctx_map_keys), - c_array_buf(ctypes.c_int, array('i', ctx_map_dev_types)), - c_array_buf(ctypes.c_int, array('i', ctx_map_dev_ids)), - mx_uint(len(args)), - args_handle, - args_grad_handle, - reqs_array, - mx_uint(len(aux_states)), - aux_args_handle, - shared_handle, - ctypes.byref(handle))) - executor = Executor(handle, self, ctx, grad_req, group2ctx) - executor.arg_arrays = args - executor.grad_arrays = args_grad - executor.aux_arrays = aux_states - return executor + assert isinstance(grad_req, (str, dict)) + return Executor(self, ctx, args, args_grad, grad_req, aux_states) def gradient(self, wrt): """Gets the autodiff of current symbol. @@ -2075,7 +1801,7 @@ def eval(self, ctx=None, **kwargs): """ if ctx is None: ctx = current_context() - return self.bind(ctx, kwargs).forward() + return self._bind(ctx, kwargs).forward() def reshape(self, *args, **kwargs): """Convenience fluent method for :py:func:`reshape`. diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index a8a797df7de7..cbfa49f490be 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -46,7 +46,6 @@ import mxnet as mx from .context import Context, current_context from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID -from .ndarray import array from .symbol import Symbol from .symbol.numpy import _Symbol as np_symbol from .util import use_np, use_np_default_dtype # pylint: disable=unused-import @@ -757,34 +756,6 @@ def assert_exception(f, exception_type, *args, **kwargs): return -def simple_forward(sym, ctx=None, is_train=False, **inputs): - """A simple forward function for a symbol. - - Primarily used in doctest to test the functionality of a symbol. 
- Takes NumPy arrays as inputs and outputs are also converted to NumPy arrays. - - Parameters - ---------- - ctx : Context - If ``None``, will take the default context. - inputs : keyword arguments - Mapping each input name to a NumPy array. - - Returns - ------- - The result as a numpy array. Multiple results will - be returned as a list of NumPy arrays. - """ - ctx = ctx or default_context() - inputs = {k: array(v) for k, v in inputs.items()} - exe = sym.bind(ctx, args=inputs) - exe.forward(is_train=is_train) - outputs = [x.asnumpy() for x in exe.outputs] - if len(outputs) == 1: - outputs = outputs[0] - return outputs - - def _parse_location(sym, location, ctx, dtype=default_dtype()): """Parses the given location to a ordered dictionary. @@ -983,7 +954,6 @@ def as_stype(var, stype, dtype): return approx_grads - def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rtol=1e-2, atol=None, grad_nodes=None, use_forward_train=True, ctx=None, grad_stype_dict=None, dtype=default_dtype()): @@ -1093,18 +1063,25 @@ def random_projection(shape): args_grad[k] = mx.nd.zeros(args_grad[k].shape, args_grad[k].context, args_grad[k].dtype, v) - executor = out.bind(ctx, grad_req=grad_req, - args=location, args_grad=args_grad, aux_states=aux_states) + grad_req["__random_proj"] = 'write' + executor = out._bind(ctx, grad_req=grad_req, + args=location, args_grad=args_grad, aux_states=aux_states) inps = executor.arg_arrays if len(inps) != len(location): raise ValueError("Executor arg_arrays and and location len do not match." "Got %d inputs and %d locations"%(len(inps), len(location))) - assert len(executor.outputs) == 1 executor.forward(is_train=True) + assert len(executor.outputs) == 1 executor.backward() - symbolic_grads = {k:executor.grad_dict[k].asnumpy() for k in grad_nodes} + symbolic_grads = {} + for k in grad_nodes: + grad_k = executor.grad_dict[k] + if grad_k is not None: + symbolic_grads[k] = grad_k.asnumpy() + else: + symbolic_grads[k] = None numeric_gradients = numeric_grad( executor, location_npy, aux_states_npy, @@ -1121,8 +1098,7 @@ def random_projection(shape): assert_almost_equal(fd_grad, sym_grad - orig_grad, rtol, atol, ("NUMERICAL_%s"%name, "BACKWARD_%s"%name)) elif grad_req[name] == 'null': - assert_almost_equal(orig_grad, sym_grad, rtol, atol, - ("NUMERICAL_%s"%name, "BACKWARD_%s"%name)) + assert sym_grad is None else: raise ValueError("Invalid grad_req %s for argument %s"%(grad_req[name], name)) @@ -1192,7 +1168,7 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None, args_grad_data = {k:mx.nd.empty(v.shape, ctx=ctx, dtype=v.dtype if dtype == "asnumpy" else dtype) \ for k, v in location.items()} - executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states) + executor = sym._bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states) for g in executor.grad_arrays: if g.ndim == 0: g[()] = 0 @@ -1262,7 +1238,7 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol= >>> mat2 = np.array([[5, 6], [7, 8]]) >>> grad1 = mx.nd.zeros(shape) >>> grad2 = mx.nd.zeros(shape) - >>> exec_add = sym_add.bind(default_context(), args={'lhs': mat1, 'rhs': mat2}, + >>> exec_add = sym_add._bind(default_context(), args={'lhs': mat1, 'rhs': mat2}, ... 
args_grad={'lhs': grad1, 'rhs': grad2}, grad_req={'lhs': 'write', 'rhs': 'write'}) >>> exec_add.forward(is_train=True) >>> ograd = mx.nd.ones(shape) @@ -1299,29 +1275,31 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol= elif isinstance(grad_req, (list, tuple)): grad_req = {k:v for k, v in zip(sym.list_arguments(), grad_req)} - executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, - aux_states=aux_states, grad_req=grad_req) - executor.forward(is_train=True) + executor = sym._bind(ctx=ctx, args=location, args_grad=args_grad_data, + aux_states=aux_states, grad_req=grad_req) + outputs = executor.forward(is_train=True) if isinstance(out_grads, (tuple, list)): outg = list() - for arr in out_grads: + for i, arr in enumerate(out_grads): + stype = outputs[i].stype if isinstance(arr, np.ndarray): - outg.append(mx.nd.array(arr, ctx=ctx, dtype=arr.dtype if dtype == "asnumpy" else dtype)) + # use a local name so the dtype parameter is not clobbered across iterations + out_dtype = arr.dtype if dtype == "asnumpy" else dtype + outg.append(mx.nd.array(arr, ctx=ctx, dtype=out_dtype).tostype(stype)) else: - outg.append(arr) + outg.append(arr.tostype(stype)) out_grads = outg elif isinstance(out_grads, dict): outg = dict() for k, v in out_grads.items(): if isinstance(v, np.ndarray): - outg[k] = mx.nd.array(v, ctx=ctx, dtype=v.dtype if dtype == "asnumpy" else dtype) + out_dtype = v.dtype if dtype == "asnumpy" else dtype + outg[k] = mx.nd.array(v, ctx=ctx, dtype=out_dtype) else: outg[k] = v out_grads = outg else: assert out_grads is None - executor.backward(out_grads) grads = {k: v.asnumpy() for k, v in args_grad_data.items()} @@ -1373,13 +1351,13 @@ def check_speed(sym, location=None, ctx=None, N=20, grad_req=None, typ="whole", if grad_req is None: grad_req = 'write' if location is None: - exe = sym.simple_bind(grad_req=grad_req, ctx=ctx, **kwargs) + exe = sym._simple_bind(grad_req=grad_req, ctx=ctx, **kwargs) location = {k: np.random.normal(size=arr.shape, scale=1.0) for k, arr in exe.arg_dict.items()} else: assert isinstance(location, dict), "Expect dict, get \"location\"=%s" %str(location) - exe = sym.simple_bind(grad_req=grad_req, ctx=ctx, - **{k: v.shape for k, v in location.items()}) + exe = sym._simple_bind(grad_req=grad_req, ctx=ctx, + **{k: v.shape for k, v in location.items()}) for name, iarr in location.items(): exe.arg_dict[name][:] = iarr.astype(exe.arg_dict[name].dtype) @@ -1503,7 +1481,7 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', for s, ctx in zip(sym, ctx_list): assert s.list_arguments() == arg_names assert s.list_outputs() == output_names - exe_list.append(s.simple_bind(grad_req=grad_req, **ctx)) + exe_list.append(s._simple_bind(grad_req=grad_req, **ctx)) arg_params = {} if arg_params is None else arg_params aux_params = {} if aux_params is None else aux_params @@ -1528,17 +1506,15 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', for arr in exe.grad_arrays: arr[:] = np.zeros(arr.shape, dtype=arr.dtype) + # test + for exe in exe_list: + exe.forward(is_train=False) + dtypes = [np.dtype(exe.outputs[0].dtype) for exe in exe_list] max_idx = np.argmax(dtypes) gt = ground_truth if gt is None: gt = exe_list[max_idx].output_dict.copy() - if grad_req != 'null': - gt.update(exe_list[max_idx].grad_dict) - - # test - for exe in exe_list: - exe.forward(is_train=False) for i, exe in enumerate(exe_list): if i == max_idx: @@ -1565,7 +1541,11 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', for exe in exe_list: exe.forward(is_train=True) exe.backward(exe.outputs) - + gt = ground_truth + if
gt is None: + gt = exe_list[max_idx].output_dict.copy() + if grad_req != 'null': + gt.update(exe_list[max_idx].grad_dict) for i, exe in enumerate(exe_list): if i == max_idx: continue @@ -1575,7 +1555,7 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', curr = zip(output_names + arg_names, exe.outputs + exe.grad_arrays) for name, arr in curr: if gt[name] is None: - assert arr is None + assert arr is None, name continue # Previous cast was to dtypes[i], but symbol may be mixed-precision, diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 2a47afd31788..ebb3134ae7f3 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -230,11 +230,27 @@ int MXFreeCachedOp(CachedOpHandle handle) { API_END(); } -int MXInvokeCachedOp(CachedOpHandle handle, - int num_inputs, - NDArrayHandle *inputs, - int *num_outputs, - NDArrayHandle **outputs) { +/*! + * \brief get optimized graph from the cached op + */ +int MXCachedOpGetOptimizedSymbol(CachedOpHandle handle, + SymbolHandle *out) { + auto s = new nnvm::Symbol(); + API_BEGIN(); + CachedOpPtr op = *static_cast(handle); + *s = op->GetOptimizedSymbol(); + *out = s; + API_END_HANDLE_ERROR(delete s); +} + +int MXInvokeCachedOpEx(CachedOpHandle handle, + int num_inputs, + NDArrayHandle *inputs, + int default_dev_type, + int default_dev_id, + int *num_outputs, + NDArrayHandle **outputs, + const int **out_stypes) { // outputs storage types MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get(); API_BEGIN(); @@ -261,8 +277,10 @@ int MXInvokeCachedOp(CachedOpHandle handle, ndoutputs.push_back(reinterpret_cast((*outputs)[i])); } } - - op->Forward(op_shared, ndinputs, ndoutputs); + // construct default context + Context ctx = Context::Create(static_cast(default_dev_type), + default_dev_id); + op->Forward(op_shared, ndinputs, ndoutputs, ctx); if (*outputs == nullptr) { ret->ret_handles.clear(); @@ -273,19 +291,6 @@ int MXInvokeCachedOp(CachedOpHandle handle, *outputs = dmlc::BeginPtr(ret->ret_handles); } - API_END(); -} - -int MXInvokeCachedOpEx(CachedOpHandle handle, - int num_inputs, - NDArrayHandle *inputs, - int *num_outputs, - NDArrayHandle **outputs, - const int **out_stypes) { // outputs storage types - MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get(); - int err = MXInvokeCachedOp(handle, num_inputs, inputs, num_outputs, outputs); - if (err != 0) return err; - API_BEGIN(); NDArray** out_array = reinterpret_cast(*outputs); ret->out_types.clear(); ret->out_types.reserve(*num_outputs); @@ -293,6 +298,7 @@ int MXInvokeCachedOpEx(CachedOpHandle handle, ret->out_types.emplace_back(out_array[i]->storage_type()); } *out_stypes = dmlc::BeginPtr(ret->out_types); + API_END(); } diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 14a232ab8c78..d5d969618f87 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -99,7 +99,7 @@ inline void GetAttrFromForwardNode(const uint32_t nid, rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; } else { CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) - << "Backward shape inconsistent with the forward shape"; + << "Backward shape/type inconsistent with the forward shape/type"; } } if (igrad_node == nullptr) { diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 8a7e950fe40b..3ad392280ea0 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -32,6 +32,13 @@ DMLC_REGISTER_PARAMETER(CachedOpConfig); constexpr uint32_t 
kEidNotExist = std::numeric_limits::max(); +nnvm::Symbol CachedOp::GetOptimizedSymbol() const { + Symbol ret; + ret.outputs = std::vector(full_graph_.outputs.begin(), + full_graph_.outputs.begin() + num_outputs()); + return ret.Copy(); +} + CachedOp::CachedOp( const nnvm::Symbol& sym, const std::vector >& flags) : sym_(sym), flags_(flags) { @@ -160,6 +167,7 @@ bool CachedOp::CheckDynamicShapeExists(const Context& default_ctx, } bool CachedOp::SetForwardGraph( + const Context& default_ctx, GraphInfo* info, const bool recording, const std::vector& inputs) { @@ -185,7 +193,7 @@ bool CachedOp::SetForwardGraph( match &= CheckAndInferShape(&g, std::move(shape_inputs), true, {0, 0}, {0, 0}, &contain_dynamic_shape); match &= CheckAndInferType(&g, std::move(dtype_inputs), true); - exec::DevMaskVector dev_mask(g.indexed_graph().num_nodes(), inputs[0]->ctx().dev_mask()); + exec::DevMaskVector dev_mask(g.indexed_graph().num_nodes(), default_ctx.dev_mask()); match &= CheckAndInferStorageType(&g, std::move(dev_mask), std::move(storage_type_inputs), true); @@ -624,7 +632,7 @@ OpStatePtr CachedOp::StaticForward( // and executors for multiple forward invokes of the same op. std::lock_guard lock(state.mutex); - bool match = SetForwardGraph(&state.info, recording, inputs); + bool match = SetForwardGraph(default_ctx, &state.info, recording, inputs); match = match && state.recording == recording; nnvm::Graph& g = state.info.fwd_graph; @@ -704,7 +712,7 @@ OpStatePtr CachedOp::DynamicForward( auto state_ptr = GetCachedOpState(default_ctx); auto& state = state_ptr.get_state(); std::lock_guard lock(state.mutex); - SetForwardGraph(&state.info, recording, inputs); + SetForwardGraph(default_ctx, &state.info, recording, inputs); runtime.info.fwd_graph = state.info.fwd_graph; } nnvm::Graph& g = runtime.info.fwd_graph; @@ -759,7 +767,8 @@ OpStatePtr CachedOp::DynamicForward( OpStatePtr CachedOp::Forward( const std::shared_ptr& op_ptr, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + const Context& default_ctx) { static const auto cached_op = nnvm::Op::Get("_CachedOp"); CHECK_EQ(inputs.size(), num_inputs()); @@ -780,7 +789,6 @@ OpStatePtr CachedOp::Forward( } } - Context default_ctx = inputs[0]->ctx(); { auto state_ptr = GetCachedOpState(default_ctx); auto& state = state_ptr.get_state(); @@ -1099,7 +1107,9 @@ void CachedOpForward(const OpStatePtr& state_ptr, orig_is_train = Imperative::Get()->set_is_training(true); else orig_is_train = Imperative::Get()->is_training(); - s.forward_state = s.op->Forward(nullptr, in_ptrs, out_ptrs); + CHECK(inputs.size() > 0) << "cached op forward requires at least 1 input"; + Context default_ctx = inputs[0].ctx(); + s.forward_state = s.op->Forward(nullptr, in_ptrs, out_ptrs, default_ctx); Imperative::Get()->set_is_training(orig_is_train); Imperative::Get()->set_is_recording(orig_is_record); // The arrays in out_ptrs may be changed by CachedOp. 
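Aside: a minimal sketch of how the default-context plumbing above surfaces in the Python frontend touched by this patch. It assumes the `CachedOp.__call__(*args, out=None, default_ctx=None)` signature and the `get_optimized_symbol` accessor added in this changeset; the zero-input case uses `mx.sym.zeros`, a symbol with no variable inputs, so the context can no longer be inferred from `inputs[0]` and must be passed explicitly (a single `None` stands in for the empty argument list):

    import mxnet as mx

    # With inputs, the default context is still taken from the first input.
    a = mx.sym.var('a')
    op = mx.nd.CachedOp(2 * a + 1)
    y = op(mx.nd.ones((2,)))

    # With no inputs, default_ctx is now required.
    op0 = mx.nd.CachedOp(mx.sym.zeros(shape=(2,)))
    z = op0(None, default_ctx=mx.cpu())

    # The optimized graph is exposed through the new accessor.
    print(op.get_optimized_symbol().list_outputs())
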
diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index 702a5734b51a..5153620ee693 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -30,6 +30,7 @@ #include "../operator/operator_common.h" #include "../operator/subgraph/common.h" #include "./imperative_utils.h" +#include "../nnvm/error.h" namespace mxnet { namespace { @@ -162,16 +163,22 @@ void CreateBackwardGraph(nnvm::Graph* fwd_graph, xs.emplace_back(indexed_graph[node_id].weak_ref.lock()); } - CHECK(!xs.empty()) - << "There are no inputs in computation graph that require gradients."; - - *grad_graph = pass::MXGradient( - *fwd_graph, fwd_graph->outputs, xs, *ograd_entries, - exec::AggregateGradient, nullptr, - zero_ops, "_copy"); + // There are inputs in computation graph that require gradients + if (!xs.empty()) { + try { + *grad_graph = pass::MXGradient( + *fwd_graph, fwd_graph->outputs, xs, *ograd_entries, + exec::AggregateGradient, nullptr, + zero_ops, "_copy"); + } catch (const nnvm::pass::InvalidGraphError &e) { + *grad_graph = nnvm::Graph(); + } + } else { + *grad_graph = nnvm::Graph(); + } } -/* \brief construct fwd_graph, grad_graph and full_graph from symbol */ +/* \brief construct fwd_graph, grad_graph and full_graph from symbol */ void CreateFullGraph(const nnvm::Symbol& sym, nnvm::Graph* fwd_graph, nnvm::Graph* grad_graph, @@ -189,15 +196,16 @@ void CreateFullGraph(const nnvm::Symbol& sym, CreateBackwardGraph(fwd_graph, grad_graph, ograd_entries, fwd_input_to_grad_output); - // Add backward graph outputs to full graph full_graph->outputs = fwd_graph->outputs; - for (const auto &i : grad_graph->outputs) full_graph->outputs.emplace_back(i); + // add backward graph outputs to full graph + for (const auto &i : grad_graph->outputs) { + full_graph->outputs.emplace_back(i); + } } /* \brief Set Ref counts for node entries for forward graph */ void SetForwardRefCounts(nnvm::Graph *fwd_graph) { const auto& idx = fwd_graph->indexed_graph(); - CHECK_GE(idx.input_nodes().size(), 1) << "CachedOp requires at least 1 input"; std::vector ref_count(idx.num_node_entries(), 0); for (const auto& i : idx.input_nodes()) ++ref_count[idx.entry_id(i, 0)]; @@ -371,6 +379,7 @@ class CachedOp { const nnvm::Symbol& sym, const std::vector >& flags); virtual ~CachedOp(); + nnvm::Symbol GetOptimizedSymbol() const; uint32_t num_inputs() const { return fwd_graph_.indexed_graph().input_nodes().size(); } @@ -399,7 +408,8 @@ class CachedOp { virtual OpStatePtr Forward( const std::shared_ptr& op_ptr, const std::vector& inputs, - const std::vector& outputs); + const std::vector& outputs, + const Context &default_context); virtual void Backward( const bool retain_graph, const OpStatePtr& state, @@ -496,6 +506,7 @@ class CachedOp { OpStatePtr GetCachedOpState(const Context& ctx); bool SetForwardGraph( + const Context& default_ctx, GraphInfo* info, const bool recording, const std::vector& inputs); diff --git a/src/imperative/cached_op_threadsafe.cc b/src/imperative/cached_op_threadsafe.cc index 659061ae8445..bf08d311c74f 100644 --- a/src/imperative/cached_op_threadsafe.cc +++ b/src/imperative/cached_op_threadsafe.cc @@ -101,7 +101,7 @@ OpStatePtr CachedOpThreadSafe::DynamicForward(const Context& default_ctx, // shape inference, storage type inference and if the graph // doesn't have dynamic shapes it also plans and allocates memory // for intermediate and final outputs in the graph - SetForwardGraph(&state.info, false, inputs); + SetForwardGraph(default_ctx, &state.info, false, inputs); runtime.info.fwd_graph = 
state.info.fwd_graph; } nnvm::Graph &g = runtime.info.fwd_graph; @@ -145,7 +145,8 @@ OpStatePtr CachedOpThreadSafe::DynamicForward(const Context& default_ctx, OpStatePtr CachedOpThreadSafe::Forward(const std::shared_ptr& op_ptr, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + const Context& default_ctx) { // Acquiring lock on the mutex in forward // Without this there are issues with static_forward, // specifically with static_shape=True and dynamic_forward. @@ -158,7 +159,6 @@ OpStatePtr CachedOpThreadSafe::Forward(const std::shared_ptr& op_ptr, // push of ops for different contexts std::lock_guard lock(mutex_); CHECK_EQ(inputs.size(), num_inputs()); - Context default_ctx = inputs[0]->ctx(); const auto& idx = fwd_graph_.indexed_graph(); for (size_t i = 0; i < inputs.size(); ++i) { CHECK_EQ(inputs[i]->ctx(), default_ctx) @@ -222,7 +222,9 @@ void CachedOpThreadSafeForward(const OpStatePtr& state_ptr, // Set is_recording correct for the imperative executor. CHECK(!ctx.need_grad) << "Only inference use case supported with thread safe cached op"; CHECK(!ctx.is_train) << "Only inference use case supported with thread safe cached op"; - s.forward_state = s.op->Forward(nullptr, in_ptrs, out_ptrs); + CHECK(inputs.size() > 0) << "thread safe cached op requires at least one input"; + Context default_ctx = inputs[0].ctx(); + s.forward_state = s.op->Forward(nullptr, in_ptrs, out_ptrs, default_ctx); // The arrays in out_ptrs may be changed by CachedOp. // If it is, we need to copy data back. for (size_t i = 0; i < out_bufs.size(); i++) diff --git a/src/imperative/cached_op_threadsafe.h b/src/imperative/cached_op_threadsafe.h index 81dcaa5152a6..63521c7219e7 100644 --- a/src/imperative/cached_op_threadsafe.h +++ b/src/imperative/cached_op_threadsafe.h @@ -97,7 +97,8 @@ class CachedOpThreadSafe : public CachedOp { OpStatePtr Forward( const std::shared_ptr& op_ptr, const std::vector& inputs, - const std::vector& outputs); + const std::vector& outputs, + const Context& default_ctx); std::vector ListForwardInputNames() const { nnvm::Symbol sym = GetForwardSym(); return sym.ListInputNames(nnvm::Symbol::kAll); diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index c12fcee1910e..0fb5a97ca385 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -219,7 +219,8 @@ void Imperative::RecordOp( << "will cause undefined behavior when evaluating gradients. " << "Please call backward first to clear the graph or do this out side of " << "a record section. Also note that you cannot use inplace operations " - << "like +=, *=, relu(x, out=x), y[idx]=x, etc inside a record section."; + << "like +=, *=, relu(x, out=x), y[idx]=x, etc inside a record section. 
" + << "Issue occurred while recording op: " << attrs.name; } bool need_grad = false; diff --git a/src/imperative/naive_cached_op.cc b/src/imperative/naive_cached_op.cc index 6138ce89cd26..84425e3f8068 100644 --- a/src/imperative/naive_cached_op.cc +++ b/src/imperative/naive_cached_op.cc @@ -30,11 +30,11 @@ namespace mxnet { OpStatePtr NaiveCachedOp::Forward( const std::shared_ptr& op_ptr, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + const Context& default_ctx) { CHECK_EQ(inputs.size(), num_inputs()); - Context default_ctx = inputs[0]->ctx(); { auto state_ptr = GetCachedOpState(default_ctx); auto& state = state_ptr.get_state(); @@ -60,7 +60,7 @@ OpStatePtr NaiveCachedOp::Forward( auto state_ptr = GetCachedOpState(default_ctx); auto& state = state_ptr.get_state(); std::lock_guard lock(state.mutex); - SetForwardGraph(&state.info, recording, inputs); + SetForwardGraph(default_ctx, &state.info, recording, inputs); runtime.info.fwd_graph = state.info.fwd_graph; } nnvm::Graph& g = runtime.info.fwd_graph; diff --git a/src/imperative/naive_cached_op.h b/src/imperative/naive_cached_op.h index 268c561c3415..f762f0bcc92e 100644 --- a/src/imperative/naive_cached_op.h +++ b/src/imperative/naive_cached_op.h @@ -45,7 +45,8 @@ class NaiveCachedOp : public CachedOp { OpStatePtr Forward( const std::shared_ptr& op_ptr, const std::vector& inputs, - const std::vector& outputs) override; + const std::vector& outputs, + const Context& default_ctx) override; void Backward( const bool retain_graph, const OpStatePtr& state, diff --git a/src/io/dataset.cc b/src/io/dataset.cc index 6ae174a010a1..4c47f440150f 100644 --- a/src/io/dataset.cc +++ b/src/io/dataset.cc @@ -628,7 +628,9 @@ class LazyTransformDataset final : public Dataset { for (size_t i = 0; i < inputs.size(); ++i) { inputs[i].WaitToRead(); } - cached_op_->Forward(cached_op_, ndinputs, ndoutputs); + CHECK(inputs.size() > 0) << "dataset getitem requires at least one input"; + Context default_ctx = inputs[0].ctx(); + cached_op_->Forward(cached_op_, ndinputs, ndoutputs, default_ctx); return true; } diff --git a/src/operator/contrib/ifft.cu b/src/nnvm/error.h similarity index 62% rename from src/operator/contrib/ifft.cu rename to src/nnvm/error.h index 7f8516250e51..863513964d91 100644 --- a/src/operator/contrib/ifft.cu +++ b/src/nnvm/error.h @@ -17,25 +17,26 @@ * under the License. */ -/*! 
- * Copyright (c) 2015 by Contributors - * \file Ifft-inl.h - * \brief - * \author Chen Zhu -*/ +#ifndef MXNET_NNVM_ERROR_H_ +#define MXNET_NNVM_ERROR_H_ -#include "./ifft-inl.h" -namespace mxnet { -namespace op { +#include <exception> +#include <string> -template<> -Operator* CreateOp(IFFTParam param, int dtype) { - Operator *op = nullptr; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new IFFTOp(param); - }) - return op; -} +namespace nnvm { +namespace pass { -} // namespace op -} // namespace mxnet +class InvalidGraphError : public std::exception { + public: + explicit InvalidGraphError(const std::string& msg = "invalid graph error"): msg_(msg) { } + ~InvalidGraphError() throw() {} + virtual const char* what() const throw() { + return msg_.c_str(); + } + private: + std::string msg_; +}; + +} // namespace pass +} // namespace nnvm +#endif // MXNET_NNVM_ERROR_H_ diff --git a/src/nnvm/gradient.cc b/src/nnvm/gradient.cc index c9dc67be74a1..a8a836ea71b8 100644 --- a/src/nnvm/gradient.cc +++ b/src/nnvm/gradient.cc @@ -37,9 +37,9 @@ #include #include +#include "error.h" #include "../executor/exec_pass.h" - namespace nnvm { namespace pass { @@ -636,8 +636,9 @@ Graph BuildGradientGraph( input_grads.emplace_back(p, 0, 0); } // for (i ∈ src_fwd_node->num_inputs()) } else { - LOG(FATAL) << "Operator " << src_fwd_node->op()->name << " is non-differentiable " - << "because it didn't register FGradient attribute."; + std::string message = "Operator " + std::string(src_fwd_node->op()->name) + + " is non-differentiable because it didn't register FGradient attribute."; + throw nnvm::pass::InvalidGraphError(message); } for (const auto& e : input_grads) { CHECK(e.node); diff --git a/src/operator/contrib/ifft-inl.h b/src/operator/contrib/ifft-inl.h deleted file mode 100644 index 7844f43f035d..000000000000 --- a/src/operator/contrib/ifft-inl.h +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*!
- * Copyright (c) 2015 by Contributors - * \file Ifft-inl.h - * \brief - * \author Chen Zhu -*/ -#ifndef MXNET_OPERATOR_CONTRIB_IFFT_INL_H_ -#define MXNET_OPERATOR_CONTRIB_IFFT_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mshadow_op.h" - -#if MXNET_USE_CUDA -#include -#endif - -namespace mxnet { -namespace op { -namespace ifft { - enum ifftOpInputs {kData}; // input should represent complex - enum ifftOpOutputs {kOut}; // output should be real - enum ifftOpResource {kTempSpace}; -} - -struct IFFTParam : public dmlc::Parameter { - int compute_size; // the maximum size of sub-batch to be forwarded through cufft in one time - DMLC_DECLARE_PARAMETER(IFFTParam){ - DMLC_DECLARE_FIELD(compute_size).set_default(128) - .describe("Maximum size of sub-batch to be forwarded at one time"); - } -}; - -#if MXNET_USE_CUDA -template -class IFFTOp : public Operator { - public: - explicit IFFTOp(IFFTParam p) { - this->param_ = p; - init_cufft_ = false; - dim_ = 0; - } - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - - if (!init_cufft_) { - n_iffts = in_data[ifft::kData].shape_.ProdShape(0, in_data[ifft::kData].ndim()-1); - // remember that input is complex - dim_ = in_data[ifft::kData].shape_[in_data[ifft::kData].ndim()-1]/2; - // stride_ in the number of complex numbers - stride_ = param_.compute_size*dim_; - - init_cufft_ = true; - - num_compute = n_iffts/param_.compute_size; - } - - Stream *s = ctx.get_stream(); - Tensor data = in_data[ifft::kData].get_with_shape( - Shape2(n_iffts, dim_*2), s); - Tensor out = out_data[ifft::kOut].get_with_shape( - Shape2(n_iffts, dim_), s); - // need temp space to store the intermediate complex matrices - Tensor workspace = - ctx.requested[ifft::kTempSpace].get_space_typed( - Shape1(param_.compute_size*dim_*2), s); - Tensor complex_data = Tensor(workspace.dptr_, - Shape2(param_.compute_size, dim_*2), s); - // start ifft - cufftHandle plan; - cufftPlanMany(&plan, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, CUFFT_C2C, param_.compute_size); - for (size_t idx=0; idx < num_compute; ++idx) { - cufftComplex* in_tmp = const_cast( - reinterpret_cast(data.dptr_ + 2*idx*stride_)); - cufftComplex* out_tmp = reinterpret_cast(complex_data.dptr_); - CHECK_EQ(cufftExecC2C(plan, in_tmp, out_tmp, CUFFT_INVERSE), CUFFT_SUCCESS); - - Assign(out.Slice(idx*param_.compute_size, (idx+1)*param_.compute_size), - req[ifft::kOut], complex_toreal(complex_data)); - } - cufftDestroy(plan); - // handle the remaining samples - size_t remain_num = n_iffts - param_.compute_size*num_compute; - if (remain_num > 0) { - cufftHandle plan_remain; - cufftPlanMany(&plan_remain, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, - CUFFT_C2C, remain_num); - - complex_data = Tensor(workspace.dptr_, - Shape2(remain_num, dim_*2), s); - - cufftComplex* in_tmp = const_cast( - reinterpret_cast(data.dptr_ + 2*num_compute*stride_)); - cufftComplex* out_tmp = reinterpret_cast(complex_data.dptr_); - CHECK_EQ(cufftExecC2C(plan_remain, in_tmp, out_tmp, CUFFT_INVERSE), CUFFT_SUCCESS); - Assign(out.Slice(param_.compute_size*num_compute, - param_.compute_size*num_compute+remain_num), - req[ifft::kOut], complex_toreal(complex_data)); - cufftDestroy(plan_remain); - } - // commenting this out to be consistant with caffe - // out /= 
dim_; - } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK(in_data.size() == 1 && in_grad.size() == 1); - CHECK_EQ(req.size(), 1); - - Stream *s = ctx.get_stream(); - - Tensor gdata = in_grad[ifft::kData].get_with_shape( - Shape2(n_iffts, dim_*2), s); - Tensor grad = out_grad[ifft::kOut].get_with_shape( - Shape2(n_iffts, dim_), s); - // need temp space to pad the data into complex numbers due to cufft interface - Tensor workspace = - ctx.requested[ifft::kTempSpace].get_space_typed( - Shape1(param_.compute_size*dim_*2), s); - Tensor complex_data = Tensor(workspace.dptr_, - Shape2(param_.compute_size, dim_*2), s); - // start fft - cufftHandle plan; - cufftPlanMany(&plan, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, CUFFT_C2C, param_.compute_size); - for (size_t idx = 0; idx < num_compute; ++idx) { - complex_data = complex_pad_imag(grad.Slice(idx*param_.compute_size, - idx*param_.compute_size+param_.compute_size)); - - cufftComplex* in_tmp = const_cast( - reinterpret_cast(complex_data.dptr_)); - cufftComplex* out_tmp = reinterpret_cast(gdata.dptr_ + 2*idx*stride_); - CHECK_EQ(cufftExecC2C(plan, in_tmp, out_tmp, CUFFT_FORWARD), CUFFT_SUCCESS); - } - cufftDestroy(plan); - - // handle the remaining samples - size_t remain_num = n_iffts - param_.compute_size*num_compute; - if (remain_num > 0) { - cufftHandle plan_remain; - cufftPlanMany(&plan_remain, 1, &dim_, nullptr, 0, 0, nullptr, 0, 0, - CUFFT_C2C, remain_num); - complex_data = Tensor(workspace.dptr_, - Shape2(remain_num, dim_*2), s); - complex_data = complex_pad_imag(grad.Slice( - num_compute*param_.compute_size, num_compute*param_.compute_size+remain_num)); - - cufftComplex* in_tmp = const_cast( - reinterpret_cast(complex_data.dptr_)); - cufftComplex* out_tmp = reinterpret_cast(gdata.dptr_ + 2*num_compute*stride_); - CHECK_EQ(cufftExecC2C(plan_remain, in_tmp, out_tmp, CUFFT_FORWARD), CUFFT_SUCCESS); - cufftDestroy(plan_remain); - } - // commenting this out to be consistant with caffe - // gdata /= dim_; - } - - private: - IFFTParam param_; - int dim_, stride_, n_iffts; - size_t num_compute; - bool init_cufft_; -}; // class IFFTOp - -#endif // MXNET_USE_CUDA - -// Declare Factory Function, used for dispatch specialization -template -Operator* CreateOp(IFFTParam param, int dtype); - -#if DMLC_USE_CXX11 -class IFFTProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - return {"data"}; - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape, - mxnet::ShapeVector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1) <<"Input:[data]"; - const mxnet::TShape &dshape = (*in_shape)[ifft::kData]; - // require data to be known - if (mxnet::op::shape_is_none(dshape)) return false; - - out_shape->clear(); - if (dshape.ndim() == 4) { - out_shape->push_back(Shape4(dshape[0], dshape[1], dshape[2], dshape[3]/2)); - } else if (dshape.ndim() == 2) { - out_shape->push_back(Shape2(dshape[0], dshape[1]/2)); - } else { - return false; - } - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) 
const override { - CHECK_GE(in_type->size(), 1); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (size_t i=0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - IFFTProp* ifft_sym = new IFFTProp(); - ifft_sym->param_ = this->param_; - return ifft_sym; - } - - std::string TypeString() const override { - return "_contrib_ifft"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[ifft::kOut], in_data[ifft::kData]}; - } - - std::vector ForwardResource( - const mxnet::ShapeVector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - std::vector BackwardResource( - const mxnet::ShapeVector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{in_data[ifft::kData], in_grad[ifft::kData]}}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return nullptr; - } - - Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape, - std::vector *in_type) const override; - - private: - IFFTParam param_; -}; -#endif -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_CONTRIB_IFFT_INL_H_ diff --git a/src/operator/contrib/ifft.cc b/src/operator/contrib/ifft.cc deleted file mode 100644 index f60220e6190f..000000000000 --- a/src/operator/contrib/ifft.cc +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2015 by Contributors - * \file Ifft-inl.h - * \brief - * \author Chen Zhu -*/ - -#include "./ifft-inl.h" -namespace mxnet { -namespace op { - -template<> -Operator *CreateOp(IFFTParam param, int dtype) { - LOG(FATAL) << "ifft is only available for GPU."; - return nullptr; -} - -Operator *IFFTProp::CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); -} - -DMLC_REGISTER_PARAMETER(IFFTParam); - -MXNET_REGISTER_OP_PROPERTY(_contrib_ifft, IFFTProp) -.describe(R"code(Apply 1D ifft to input" - -.. note:: `ifft` is only available on GPU. - -Currently accept 2 input data shapes: (N, d) or (N1, N2, N3, d). Data is in format: [real0, imag0, real1, imag1, ...]. -Last dimension must be an even number. 
-The output data has shape: (N, d/2) or (N1, N2, N3, d/2). It is only the real part of the result. - -Example:: - - data = np.random.normal(0,1,(3,4)) - out = mx.contrib.ndarray.ifft(data = mx.nd.array(data,ctx = mx.gpu(0))) - -)code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input data to the IFFTOp.") -.add_arguments(IFFTParam::__FIELDS__()); -} // namespace op -} // namespace mxnet diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc index 474241ce9795..ccf83a5c038f 100644 --- a/src/operator/control_flow.cc +++ b/src/operator/control_flow.cc @@ -588,7 +588,9 @@ static void WhileLoopComputeExCPU(const OpStatePtr& state_ptr, std::vector func_inputs, func_outputs(outputs.size()); extract_by_loc(inputs, params.func_input_locs, &func_inputs); for (size_t &step = state.n_iterations = 0; step < (size_t) params.max_iterations; ++step) { - state.cond_op->Forward(nullptr, cond_input_ptr, cond_output_ptr); + CHECK(inputs.size() > 0) << "while loop forward requires at least 1 input"; + Context default_ctx = inputs[0].ctx(); + state.cond_op->Forward(nullptr, cond_input_ptr, cond_output_ptr, default_ctx); if (!as_bool_scalar(*cond_output_ptr[0])) { break; } @@ -910,7 +912,9 @@ static void CondComputeExCPU(const OpStatePtr& state_ptr, to_ptr_vec(cond_outputs, &cond_output_ptr); int &branch_selection = state.branch_selection; // run cond - state.cond_op->Forward(nullptr, cond_input_ptr, cond_output_ptr); + CHECK(cond_input_ptr.size() > 0) << "condition requires at least 1 input"; + Context default_ctx = cond_inputs[0].ctx(); + state.cond_op->Forward(nullptr, cond_input_ptr, cond_output_ptr, default_ctx); branch_selection = as_bool_scalar(*cond_output_ptr[0]); // select the right branch const mxnet::Tuple &func_input_locs = branch_selection diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index e53d911614a0..619aaca08f94 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -217,8 +217,9 @@ void LoopState::Forward(int iter_no, inputs[i] = &in_bufs[i]; for (size_t i = 0; i < outputs.size(); i++) outputs[i] = &out_bufs[i]; - - OpStatePtr state = iter_op->Forward(nullptr, inputs, outputs); + CHECK(inputs.size() > 0) << "loop forward requires at least 1 input"; + Context default_ctx = cinputs[0].ctx(); + OpStatePtr state = iter_op->Forward(nullptr, inputs, outputs, default_ctx); // If an input and an output share the array, the output array will be changed // by CachedOp. We need to copy data to the real output. 
for (size_t i = 0; i < out_bufs.size(); i++) diff --git a/src/operator/tensor/elemwise_binary_op.cc b/src/operator/tensor/elemwise_binary_op.cc index 173d65fa0994..ea2466259494 100644 --- a/src/operator/tensor/elemwise_binary_op.cc +++ b/src/operator/tensor/elemwise_binary_op.cc @@ -86,22 +86,6 @@ bool ElemwiseBinaryOp::BackwardUseInStorageType(const nnvm::NodeAttrs& attrs, dispatch_mode, dispatch_ex); } } - if (!dispatched && ograd_stype == kDefaultStorage && - ((lhs_stype == kCSRStorage && rhs_stype == kDefaultStorage) || - (lhs_stype == kDefaultStorage && rhs_stype == kCSRStorage))) { - const bool reverse = (lhs_stype == kCSRStorage); - if (reverse && - type_assign(&lhs_grad_stype, kDefaultStorage) && - type_assign(&rhs_grad_stype, kCSRStorage)) { - DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); - dispatched = true; - } else if (!reverse && - type_assign(&lhs_grad_stype, kCSRStorage) && - type_assign(&rhs_grad_stype, kDefaultStorage)) { - DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); - dispatched = true; - } - } if (!dispatched) { dispatched = dispatch_fallback(out_attrs, dispatch_mode); } diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index c080570afab9..7094e1e7367c 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -201,33 +201,6 @@ class ElemwiseBinaryOp : public OpBase { } } - template - static inline void DnsCsrCsrOpBackward(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - const bool supported_ops = std::is_same::value && - std::is_same::value; - CHECK(supported_ops) - << "Only backward for mul is supported (LOP should be right, ROP should be left)"; - const NDArray& out_grad = inputs[0]; - const NDArray& lhs_in = inputs[1]; - const NDArray& rhs_in = inputs[2]; - const NDArray& lhs_grad = outputs[0]; - const NDArray& rhs_grad = outputs[1]; - const bool reverse = (outputs[0].storage_type() == kCSRStorage); - if (reverse) { - DnsCsrCsrOp(attrs, ctx, out_grad, rhs_in, req[0], lhs_grad, false); - Compute(attrs, ctx, {out_grad.data(), lhs_in.data()}, {req[1]}, - {rhs_grad.data()}); - } else { - DnsCsrCsrOp(attrs, ctx, out_grad, lhs_in, req[1], rhs_grad, false); - Compute(attrs, ctx, {out_grad.data(), rhs_in.data()}, {req[0]}, - {lhs_grad.data()}); - } - } - public: /*! 
\brief Binary op handling for lhr/rhs: RspDns, RspRsp, DnsRsp, or RspRsp->Dns result */ template @@ -827,9 +800,7 @@ template }); } - template< - typename xpu, typename LOP, typename ROP, - bool in0_ok_dense = false, bool in1_ok_dense = false, bool in2_ok_dense = false> + template static inline void BackwardUseInEx(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -845,14 +816,10 @@ template (lhs_grad_stype == kDefaultStorage || lhs_grad_stype == kRowSparseStorage) && (rhs_grad_stype == kDefaultStorage || rhs_grad_stype == kRowSparseStorage)) { // rsp, rsp, rsp -> [dns, rsp], [dns, rsp] - RspRspOpBackward( + RspRspOpBackward( attrs, ctx, inputs, req, outputs, BackwardUseIn); - } - if (((lhs_grad_stype == kDefaultStorage && rhs_grad_stype == kCSRStorage) || - (lhs_grad_stype == kCSRStorage && rhs_grad_stype == kDefaultStorage)) && - out_grad_stype == kDefaultStorage) { - // dns, csr, dns -> [csr, dns] / csr, dns, dns -> [dns, csr] - DnsCsrCsrOpBackward(attrs, ctx, inputs, req, outputs); + } else { + LOG(FATAL) << "Not Implemented"; } } }; // class ElemwiseBinaryOp diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index 4bfb2c84f551..469081682b2e 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -274,8 +274,6 @@ NNVM_REGISTER_OP(_backward_div) return std::vector >{{0, 1}}; }) .set_attr("FCompute", ElemwiseBinaryOp::BackwardUseIn< - cpu, mshadow_op::div_grad, mshadow_op::div_rgrad>) -.set_attr("FComputeEx", ElemwiseBinaryOp::BackwardUseInEx< cpu, mshadow_op::div_grad, mshadow_op::div_rgrad>); MXNET_OPERATOR_REGISTER_BINARY(_mod) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index 1d16c206a805..e256a02c52d3 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -541,7 +541,6 @@ GatherNDBackwardImpl(index_t N, index_t M, index_t K, } DMLC_REGISTER_PARAMETER(EmbeddingParam); -DMLC_REGISTER_PARAMETER(SparseEmbeddingParam); DMLC_REGISTER_PARAMETER(TakeParam); DMLC_REGISTER_PARAMETER(OneHotParam); DMLC_REGISTER_PARAMETER(ScatterNDParam); @@ -610,14 +609,12 @@ The storage type of weight can be either row_sparse or default. }) .set_attr("FInferShape", EmbeddingOpShape) .set_attr("FInferType", EmbeddingOpType) -.set_attr("FInferStorageType", EmbeddingOpForwardStorageType) .set_attr("FResourceRequest", [](const NodeAttrs& attrs) { return std::vector{ResourceRequest::kTempSpace}; }) .set_attr("THasDeterministicOutput", true) .set_attr("FCompute", EmbeddingOpForward) -.set_attr("FComputeEx", SparseEmbeddingOpForwardEx) .set_attr("FGradient", [](const nnvm::ObjectPtr& n, const std::vector& ograds) { return MakeNonlossGradNode("_backward_Embedding", n, ograds, @@ -627,82 +624,6 @@ The storage type of weight can be either row_sparse or default. .add_argument("weight", "NDArray-or-Symbol", "The embedding weight matrix.") .add_arguments(EmbeddingParam::__FIELDS__()); -NNVM_REGISTER_OP(_contrib_SparseEmbedding) -.describe(R"code(Maps integer indices to vector representations (embeddings). - -note:: ``contrib.SparseEmbedding`` is deprecated, use ``Embedding`` instead. - -This operator maps words to real-valued vectors in a high-dimensional space, -called word embeddings. These embeddings can capture semantic and syntactic properties of the words. 
-For example, it has been noted that in the learned embedding spaces, similar words tend -to be close to each other and dissimilar words far apart. - -For an input array of shape (d1, ..., dK), -the shape of an output array is (d1, ..., dK, output_dim). -All the input values should be integers in the range [0, input_dim). - -If the input_dim is ip0 and output_dim is op0, then shape of the embedding weight matrix must be -(ip0, op0). - -The storage type of the gradient will be `row_sparse`. - -.. Note:: - - `SparseEmbedding` is designed for the use case where `input_dim` is very large (e.g. 100k). - The operator is available on both CPU and GPU. - When `deterministic` is set to `True`, the accumulation of gradients follows a - deterministic order if a feature appears multiple times in the input. However, the - accumulation is usually slower when the order is enforced on GPU. - When the operator is used on the GPU, the recommended value for `deterministic` is `True`. - -Examples:: - - input_dim = 4 - output_dim = 5 - - // Each row in weight matrix y represents a word. So, y = (w0,w1,w2,w3) - y = [[ 0., 1., 2., 3., 4.], - [ 5., 6., 7., 8., 9.], - [ 10., 11., 12., 13., 14.], - [ 15., 16., 17., 18., 19.]] - - // Input array x represents n-grams(2-gram). So, x = [(w1,w3), (w0,w2)] - x = [[ 1., 3.], - [ 0., 2.]] - - // Mapped input x to its vector representation y. - SparseEmbedding(x, y, 4, 5) = [[[ 5., 6., 7., 8., 9.], - [ 15., 16., 17., 18., 19.]], - - [[ 0., 1., 2., 3., 4.], - [ 10., 11., 12., 13., 14.]]] - -)code" ADD_FILELINE) -.set_num_inputs(2) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"data", "weight"}; - }) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("THasDeterministicOutput", true) -.set_attr("FInferShape", EmbeddingOpShape) -.set_attr("FInferType", EmbeddingOpType) -.set_attr("FInferStorageType", SparseEmbeddingOpForwardStorageType) -.set_attr("FComputeEx", SparseEmbeddingOpForwardEx) -.set_attr("FGradient", - [](const nnvm::ObjectPtr& n, const std::vector& ograds) { - return MakeNonlossGradNode("_backward_SparseEmbedding", n, ograds, - {n->inputs[0]}, n->attrs.dict); - }) -.add_argument("data", "NDArray-or-Symbol", "The input array to the embedding operator.") -.add_argument("weight", "NDArray-or-Symbol", "The embedding weight matrix.") -.add_arguments(EmbeddingParam::__FIELDS__()); - NNVM_REGISTER_OP(_backward_Embedding) .set_num_inputs(2) .set_num_outputs(2) @@ -716,18 +637,6 @@ NNVM_REGISTER_OP(_backward_Embedding) .set_attr("FCompute", EmbeddingOpBackward) .set_attr("FComputeEx", EmbeddingOpBackwardEx); -NNVM_REGISTER_OP(_backward_SparseEmbedding) -.set_attr_parser(ParamParser) -.set_num_inputs(2) -.set_num_outputs(2) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("FInferStorageType", SparseEmbeddingOpBackwardStorageType) -.set_attr("TIsBackward", true) -.set_attr("FComputeEx", SparseEmbeddingOpBackwardEx); - NNVM_REGISTER_OP(take) .add_alias("_npi_take") .describe(R"code(Takes elements from an input array along the given axis. 
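The registration removed above already named its replacement: ``contrib.SparseEmbedding`` is deprecated in favor of plain ``Embedding``, which this patch leaves as the single supported lookup path. A migration sketch reusing the toy shapes from the removed docstring (variable names here are illustrative)::

    import mxnet as mx

    # Same toy setup as the deleted SparseEmbedding docstring: 4 words, 5-dim vectors.
    input_dim, output_dim = 4, 5
    x = mx.nd.array([[1., 3.], [0., 2.]])   # 2-gram word indices
    y = mx.nd.arange(input_dim * output_dim).reshape((input_dim, output_dim))

    # Previously: mx.nd.contrib.SparseEmbedding(data=x, weight=y, input_dim=4, output_dim=5)
    out = mx.nd.Embedding(data=x, weight=y, input_dim=input_dim, output_dim=output_dim)
    print(out.shape)   # (2, 2, 5)
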
diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 6904656b304b..e3c8a787d7ad 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -851,19 +851,12 @@ void EmbeddingOpBackward(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(Embedding) -.set_attr("FCompute", EmbeddingOpForward) -.set_attr("FComputeEx", SparseEmbeddingOpForwardEx); - -NNVM_REGISTER_OP(_contrib_SparseEmbedding) -.set_attr("FComputeEx", SparseEmbeddingOpForwardEx); +.set_attr("FCompute", EmbeddingOpForward); NNVM_REGISTER_OP(_backward_Embedding) .set_attr("FCompute", EmbeddingOpBackward) .set_attr("FComputeEx", EmbeddingOpBackwardEx); -NNVM_REGISTER_OP(_backward_SparseEmbedding) -.set_attr("FComputeEx", SparseEmbeddingOpBackwardEx); - NNVM_REGISTER_OP(take) .set_attr("FCompute", TakeOpForward); diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index cd85daa80df3..5454900968ec 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -65,29 +65,6 @@ enum QuantizedEmbeddingOpResource {kTempSpace}; } // namespace quantized_embedding -struct SparseEmbeddingParam: public dmlc::Parameter { - index_t input_dim; - index_t output_dim; - int dtype; - bool deterministic; - DMLC_DECLARE_PARAMETER(SparseEmbeddingParam) { - DMLC_DECLARE_FIELD(input_dim).set_lower_bound(1) - .describe("Vocabulary size of the input indices."); - DMLC_DECLARE_FIELD(output_dim).set_lower_bound(1) - .describe("Dimension of the embedding vectors."); - DMLC_DECLARE_FIELD(dtype).set_default(mshadow::kFloat32) - .add_enum("float32", mshadow::kFloat32) - .add_enum("float64", mshadow::kFloat64) - .add_enum("float16", mshadow::kFloat16) - .add_enum("uint8", mshadow::kUint8) - .add_enum("int32", mshadow::kInt32) - .describe("Data type of weight."); - DMLC_DECLARE_FIELD(deterministic).set_default(false) - .describe("Force the backward gradient calculation to be executed based on a deterministic" - " order at the cost of slower speed."); - } -}; - struct EmbeddingParam: public dmlc::Parameter { index_t input_dim; index_t output_dim; @@ -193,52 +170,6 @@ inline bool EmbeddingOpType(const nnvm::NodeAttrs& attrs, return true; } -// storage type inference function for Embedding -inline bool EmbeddingOpForwardStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 2U); - CHECK_EQ(out_attrs->size(), 1U); - const int& data_stype = in_attrs->at(embedding::kData); - const int& weight_stype = in_attrs->at(embedding::kWeight); - int& out_stype = out_attrs->at(embedding::kOut); - bool dispatched = false; - if (!dispatched && data_stype == kDefaultStorage && weight_stype == kDefaultStorage) { - // dns, dns -> dns - dispatched = storage_type_assign(&out_stype, kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute); - } - if (!dispatched && data_stype == kDefaultStorage && weight_stype == kRowSparseStorage) { - // dns, rsp -> dns - dispatched = storage_type_assign(&out_stype, kDefaultStorage, - dispatch_mode, DispatchMode::kFComputeEx); - } - return dispatched; -} - -// storage type inference function for SparseEmbedding -inline bool SparseEmbeddingOpForwardStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 2U); - CHECK_EQ(out_attrs->size(), 1U); - const int& data_stype = 
in_attrs->at(embedding::kData); - const int& weight_stype = in_attrs->at(embedding::kWeight); - int& out_stype = out_attrs->at(embedding::kOut); - bool dispatched = false; - if (!dispatched && data_stype == kDefaultStorage && - (weight_stype == kRowSparseStorage || weight_stype == kDefaultStorage)) { - // dns, rsp/dns -> dns - dispatched = storage_type_assign(&out_stype, kDefaultStorage, - dispatch_mode, DispatchMode::kFComputeEx); - } - return dispatched; -} - // storage type inference function for _backward_Embedding inline bool EmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, @@ -272,36 +203,6 @@ inline bool EmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs, return dispatched; } -// storage type inference function for _backward_SparseEmbedding -inline bool SparseEmbeddingOpBackwardStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector* in_attrs, - std::vector* out_attrs) { - CHECK_EQ(in_attrs->size(), 2U); - CHECK_EQ(out_attrs->size(), 2U); - const int ograd_stype = in_attrs->at(0); - const int data_stype = in_attrs->at(1); - int& data_grad_stype = out_attrs->at(0); - int& weight_grad_stype = out_attrs->at(1); - bool dispatched = false; - if (!dispatched && ograd_stype == kDefaultStorage && - data_stype == kDefaultStorage) { - // dns, dns -> dns, rsp - if (type_assign(&data_grad_stype, kDefaultStorage) && - type_assign(&weight_grad_stype, kRowSparseStorage) && - dispatch_mode_assign(dispatch_mode, DispatchMode::kFComputeEx)) { - dispatched = true; - } - } - const SparseEmbeddingParam& param = nnvm::get(attrs.parsed); - if (param.deterministic) { - common::LogOnce("_SparseEmbedding_backward with deterministic=True may reduce " - "speed significantly"); - } - return dispatched; -} - /*! \brief name the struct TakeNonzeroAxis for general take when * axis is not zero, use TakeZeroAxisGPU or TakeZeroAxisCPU for axis zero */ @@ -468,38 +369,6 @@ void EmbeddingOpForward(const nnvm::NodeAttrs& attrs, req[embedding::kOut], outputs[embedding::kOut]); } -template -void SparseEmbeddingOpForwardEx(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(req[embedding::kOut], kWriteTo); - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); - const NDArray& data = inputs[embedding::kData]; - const NDArray& weight = inputs[embedding::kWeight]; - const NDArray& out = outputs[embedding::kOut]; - CHECK_EQ(weight.shape().ndim(), 2U) - << "Embedding layer expects its weight to be two-dimensional. " - << weight.shape().ndim() << " dimensional input is given instead"; - const auto data_stype = data.storage_type(); - const auto weight_stype = weight.storage_type(); - const auto out_stype = out.storage_type(); - if (data_stype == kDefaultStorage && weight_stype == kRowSparseStorage && - out_stype == kDefaultStorage) { - // dns, rsp -> dns - SparseEmbeddingOpForwardRspImpl(ctx, data.data(), weight, req[0], out.data()); - } else if (data_stype == kDefaultStorage && weight_stype == kDefaultStorage && - out_stype == kDefaultStorage) { - // dns, dns -> dns - EmbeddingOpForwardDnsImpl(ctx.get_stream(), data.data(), weight.data(), - req[0], out.data()); - } else { - LogUnimplementedOp(attrs, ctx, inputs, req, outputs); - } -} - /*! 
\brief cast to type and clip to range [0, K - 1] */ struct tcast_clip { @@ -627,32 +496,6 @@ void EmbeddingOpBackwardEx(const nnvm::NodeAttrs& attrs, } } -template -void SparseEmbeddingOpBackwardEx(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 2U); - const NDArray& weight_grad = outputs[1]; - const NDArray& ograd = inputs[0]; - const NDArray& data = inputs[1]; - // check dtype - CHECK_EQ(weight_grad.dtype(), ograd.dtype()); - // check req - CHECK_EQ(req[embedding::kData], kNullOp) - << "SparseEmbedding layer doesn't support calculate data gradient"; - const SparseEmbeddingParam& param = nnvm::get(attrs.parsed); - if (data.storage_type() == kDefaultStorage && ograd.storage_type() == kDefaultStorage && - weight_grad.storage_type() == kRowSparseStorage) { - SparseEmbeddingOpBackwardRspImpl(param.deterministic, ctx, ograd.data(), data.data(), - req[embedding::kWeight], weight_grad); - } else { - LogUnimplementedOp(attrs, ctx, inputs, req, outputs); - } -} - namespace take_ { // to avoid name conflict enum TakeOpInputs {kArr, kIdx}; enum TakeOpOutputs {kOut}; diff --git a/tests/cpp/thread_safety/thread_safety_test.cc b/tests/cpp/thread_safety/thread_safety_test.cc index 49bd14d340fe..7ab9aaa3d8c0 100644 --- a/tests/cpp/thread_safety/thread_safety_test.cc +++ b/tests/cpp/thread_safety/thread_safety_test.cc @@ -110,7 +110,7 @@ void get_expected_results(const mxnet::cpp::Symbol &sym, int num_output = 0; const int *stypes; int ret4 = MXInvokeCachedOpEx(*hdl, (*arr_handles)[i].size(), (*arr_handles)[i].data(), - &num_output, &nd_ptrs[i], &stypes); + cpu::kDevMask, 0, &num_output, &nd_ptrs[i], &stypes); if (ret4 < 0) { LOG(FATAL) << MXGetLastError(); } @@ -158,8 +158,8 @@ inline void get_expected_results_multiple( int num_output = 0; const int *stypes; int ret4 = MXInvokeCachedOpEx(*hdl, (*arr_handles)[i][j].size(), - (*arr_handles)[i][j].data(), &num_output, - &nd_ptrs[i][j], &stypes); + (*arr_handles)[i][j].data(), cpu::kDevMask, 0, + &num_output, &nd_ptrs[i][j], &stypes); if (ret4 < 0) { LOG(FATAL) << MXGetLastError(); } @@ -309,7 +309,8 @@ void run_inference(const std::string& model, const int *stypes; int ret = MXInvokeCachedOpEx( hdl2, arr_handles2[i][num].size(), arr_handles2[i][num].data(), - &num_output, &(cached_op_handles[i * num_threads + num]), &stypes); + cpu::kDevMask, 0, &num_output, &(cached_op_handles[i * num_threads + num]), + &stypes); if (ret < 0) { LOG(FATAL) << MXGetLastError(); } @@ -492,7 +493,8 @@ void run_inference_unsupported(const std::string& model, const int *stypes; int ret = MXInvokeCachedOpEx( hdl2, arr_handles2[i][num].size(), arr_handles2[i][num].data(), - &num_output, &(cached_op_handles[i * num_threads + num]), &stypes); + cpu::kDevMask, 0, &num_output, &(cached_op_handles[i * num_threads + num]), + &stypes); if (ret < 0) { LOG(FATAL) << MXGetLastError(); } diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index 377a8709bfbc..3bd7ac6524ff 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -215,7 +215,7 @@ def npy_layer_norm(data, gamma, beta, axis=1, eps=1E-5): beta_s = mx.symbol.Variable('beta') out_s = mx.symbol.LayerNorm(data=data_s, gamma=gamma_s, beta=beta_s, axis=axis, eps=eps) - exe = out_s.simple_bind(ctx, data=in_shape) + exe = out_s._simple_bind(ctx, data=in_shape) exe.arg_dict['data'][:] = data exe.arg_dict['gamma'][:] = gamma 
exe.arg_dict['beta'][:] = beta @@ -231,7 +231,7 @@ def check_dropout(): shape = (LARGE_X, SMALL_Y) x = mx.sym.var('data') y = mx.sym.Dropout(x, p=1, cudnn_off=True) - exe = y.simple_bind(ctx=default_context(), data=shape) + exe = y._simple_bind(ctx=default_context(), data=shape) exe.arg_arrays[0][:] = 1 out = exe.forward(is_train=True) nd.waitall() @@ -382,7 +382,7 @@ def npy_instance_norm(data, gamma, beta, axis, eps=1E-5): beta_s = mx.symbol.Variable('beta') out_s = mx.symbol.InstanceNorm(data=data_s, gamma=gamma_s, beta=beta_s, eps=eps) - exe = out_s.simple_bind(ctx, data=in_shape) + exe = out_s._simple_bind(ctx, data=in_shape) exe.arg_dict['data'][:] = data exe.arg_dict['gamma'][:] = gamma exe.arg_dict['beta'][:] = beta diff --git a/tests/python/gpu/test_contrib_amp.py b/tests/python/gpu/test_contrib_amp.py index 0d47db3dfbbc..d7a6e80b8982 100644 --- a/tests/python/gpu/test_contrib_amp.py +++ b/tests/python/gpu/test_contrib_amp.py @@ -130,14 +130,14 @@ def test_fp16_casting(amp_tests): # data should be float32 res = mx.sym.Group([out1, out2]) final_res = amp.convert_symbol(res, data_names=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.gpu(), data=(1, 2)) + exe = final_res._simple_bind(ctx=mx.gpu(), data=(1, 2)) assert exe.arg_arrays[0].dtype == np.float32 # When two ops from data, both casted to float16, # data should be float16 res = mx.sym.Group([out1, out3]) final_res = amp.convert_symbol(res, data_names=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.gpu(), data=(1, 2)) + exe = final_res._simple_bind(ctx=mx.gpu(), data=(1, 2)) assert exe.arg_arrays[0].dtype == np.float16 # AMP Multicast test where one node is float32, another is float16 @@ -145,7 +145,7 @@ def test_fp16_casting(amp_tests): data2 = mx.sym.var("data2", dtype=np.float16) out4 = mx.sym.amp_multicast(data, data2, num_outputs=2) final_res = amp.convert_symbol(out4, cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.gpu(), data2=(1, 2), data=(1, 2)) + exe = final_res._simple_bind(ctx=mx.gpu(), data2=(1, 2), data=(1, 2)) assert exe.arg_arrays[0].dtype == np.float16 # AMP Multicast test where two non input nodes are float16, @@ -158,7 +158,7 @@ def test_fp16_casting(amp_tests): num_outputs=2) final_res = amp.convert_symbol(out5, target_dtype_ops=[], fp32_ops=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.gpu(), data=(1, 2), data2=(1, 2), data3=(1, 2)) + exe = final_res._simple_bind(ctx=mx.gpu(), data=(1, 2), data2=(1, 2), data3=(1, 2)) assert exe.arg_arrays[0].dtype == np.float32 # AMP Multicast test where three input nodes one fp16, one fp32 @@ -169,7 +169,7 @@ def test_fp16_casting(amp_tests): out6 = mx.sym.amp_multicast(data, data2, data3, num_outputs=3) final_res = amp.convert_symbol(out6, target_dtype_ops=[], fp32_ops=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.gpu(), data=(1, 2), data2=(1, 2), + exe = final_res._simple_bind(ctx=mx.gpu(), data=(1, 2), data2=(1, 2), data3=(1, 2)) assert exe.arg_arrays[2].dtype == np.float32 @@ -180,7 +180,7 @@ def test_fp16_casting(amp_tests): out7 = mx.sym.Group([mx.sym.amp_multicast(data, data2, num_outputs=2), mx.sym.amp_cast(data, dtype="float16")]) final_res = amp.convert_symbol(out7, target_dtype_ops=[], fp32_ops=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.gpu(), data=(1, 2), data2=(1, 2)) + exe = final_res._simple_bind(ctx=mx.gpu(), data=(1, 2), data2=(1, 2)) assert exe.arg_arrays[0].dtype == np.float16 # Input node to amp_multicast and amp_cast, if dtypes 
conflict @@ -190,7 +190,7 @@ def test_fp16_casting(amp_tests): out8 = mx.sym.Group([mx.sym.amp_multicast(data, data2, num_outputs=2), mx.sym.amp_cast(data, dtype="float16")]) final_res = amp.convert_symbol(out8, target_dtype_ops=[], fp32_ops=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.gpu(), data=(1, 2), data2=(1, 2)) + exe = final_res._simple_bind(ctx=mx.gpu(), data=(1, 2), data2=(1, 2)) assert exe.arg_arrays[0].dtype == np.float16 # Check for symbol which has slice channel diff --git a/tests/python/gpu/test_extensions_gpu.py b/tests/python/gpu/test_extensions_gpu.py index 18368e755b6c..1cc06cd8b2c5 100644 --- a/tests/python/gpu/test_extensions_gpu.py +++ b/tests/python/gpu/test_extensions_gpu.py @@ -62,8 +62,8 @@ def test_custom_op_gpu(): base = mx.sym.relu(d) in_grad = [mx.nd.empty((2,2), ctx=mx.gpu())] in_grad_base = [mx.nd.empty((2,2), ctx=mx.gpu())] - exe = e.bind(ctx=mx.gpu(), args={'c':b}, args_grad=in_grad) - exe_base = base.bind(ctx=mx.gpu(), args={'d':b}, args_grad=in_grad_base) + exe = e._bind(ctx=mx.gpu(), args={'c':b}, args_grad=in_grad) + exe_base = base._bind(ctx=mx.gpu(), args={'d':b}, args_grad=in_grad_base) out = exe.forward() out_base = exe_base.forward() assert_almost_equal(out_base[0].asnumpy(), out[0].asnumpy(), rtol=1e-3, atol=1e-3) diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index 57337dc9c5cf..51a47d591389 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -44,9 +44,9 @@ def check_fused_symbol(sym, **kwargs): for grad_req in ['write', 'add']: type_dict = {inp : dtype for inp in inputs} os.environ["MXNET_USE_FUSION"] = "0" - orig_exec = test_sym.simple_bind(ctx=ctx, grad_req=grad_req, type_dict=type_dict, **shapes) + orig_exec = test_sym._simple_bind(ctx=ctx, grad_req=grad_req, type_dict=type_dict, **shapes) os.environ["MXNET_USE_FUSION"] = "1" - fused_exec = test_sym.simple_bind(ctx=ctx, grad_req=grad_req, type_dict=type_dict, **shapes) + fused_exec = test_sym._simple_bind(ctx=ctx, grad_req=grad_req, type_dict=type_dict, **shapes) fwd_orig = orig_exec.forward(is_train=True, **data) out_grads = [mx.nd.ones_like(arr) for arr in fwd_orig] orig_exec.backward(out_grads=out_grads) @@ -311,29 +311,3 @@ def hybrid_forward(self, F, x): out = foo(mx.nd.ones((10,10), ctx=mx.gpu())) assert np.all(out.asnumpy() == np.ones((10,10))) assert out.shape == (10,10,1) - -@with_seed() -def test_fusion_reshape_executor(): - a = mx.sym.Variable("data1") - b = mx.sym.Variable("data2") - c = a + b + 1 - sym = mx.sym.relu(c) - orig_shape = (10,10) - e = sym.simple_bind(ctx=mx.gpu(), data1=orig_shape, data2=orig_shape) - data = mx.nd.zeros(orig_shape, ctx=mx.gpu()) - out = e.forward(is_train=False) - assert out[0].sum().asscalar() == 100 - changed_shape = (80, 2) - new_shape = {'data1': changed_shape, 'data2': changed_shape} - data = mx.nd.zeros(new_shape['data1'], ctx=mx.gpu()) - f = e.reshape(allow_up_sizing=True, **new_shape) - out = f.forward(is_train=False, data1=data, data2=data) - assert out[0].sum().asscalar() == 160 - # Reshape again - changed_shape = (30, 5) - new_shape = {'data1': changed_shape, 'data2': changed_shape} - data = mx.nd.zeros(new_shape['data1'], ctx=mx.gpu()) - f = e.reshape(allow_up_sizing=True, **new_shape) - out = f.forward(is_train=False, data1=data, data2=data) - assert out[0].sum().asscalar() == 150 - diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 934a10edc782..7bd6dfe98ef5 100644 --- 
a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -24,6 +24,8 @@ import numpy as np import pytest from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, assert_allclose +from mxnet.test_utils import check_symbolic_forward, check_symbolic_backward, discard_stderr +from mxnet.test_utils import default_context, rand_shape_2d, rand_ndarray, same from mxnet.base import MXNetError from mxnet import autograd @@ -31,6 +33,7 @@ sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import setup_module, with_seed, teardown_module, assert_raises_cudnn_not_satisfied, assert_raises_cuda_not_satisfied from common import run_in_spawned_process +from test_operator import check_sequence_reverse, allclose_function from test_operator import * from test_numpy_ndarray import * from test_numpy_op import * @@ -45,9 +48,9 @@ from test_gluon_gpu import _test_bulking from test_contrib_operator import test_multibox_target_op from test_contrib_optimizer import test_adamw +del test_custom_op_fork #noqa set_default_context(mx.gpu(0)) -del test_custom_op_fork #noqa def check_countsketch(in_dim,out_dim,n): data = mx.sym.Variable("data") @@ -94,87 +97,6 @@ def test_countsketch(): check_countsketch(in_dim, out_dim, n) -def check_ifft(shape): - shape_old = shape - if len(shape) == 2: - if shape[1]%2 != 0: - lst = list(shape) - lst[1] = lst[1]*2 - shape = tuple(lst) - shape_old = shape - shape = (shape[0],shape[1]*2) - if len(shape) == 4: - if shape[3]%2 != 0: - lst = list(shape) - lst[3] = lst[3]*2 - shape = tuple(lst) - shape_old = shape - shape = (shape[0],shape[1],shape[2],shape[3]*2) - sym = mx.sym.contrib.ifft(name='ifft', compute_size = 128) - init = [np.random.normal(size=shape, scale=1.0)] - arr_grad = [mx.nd.empty(shape)] - ctx_list = [{'ctx': mx.gpu(0),'ifft_data': shape, 'type_dict': {'ifft_data': np.float32}}] - exe_list = [sym.simple_bind(args_grad=arr_grad,**ctx) for ctx in ctx_list] - - for exe in exe_list: - for arr, iarr in zip(exe.arg_arrays, init): - arr[:] = iarr.astype(arr.dtype) - # forward - for exe in exe_list: - exe.forward(is_train= True) - out1 = [exe.outputs[0].asnumpy() for exe in exe_list] - - if len(shape) == 2: - init_complex = np.zeros(shape_old,dtype = np.complex64) - for i in range(0,shape_old[1]): - init_complex.real[:,i] = init[0][:,2*i] - init_complex.imag[:,i] = init[0][:,2*i+1] - a = np.fft.ifft(init_complex, n=None, axis=-1, norm=None) - assert_almost_equal(a.real, out1[0]/shape_old[1],rtol=1e-3, atol=1e-5) - - if len(shape) == 4: - init_complex = np.zeros(shape_old,dtype = np.complex64) - for i in range(0,shape_old[3]): - init_complex.real[:,:,:,i] = init[0][:,:,:,2*i] - init_complex.imag[:,:,:,i] = init[0][:,:,:,2*i+1] - a = np.fft.ifft(init_complex, n=None, axis=-1, norm=None) - assert_almost_equal(a.real, out1[0]/shape_old[3],rtol=1e-3, atol=1e-5) - # backward - if len(shape) == 2: - out_grad = mx.nd.empty(shape_old) - out_grad[:] = np.random.normal(-3, 3, shape_old) - for exe in exe_list: - exe.backward([out_grad]) - temp = exe.grad_arrays[0].asnumpy() - temp = np.zeros(shape_old) - for i in range(shape_old[1]): - temp[:,i] = exe.grad_arrays[0].asnumpy()[:,2*i] - - a = np.fft.fft(out_grad.asnumpy(), n=None, axis=-1, norm=None) - assert_almost_equal(a.real, temp, rtol=1e-3, atol=1e-5) - if len(shape) == 4: - out_grad = mx.nd.empty(shape_old) - out_grad[:] = np.random.normal(-3, 3, shape_old) - for exe in exe_list: - exe.backward([out_grad]) - temp = exe.grad_arrays[0].asnumpy() - temp = 
np.zeros(shape_old) - for i in range(shape_old[3]): - temp[:,:,:,i] = exe.grad_arrays[0].asnumpy()[:,:,:,2*i] - - a = np.fft.fft(out_grad.asnumpy(), n=None, axis=-1, norm=None) - assert_almost_equal(a.real, temp, rtol=1e-3, atol=1e-5) - -@with_seed() -def test_ifft(): - nrepeat = 2 - maxdim = 10 - for repeat in range(nrepeat): - for order in [2,4]: - shape = tuple(np.random.randint(1, maxdim, size=order)) - check_ifft(shape) - - def check_fft(shape): sym = mx.sym.contrib.fft(name='fft', compute_size = 128) if len(shape) == 2: @@ -192,7 +114,7 @@ def check_fft(shape): init = [np.random.normal(size=shape, scale=1.0)] arr_grad = [mx.nd.empty(shape)] ctx_list = [{'ctx': mx.gpu(0),'fft_data': shape, 'type_dict': {'fft_data': np.float32}}] - exe_list = [sym.simple_bind(args_grad=arr_grad,**ctx) for ctx in ctx_list] + exe_list = [sym._simple_bind(**ctx) for ctx in ctx_list] for exe in exe_list: for arr, iarr in zip(exe.arg_arrays, init): @@ -452,11 +374,6 @@ def test_preloaded_multi_sgd(): @with_seed() @pytest.mark.serial def test_batchnorm_with_type(): - ctx_list_v1_2D = [ - {'ctx': mx.cpu(0), 'norm_data': (10, 2, 10, 10), 'type_dict': {'norm_data': np.float32}}, - {'ctx': mx.gpu(0), 'norm_data': (10, 2, 10, 10), 'type_dict': {'norm_data': np.float32}}, - ] - ctx_list_v2_2D = [ {'ctx': mx.cpu(0), 'norm_data': (5, 2, 5, 5), 'type_dict': {'norm_data': np.float32}}, {'ctx': mx.cpu(0), 'norm_data': (5, 2, 5, 5), 'type_dict': {'norm_data': np.float16}}, @@ -550,8 +467,7 @@ def test_1d_batchnorm(fix_gamma, use_global_stats): def test_2d_batchnorm(fix_gamma, use_global_stats): data = (2, 3, 10, 10) - test_batchnorm_versions_helper(batchnorm_op_list=['batchnorm_v1_cpu', 'batchnorm_v1_gpu', - 'batchnorm_cpu', + test_batchnorm_versions_helper(batchnorm_op_list=['batchnorm_cpu', 'batchnorm_gpu', 'batchnorm_cudnn'], data=data, fix_gamma=fix_gamma, use_global_stats=use_global_stats) @@ -2114,7 +2030,7 @@ def kernel_error_check_symbolic(): a = mx.sym.Variable('a') b = mx.sym.Variable('b') c = a / b - f = c.bind(mx.gpu(0), { 'a':mx.nd.array([1,2,3],ctx=mx.gpu(0)), + f = c._bind(mx.gpu(0), { 'a':mx.nd.array([1,2,3],ctx=mx.gpu(0)), 'b':mx.nd.array([],ctx=mx.gpu(0))}) f.forward() g = f.outputs[0].asnumpy() @@ -2214,9 +2130,9 @@ def test_bilinear_sampler_versions(): for item in test_cases: data_shape, grid_shape = item # kWriteTo - exe_cpu = sym1.simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req='write') - exe_gpu = sym2.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='write') - exe_cudnn = sym3.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='write') + exe_cpu = sym1._simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req='write') + exe_gpu = sym2._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='write') + exe_cudnn = sym3._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='write') exe_list = [exe_cpu, exe_gpu, exe_cudnn] ref_idx = 0 test_data = np.random.uniform(low=-0.1, high=0.1,size=data_shape).astype(np.float32) @@ -2237,9 +2153,9 @@ def test_bilinear_sampler_versions(): grid_grad = exe_list[ref_idx].grad_dict['grid'].asnumpy() # kAddTo - exe_cpu_addto = sym1.simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req='add') - exe_gpu_addto = sym2.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='add') - exe_cudnn_addto = sym3.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='add') + 
exe_cpu_addto = sym1._simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req='add') + exe_gpu_addto = sym2._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='add') + exe_cudnn_addto = sym3._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='add') exe_list = [exe_cpu_addto, exe_gpu_addto, exe_cudnn_addto] data_initial_grad = np.random.normal(size=exe_list[ref_idx].grad_dict['data'].shape).astype(np.float32) grid_initial_grad = np.random.normal(size=exe_list[ref_idx].grad_dict['grid'].shape).astype(np.float32) @@ -2257,9 +2173,9 @@ def test_bilinear_sampler_versions(): for req_dict in [{'data' : 'null', 'grid' : 'write'}, {'data' : 'write', 'grid' : 'null'}]: # Mixture of kWriteTo and kNullOp - exe_cpu_mix = sym1.simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req=req_dict) - exe_gpu_mix = sym2.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req=req_dict) - exe_cudnn_mix = sym3.simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req=req_dict) + exe_cpu_mix = sym1._simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req=req_dict) + exe_gpu_mix = sym2._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req=req_dict) + exe_cudnn_mix = sym3._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req=req_dict) exe_list = [exe_cpu_mix, exe_gpu_mix, exe_cudnn_mix] for exe in exe_list: exe.arg_dict['data'][:] = test_data @@ -2287,7 +2203,7 @@ def _test_bulking_in_process(seed, time_per_iteration): x = mx.ndarray.zeros(data_shape) dx = mx.ndarray.zeros(data_shape) dy = mx.ndarray.ones(data_shape) - exe = sym.bind(ctx=ctx, args=[x], args_grad = {'X':dx}) + exe = sym._bind(ctx=ctx, args=[x], args_grad = {'X':dx}) # time a number of forward() and backward() executions after some warm-up iterations warmups = 1 @@ -2412,7 +2328,7 @@ def test_arange_like_dtype(): y = mx.sym.reshape(x, shape=(0, 0, -1)) z = mx.sym.contrib.arange_like(y, axis=-1) - mod = z.simple_bind(ctx=mx.gpu(0), x=(3, 4, 5, 6), grad_req='null') + mod = z._simple_bind(ctx=mx.gpu(0), x=(3, 4, 5, 6), grad_req='null') mod.arg_arrays[0][:] = np.random.normal(size=mod.arg_arrays[0].shape).astype(t) out = mod.forward(is_train=False) for v in out: diff --git a/tests/python/mkl/test_bf16_operator.py b/tests/python/mkl/test_bf16_operator.py index a67bfb548796..3455c8787608 100644 --- a/tests/python/mkl/test_bf16_operator.py +++ b/tests/python/mkl/test_bf16_operator.py @@ -70,7 +70,7 @@ def check_operator_accuracy(sym_fp32, sym_bf16, data_shape, num_input_data=1, bf arg_names = sym_fp32.list_arguments() aux_names = sym_fp32.list_auxiliary_states() - exe_fp32 = sym_fp32.simple_bind(ctx=mx.cpu(), data=data_shape) + exe_fp32 = sym_fp32._simple_bind(ctx=mx.cpu(), data=data_shape) arg_params_fp32 = {} aux_params_fp32 = {} @@ -91,7 +91,7 @@ def check_operator_accuracy(sym_fp32, sym_bf16, data_shape, num_input_data=1, bf output_fp32 = exe_fp32.forward()[0] - exe_bf16 = sym_bf16.simple_bind(ctx=mx.cpu(), data=data_shape, type_dict=type_dict) + exe_bf16 = sym_bf16._simple_bind(ctx=mx.cpu(), data=data_shape, type_dict=type_dict) arg_params_bf16 = {} aux_params_bf16 = {} diff --git a/tests/python/mkl/test_contrib_amp.py b/tests/python/mkl/test_contrib_amp.py index cd85ce6f0bb8..fed72e2e6340 100644 --- a/tests/python/mkl/test_contrib_amp.py +++ b/tests/python/mkl/test_contrib_amp.py @@ -105,14 +105,14 @@ def test_bf16_casting(): # data should be float32 res = 
mx.sym.Group([out1, out2]) final_res = amp.convert_symbol(res, data_names=[], target_dtype="bfloat16", cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.cpu(), data=(1, 2)) + exe = final_res._simple_bind(ctx=mx.cpu(), data=(1, 2)) assert exe.arg_arrays[0].dtype == np.float32 # When two ops from data, both casted to bfloat16, # data should be bfloat16 res = mx.sym.Group([out1, out3]) final_res = amp.convert_symbol(res, data_names=[], target_dtype="bfloat16", cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.cpu(), data=(1, 2)) + exe = final_res._simple_bind(ctx=mx.cpu(), data=(1, 2)) assert exe.arg_arrays[0].dtype == bfloat16 # AMP Multicast test where one node is float32, another is bfloat16 @@ -120,7 +120,7 @@ def test_bf16_casting(): data2 = mx.sym.var("data2", dtype=bfloat16) out4 = mx.sym.amp_multicast(data, data2, num_outputs=2) final_res = amp.convert_symbol(out4, target_dtype="bfloat16", cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.cpu(), data2=(1, 2), data=(1, 2)) + exe = final_res._simple_bind(ctx=mx.cpu(), data2=(1, 2), data=(1, 2)) assert exe.arg_arrays[0].dtype == bfloat16 # AMP Multicast test where two non input nodes are bfloat16, @@ -133,7 +133,7 @@ def test_bf16_casting(): num_outputs=2) final_res = amp.convert_symbol(out5, target_dtype_ops=[], target_dtype="bfloat16", fp32_ops=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.cpu(), data=(1, 2), data2=(1, 2), data3=(1, 2)) + exe = final_res._simple_bind(ctx=mx.cpu(), data=(1, 2), data2=(1, 2), data3=(1, 2)) assert exe.arg_arrays[0].dtype == np.float32 # AMP Multicast test where three input nodes one bf16, one fp32 @@ -144,7 +144,7 @@ def test_bf16_casting(): out6 = mx.sym.amp_multicast(data, data2, data3, num_outputs=3) final_res = amp.convert_symbol(out6, target_dtype_ops=[], target_dtype="bfloat16", fp32_ops=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.cpu(), data=(1, 2), data2=(1, 2), + exe = final_res._simple_bind(ctx=mx.cpu(), data=(1, 2), data2=(1, 2), data3=(1, 2)) assert exe.arg_arrays[2].dtype == np.float32 @@ -155,7 +155,7 @@ def test_bf16_casting(): out7 = mx.sym.Group([mx.sym.amp_multicast(data, data2, num_outputs=2), mx.sym.amp_cast(data, dtype=bfloat16)]) final_res = amp.convert_symbol(out7, target_dtype_ops=[], target_dtype="bfloat16", fp32_ops=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.cpu(), data=(1, 2), data2=(1, 2)) + exe = final_res._simple_bind(ctx=mx.cpu(), data=(1, 2), data2=(1, 2)) assert exe.arg_arrays[0].dtype == bfloat16 # Input node to amp_multicast and amp_cast, if dtypes conflict @@ -165,6 +165,5 @@ def test_bf16_casting(): out8 = mx.sym.Group([mx.sym.amp_multicast(data, data2, num_outputs=2), mx.sym.amp_cast(data, dtype=bfloat16)]) final_res = amp.convert_symbol(out8, target_dtype_ops=[], target_dtype="bfloat16", fp32_ops=[], cast_optional_params=True) - exe = final_res.simple_bind(ctx=mx.cpu(), data=(1, 2), data2=(1, 2)) + exe = final_res._simple_bind(ctx=mx.cpu(), data=(1, 2), data2=(1, 2)) assert exe.arg_arrays[0].dtype == bfloat16 - diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index e9f9e0171834..d8489240d552 100644 --- a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/mkl/test_mkldnn.py @@ -75,7 +75,7 @@ def test_reshape_after_conv(dst_shape): data = mx.symbol.Variable('data') conv = mx.symbol.Convolution(data=data, num_filter=16, kernel=(1, 1), pad=(0, 0), stride=(1, 1)) res = mx.symbol.reshape(data=conv, shape=dst_shape) - exe = 
res.simple_bind(mx.cpu(), data=shape, grad_req='null') + exe = res._simple_bind(mx.cpu(), data=shape, grad_req='null') val1 = np.random.uniform(-1, 1, shape) val2 = np.random.uniform(-1, 1, (16, 1, 1, 1)) @@ -86,7 +86,7 @@ def test_reshape_after_conv(dst_shape): exe.arg_arrays[2][:] = val3 outputs = exe.forward(is_train=False)[0].asnumpy() - conv_exe = conv.simple_bind(mx.cpu(), data=shape, grad_req='null') + conv_exe = conv._simple_bind(mx.cpu(), data=shape, grad_req='null') conv_exe.arg_arrays[0][:] = val1 conv_exe.arg_arrays[1][:] = val2 conv_exe.arg_arrays[2][:] = val3 @@ -210,7 +210,7 @@ def test_flatten_slice_after_conv(): shape = (2, 16, 16, 16) val = np.random.rand(2, 16, 16, 16).astype(np.float32) - exe = slice1.simple_bind(Context.default_ctx, data=shape) + exe = slice1._simple_bind(Context.default_ctx, data=shape) exe.arg_arrays[0][:] = val exe.arg_arrays[1][:] = np.random.normal(size=exe.arg_arrays[1].shape) exe.arg_arrays[2][:] = np.random.normal(size=exe.arg_arrays[2].shape) @@ -220,16 +220,15 @@ def test_flatten_slice_after_conv(): def test_mkldnn_sum_inplace_with_cpu_layout(): - x_shape = (32, 3, 224, 224) - x_npy = np.ones(x_shape) + x_npy = np.ones(x_shape, dtype='float32') y_shape = (32, 32, 222, 222) - y_npy = np.ones(y_shape) + y_npy = np.ones(y_shape, dtype='float32') x = mx.sym.Variable("x") y = mx.sym.Variable("y") z = mx.symbol.Convolution(data=x, num_filter=32, kernel=(3, 3)) z = mx.sym.add_n(z, y) - exe = z.simple_bind(ctx=mx.cpu(), x=x_shape, y=y_shape) + exe = z._simple_bind(ctx=mx.cpu(), x=x_shape, y=y_shape) out = exe.forward(is_train=False, x=x_npy, y=y_npy)[0] assert_almost_equal(out[0].asnumpy()[0, 0, 0], 1.0) @@ -268,7 +267,7 @@ def check_batchnorm_relu_fusion(shape): grad_out = mx.nd.random.uniform(0, 1, shape) bn = mx.sym.BatchNorm(data=x, fix_gamma=False) relu = mx.sym.Activation(data=bn, act_type='relu', name='relu') - exe = relu.simple_bind(ctx=mx.cpu(), x=shape, grad_req='write') + exe = relu._simple_bind(ctx=mx.cpu(), x=shape, grad_req='write') exe.arg_arrays[0][:] = in_data exe.forward(is_train=True) exe.backward(grad_out) @@ -276,7 +275,7 @@ def check_batchnorm_relu_fusion(shape): no_fuse_grads = exe.grad_arrays bnrelu = mx.sym.contrib.BatchNormWithReLU(data=x, fix_gamma=False) - exe_fuse = bnrelu.simple_bind(ctx=mx.cpu(), x=shape, grad_req='write') + exe_fuse = bnrelu._simple_bind(ctx=mx.cpu(), x=shape, grad_req='write') exe_fuse.arg_arrays[0][:] = in_data exe_fuse.forward(is_train=True) exe_fuse.backward(grad_out) @@ -473,7 +472,7 @@ def test_softmax_with_large_inputs(): def softmax_forward(input_data, true_output): data = mx.sym.Variable('data') out1 = data.softmax(axis=1) - exec1 = out1.bind(mx.cpu(), args={'data': input_data}) + exec1 = out1._bind(mx.cpu(), args={'data': input_data}) exec1.forward()[0].wait_to_read() ndarr = exec1.outputs[0][0][0][0] nparr = ndarr.asnumpy() @@ -525,7 +524,7 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux): data = mx.symbol.Variable('data') conv = mx.sym.Convolution(data=data, kernel=(5, 5), pad=(1, 1), stride=(1,1), num_filter=8, name="conv", no_bias=True) custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom') - exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])}) + exec1 = custom._bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])}) exec1.forward()[0].wait_to_read() @with_seed() @@ -595,7 +594,7 @@ def ref_concat(a, b, axis): z = mx.sym.concat(a_sym, b_sym, dim=axis) a = 
np.random.uniform(-1, 1, a_shape) b = np.random.uniform(-1, 1, b_shape) - exe = z.simple_bind(ctx=mx.cpu(), a=a_shape, b=b_shape) + exe = z._simple_bind(ctx=mx.cpu(), a=a_shape, b=b_shape) out = exe.forward(is_train=False, a=a, b=b) ref_out = ref_concat(a, b, axis=axis) out = out[0].asnumpy() @@ -628,7 +627,7 @@ def ref_add(a, b): z = mx.sym.elemwise_add(a_sym, b_sym) a = np.random.uniform(-1, 1, a_shape) b = np.random.uniform(-1, 1, b_shape) - exe = z.simple_bind(ctx=mx.cpu(), a=a_shape, b=b_shape) + exe = z._simple_bind(ctx=mx.cpu(), a=a_shape, b=b_shape) out = exe.forward(is_train=False, a=a, b=b) ref_out = ref_add(a, b) out = out[0].asnumpy() diff --git a/tests/python/mkl/test_quantization_mkldnn.py b/tests/python/mkl/test_quantization_mkldnn.py deleted file mode 100644 index f2432175720a..000000000000 --- a/tests/python/mkl/test_quantization_mkldnn.py +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import os -import sys -import mxnet as mx - -os.environ['ENABLE_MKLDNN_QUANTIZATION_TEST'] = '1' -os.environ['MXNET_SUBGRAPH_BACKEND'] = 'NONE' -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.insert(0, os.path.join(curr_path, '../quantization')) -from test_quantization import * - -if __name__ == '__main__': - import pytest - pytest.main() - del os.environ['ENABLE_MKLDNN_QUANTIZATION_TEST'] - del os.environ['MXNET_SUBGRAPH_BACKEND'] diff --git a/tests/python/mkl/test_subgraph.py b/tests/python/mkl/test_subgraph.py index e7efbca4e1b1..ca68d6069390 100644 --- a/tests/python/mkl/test_subgraph.py +++ b/tests/python/mkl/test_subgraph.py @@ -21,226 +21,20 @@ import numpy as np import unittest import ctypes -from mxnet.symbol import Symbol -from importlib import import_module -from numpy.testing import assert_allclose -from mxnet.base import SymbolHandle, check_call, _LIB, mx_uint, c_str -from mxnet.test_utils import DummyIter -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.append(os.path.join(curr_path, '../unittest/')) from common import with_seed -from mxnet.test_utils import assert_almost_equal, assert_almost_equal_with_err -import itertools import pytest -import tempfile - -OP_NAME='op_name' -QUANTIZED_OP_NAME='quantized_op_name' -SG_PASS_NAME='MKLDNN' -QUANTIZE_SG_PASS_NAME='MKLDNN_QUANTIZE' -config = { - 'conv': { - OP_NAME: 'sg_mkldnn_conv', - QUANTIZED_OP_NAME: 'quantized_sg_mkldnn_conv' - }, - 'fc': { - OP_NAME: 'sg_mkldnn_fully_connected', - QUANTIZED_OP_NAME: 'quantized_sg_mkldnn_fully_connected' - } -} - -DATA_SHAPE=[(64, 4, 10, 10), (4, 3, 24, 24), (1, 16, 32, 32)] -fc_post_ops_list=['relu', 'sigmoid', 'tanh', 'softrelu', - 'square', 'square_root', 'abs', 'exp', 'bounded_relu'] - -def check_qsym_calibrated(qsym, out_type, name='conv'): - 
quantized_op_name = 'quantized_' + name - assert ''.join(qsym.attr_dict().keys()).find(quantized_op_name) != -1 - for k, v in qsym.attr_dict().items(): - if k.find('_quantize') != -1: - assert v['out_type'] == out_type - if k.find(quantized_op_name) != -1: - if quantized_op_name.startswith("quantized_sg_mkldnn_fully_connected") and 'enable_float_output' in v: - continue - assert 'min_calib_range' in v - assert 'max_calib_range' in v - -def check_qsym_scale_align(qsym): - assert ''.join(qsym.attr_dict().keys()).find('quantized_sg_mkldnn_conv') != -1 - init = False - for k, v in qsym.attr_dict().items(): - if k.find('quantized_sg_mkldnn_conv') != -1: - assert 'min_calib_range' in v - assert 'max_calib_range' in v - if not init: - min_calib_range = v['min_calib_range'] - max_calib_range = v['max_calib_range'] - init = True - else: - assert min_calib_range == v['min_calib_range'] - assert max_calib_range == v['max_calib_range'] - - -def check_qsym_gluon_forward(path, qsym, qarg_params, qaux_params, data_shape): - # save qsym to JSON file - _, json_path = tempfile.mkstemp(suffix='-symbol.json', dir=path) - params_path = json_path.replace('-symbol.json', '-0000.params') - qsym.save(json_path) - # save params - save_dict = {('arg:%s' % k): v.as_in_context(mx.current_context()) for k, v in qarg_params.items()} - save_dict.update({('aux:%s' % k): v.as_in_context(mx.current_context()) for k, v in qaux_params.items()}) - mx.nd.save(params_path, save_dict) - # load back with SymbolBlock - net = mx.gluon.SymbolBlock.imports(json_path, ['data'], params_path) - net.reset_ctx(ctx = mx.current_context()) - net.hybridize() - - data = mx.random.uniform(-1.0, 1.0, shape=data_shape) - net(data) - -class CalibIter(mx.io.DataIter): - def __init__(self, batch, data_shape, batch_size): - super(CalibIter, self).__init__(batch_size) - self.data_shape = data_shape - self.label_shape = (batch_size,) - self.provide_data = [('data', self.data_shape)] - self.provide_label = [] - self.batch = batch - - def __iter__(self): - yield self.batch - - -def check_neg_fusion(syms, attrs_name=None, excluded_attrs=None, - date_shape=(4,4,10,10), name='conv'): - op_name = config[name][OP_NAME] - - for sym, attrs, excluded_attr in zip(syms, attrs_name, excluded_attrs): - sym_sg = sym.get_backend_symbol(SG_PASS_NAME) - exe_sg = sym_sg.simple_bind(mx.cpu(), data=date_shape, grad_req='null') - - attrs_dict = sym_sg.attr_dict() - for k, v in attrs_dict.items(): - if k.find(op_name) != -1: - for attr in attrs: - assert v[attr] == 'true' - for exc_attr in excluded_attr: - assert exc_attr not in v.keys() - -def head_symbol(data_shape): - data = mx.symbol.Variable('data', shape=data_shape, dtype='float32') - weight = mx.symbol.Variable('weight', dtype='float32') - return data, weight - - -# conv + bn fusion case -def conv_bn(no_bias, data_shape): - attr = {'conv': {'with_bn': 'true'}} - data, weight = head_symbol(data_shape) - conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=no_bias) - bn1 = mx.symbol.BatchNorm(data=conv, name="bn1") - return bn1, attr - -# conv + act fusion case -def conv_act(no_bias, data_shape, alg): - attr = {'conv': {'with_act': 'true'}} - data, weight = head_symbol(data_shape) - conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=no_bias) - if alg == "relu6": - relu = mx.symbol.clip(data=conv, name='relu6', a_min=0, a_max=6) - elif alg == "leakyrelu": - relu = 
mx.symbol.LeakyReLU(data=conv, slope=0.25, act_type='leaky') - elif alg == "gelu": - relu = mx.symbol.LeakyReLU(data=conv, act_type='gelu') - else: - relu = mx.symbol.Activation(data=conv, name=alg, act_type=alg) - return relu, attr - -# conv + act + sum fusion case -def conv_act_sum(no_bias, data_shape, alg): - attr = {'conv': {'with_act': 'true', 'with_sum': 'true'}} - data, weight = head_symbol(data_shape) - conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=no_bias) - if alg == "relu6": - relu = mx.symbol.clip(data=conv, name='relu6', a_min=0, a_max=6) - elif alg == "leakyrelu": - relu = mx.symbol.LeakyReLU(data=conv, slope=0.25, act_type='leaky') - elif alg == "gelu": - relu = mx.symbol.LeakyReLU(data=conv, act_type='gelu') - else: - relu = mx.symbol.Activation(data=conv, name=alg, act_type=alg) - conv1 = mx.symbol.Convolution(data=data, weight=weight, name='conv1', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=no_bias) - sum = relu + conv1 - return sum, attr - - -# mobilenetv2 case -def mobilenetv2_struct(data_shape): - attr = {'sg_mkldnn_conv_bn_0' : {'with_bn': 'true'}} - data = mx.symbol.Variable('data', shape=data_shape, dtype='float32') - weight1 = mx.symbol.Variable('conv1_weight', dtype='float32') - weight2 = mx.symbol.Variable('conv2_weight', dtype='float32') - conv1 = mx.symbol.Convolution(data=data, weight=weight1, name='conv1', num_filter=64, - kernel=(1, 1), stride=(1, 1), no_bias=True) - bn1 = mx.symbol.BatchNorm(data=conv1, name="bn1") - conv2 = mx.symbol.Convolution(data=bn1, weight=weight2, name='conv2', num_filter=64, - kernel=(1, 1), stride=(1, 1), no_bias=True) - bn2 = mx.symbol.BatchNorm(data=conv2, name="bn2") - sum = bn1 + bn2 - return sum, attr - -def single_fc(no_bias, data_shape, flatten=True): - attr = {'fc': {}} - data, weight = head_symbol(data_shape) - fc = mx.symbol.FullyConnected(name='fc', data=data, weight=weight, num_hidden=64, - no_bias=no_bias, flatten=flatten) - return fc, attr - -# fc + eltwise fusion case -def fc_eltwise(no_bias, data_shape, flatten=True, alg='relu'): - assert alg in fc_post_ops_list - - attr = {'fc': {'with_eltwise': 'true'}} - data, weight = head_symbol(data_shape) - fc = mx.symbol.FullyConnected(name='fc', data=data, weight=weight, num_hidden=64, - no_bias=no_bias, flatten=flatten) - if alg in ['relu', 'sigmoid', 'tanh', 'softrelu']: - sym = mx.symbol.Activation(data=fc, name='act', act_type=alg) - elif alg == 'square': - sym = mx.symbol.square(data=fc, name='square') - elif alg == 'square_root': - sym = mx.symbol.sqrt(data=fc, name='sqrt') - elif alg == 'abs': - sym = mx.symbol.abs(data=fc, name='abs') - elif alg == 'exp': - sym = mx.symbol.exp(data=fc, name='exp') - else: - sym = mx.symbol.clip(data=fc, name='bounded_relu', a_min=0, a_max=1.0) - - return sym, attr - def test_float64_fallback(): sym = mx.sym.FullyConnected( mx.sym.Variable('in'), mx.sym.Variable('w'), mx.sym.Variable('b'), - num_hidden=2 - ) + num_hidden=2) dtype = 'float64' - ex = sym.bind(mx.cpu(), - { - 'in': mx.nd.array([[2, 3, 4]], dtype=dtype), + args = {'in': mx.nd.array([[2, 3, 4]], dtype=dtype), 'w': mx.nd.array([[1, 2, 3], [4, 5, 6]], dtype=dtype), - 'b': mx.nd.array([7, 8], dtype=dtype) - }, - args_grad=None, - grad_req='write' - ) + 'b': mx.nd.array([7, 8], dtype=dtype)} + ex = sym._bind(mx.cpu(), args, args_grad=None, grad_req='write') ex.forward() ex.outputs[0].wait_to_read() diff --git a/tests/python/quantization/common.py b/tests/python/quantization/common.py 
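Note on the pattern above: the executor-creation calls Symbol.simple_bind and Symbol.bind are renamed to the internal _simple_bind and _bind throughout these test hunks, with signatures otherwise unchanged. A minimal sketch of the before/after, assuming an MXNet 2.x development build where the executor API was made internal (the graph y below is a stand-in for illustration, not taken from this patch):

import mxnet as mx

x = mx.sym.Variable('x')
y = mx.sym.relu(x)

# Before (public 1.x API):
#   exe = y.simple_bind(ctx=mx.cpu(), x=(2, 3), grad_req='null')
# After (underscore-prefixed, same keyword-shape signature):
exe = y._simple_bind(ctx=mx.cpu(), x=(2, 3), grad_req='null')
exe.arg_dict['x'][:] = mx.nd.ones((2, 3))   # fill inputs in place
out = exe.forward(is_train=False)[0]        # outputs list, as before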
deleted file mode 120000 index dccb90b10675..000000000000 --- a/tests/python/quantization/common.py +++ /dev/null @@ -1 +0,0 @@ -../unittest/common.py \ No newline at end of file diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py deleted file mode 100644 index 7e01734e61a8..000000000000 --- a/tests/python/quantization/test_quantization.py +++ /dev/null @@ -1,800 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Some of the tests using CUDNN require a special GPU instruction called dp4a. -Ref: http://images.nvidia.com/content/pdf/tesla/184457-Tesla-P4-Datasheet-NV-Final-Letter-Web.pdf -""" -import os -import mxnet as mx -import numpy as np -from mxnet.gluon.model_zoo import vision -from mxnet.test_utils import assert_almost_equal, assert_exception, rand_ndarray, rand_shape_nd, same, DummyIter -from common import with_seed, xfail_when_nonstandard_decimal_separator -from mxnet.io import NDArrayIter -import unittest -import operator - -def is_test_for_gpu(): - return mx.current_context().device_type == 'gpu' - -def is_test_for_mkldnn(): - return (mx.current_context().device_type == 'cpu' - and os.environ.get('ENABLE_MKLDNN_QUANTIZATION_TEST') == '1') - -def is_test_for_native_cpu(): - return (mx.current_context().device_type == 'cpu' - and os.environ.get('ENABLE_MKLDNN_QUANTIZATION_TEST') == None) - -@with_seed() -def test_quantize_float32_to_int8(): - shape = rand_shape_nd(4) - data = rand_ndarray(shape, 'default', dtype='float32') - min_range = mx.nd.min(data) - max_range = mx.nd.max(data) - qdata, min_val, max_val = mx.nd.contrib.quantize(data, min_range, max_range, out_type='int8') - data_np = data.asnumpy() - min_range = min_range.asscalar() - max_range = max_range.asscalar() - real_range = np.maximum(np.abs(min_range), np.abs(max_range)) - quantized_range = 127.0 - scale = quantized_range / real_range - assert qdata.dtype == np.int8 - assert min_val.dtype == np.float32 - assert max_val.dtype == np.float32 - assert same(min_val.asscalar(), -real_range) - assert same(max_val.asscalar(), real_range) - qdata_np = (np.sign(data_np) * np.minimum(np.abs(data_np) * scale + 0.5, quantized_range)).astype(np.int8) - assert_almost_equal(qdata.asnumpy(), qdata_np, atol = 1) - - -@with_seed() -def test_dequantize_int8_to_float32(): - - def get_test_data(real_range, qdata_np): - qdata = mx.nd.array(qdata_np, dtype=np.int8) - min_range = mx.nd.array([-real_range], dtype=np.float32) - max_range = mx.nd.array([real_range], dtype=np.float32) - return qdata, min_range, max_range - - def baseline_dequantization(qdata, real_range, qdata_np): - quantized_range = 127.0 - scale = real_range / quantized_range - data_np = qdata_np * scale - return data_np - - def test_nd_array_dequantization(qdata, min_range, 
max_range, expected_result): - data = mx.nd.contrib.dequantize(qdata, min_range, max_range, out_type='float32') - assert data.dtype == np.float32 - assert_almost_equal(data.asnumpy(), expected_result, atol = 1) - - def test_symbolic_api_dequantization(qdata, min_range, max_range, expected_result): - sym_data = mx.sym.Variable('data') - sym_min_range = mx.sym.Variable('min_range') - sym_max_range = mx.sym.Variable('max_range') - dequant = mx.sym.contrib.dequantize(sym_data, sym_min_range, - sym_max_range, out_type='float32') - out = dequant.bind(ctx=mx.current_context(), - args={'data':qdata, 'min_range':min_range, 'max_range':max_range}) - data = out.forward()[0] - assert data.dtype == np.float32 - assert_almost_equal(data.asnumpy(), expected_result, atol = 1) - - real_range = 128 - shape = rand_shape_nd(4) - qdata_np = np.random.uniform(low=-127, high=127, size=shape).astype(dtype=np.int8) - qdata, min_range, max_range = get_test_data(real_range, qdata_np) - expected_result = baseline_dequantization(qdata, real_range, qdata_np) - # test nd array implementation. - test_nd_array_dequantization(qdata, min_range, max_range, expected_result) - # test symbolic api implementaion. - test_symbolic_api_dequantization(qdata, min_range, max_range, expected_result) - -@with_seed() -def test_requantize_int32_to_int8(): - def quantized_int32_to_float(qdata, min_range, max_range): - assert qdata.dtype == 'int32' - quantized_range = np.iinfo('int32').max - real_range = np.maximum(np.abs(min_range), np.abs(max_range)) - scale = float(real_range) / float(quantized_range) - return qdata.astype('float32') * scale - - def float_to_quantized_int8(data, min_range, max_range): - assert data.dtype == 'float32' - real_range = np.maximum(np.abs(min_range), np.abs(max_range)) - quantized_range = np.iinfo('int8').max - scale = float(quantized_range) / float(real_range) - return (np.sign(data) * np.minimum(np.abs(data) * scale + 0.5, quantized_range)).astype('int8') - - def requantize(qdata, min_data, max_data, real_range): - data = quantized_int32_to_float(qdata, min_data, max_data) - output = float_to_quantized_int8(data, -real_range, real_range) - return output, -real_range, real_range - - def requantize_baseline(qdata, min_data, max_data, min_calib_range=None, max_calib_range=None): - if min_calib_range is not None and max_calib_range is not None: - real_range = np.maximum(np.abs(min_calib_range), np.abs(max_calib_range)) - return requantize(qdata, min_data, max_data, real_range) - else: - min_range = quantized_int32_to_float(np.min(qdata), min_data, max_data) - max_range = quantized_int32_to_float(np.max(qdata), min_data, max_data) - return requantize(qdata, min_data, max_data, np.maximum(np.abs(min_range), np.abs(max_range))) - - def check_requantize(shape, min_calib_range=None, max_calib_range=None): - qdata = mx.nd.random.uniform(low=-1000.0, high=1000.0, shape=shape).astype('int32') - min_range = mx.nd.array([-1010.0]) - max_range = mx.nd.array([1020.0]) - if min_calib_range is None or max_calib_range is None: - qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range) - else: - qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range, - min_calib_range=min_calib_range, - max_calib_range=max_calib_range) - - qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(), - max_range.asscalar(), - min_calib_range=min_calib_range, - max_calib_range=max_calib_range) - 
assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np, atol = 1) - assert_almost_equal(min_output.asnumpy(), np.array([min_output_np])) - assert_almost_equal(max_output.asnumpy(), np.array([max_output_np])) - - def check_requantize_with_symbol(shape, min_calib_range=None, max_calib_range=None): - qdata = mx.nd.random.uniform(low=-1000.0, high=1000.0, shape=shape).astype('int32') - min_range = mx.nd.array([-1010.0]) - max_range = mx.nd.array([1020.0]) - sym_data = mx.sym.Variable('data') - sym_min_range = mx.sym.Variable('min_range') - sym_max_range = mx.sym.Variable('max_range') - if min_calib_range is None or max_calib_range is None: - requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range) - out = requant.bind(ctx=mx.current_context(), - args={'data':qdata, 'min_range':min_range, - 'max_range':max_range}) - qdata_int8, min_output, max_output = out.forward() - else: - requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range, - min_calib_range=min_calib_range, - max_calib_range=max_calib_range) - out = requant.bind(ctx=mx.current_context(), args={'data':qdata, 'min_range':min_range, - 'max_range':max_range}) - qdata_int8, min_output, max_output = out.forward() - - qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(), - max_range.asscalar(), - min_calib_range=min_calib_range, - max_calib_range=max_calib_range) - assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np, atol = 1) - assert_almost_equal(min_output.asnumpy(), np.array([min_output_np])) - assert_almost_equal(max_output.asnumpy(), np.array([max_output_np])) - - # test with symbol API. - check_requantize_with_symbol((3, 4, 10, 10)) - check_requantize_with_symbol((32, 3, 23, 23)) - check_requantize_with_symbol((3, 4, 10, 10), min_calib_range=-1050.0, max_calib_range=1040.0) - check_requantize_with_symbol((32, 3, 23, 23), min_calib_range=-134.349, max_calib_range=523.43) - # Test with nd array API - check_requantize((3, 4, 10, 10)) - check_requantize((32, 3, 23, 23)) - check_requantize((3, 4, 10, 10), min_calib_range=-1050.0, max_calib_range=1040.0) - check_requantize((32, 3, 23, 23), min_calib_range=-134.349, max_calib_range=523.43) - - -@with_seed() -def test_quantized_conv(): - def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, dilate, no_bias, qdtype): - if is_test_for_native_cpu(): - print('skipped testing quantized_conv for native cpu since it is not supported yet') - return - elif is_test_for_mkldnn(): - # (TODO)Xinyu: https://github.com/apache/incubator-mxnet/issues/16830 - print('skipped testing quantized_conv for mkldnn cpu since it is a flaky case') - return - elif qdtype == 'uint8' and is_test_for_gpu(): - print('skipped testing quantized_conv for gpu uint8 since it is not supported yet') - return - elif is_test_for_gpu() and len(data_shape) != 4: - print('skipped testing quantized_conv for gpu 5d layout since it is not supported yet') - return - - # run fp32 conv - data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') - conv = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride, - dilate=dilate, no_bias=no_bias, cudnn_off=False, name='conv') - arg_shapes, _, _ = conv.infer_shape(data=data_shape) - arg_names = conv.list_arguments() - conv_exe_fp32 = conv.simple_bind(ctx=mx.current_context(), grad_req='null') - if qdtype == 'uint8': - data_low = 0.0 - data_high = 127.0 - else: - data_low = -127.0 - data_high = 127.0 - conv_exe_fp32.arg_dict[arg_names[0]][:] = 
mx.nd.random.uniform(low=data_low, high=data_high, - shape=data_shape).astype('int32') - conv_exe_fp32.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0, - shape=arg_shapes[1]).astype('int32') - if not no_bias: - conv_exe_fp32.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0, - shape=arg_shapes[2]).astype('int32') - output = conv_exe_fp32.forward()[0] - - # run quantized conv - qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype) - qweight = mx.sym.Variable(name='qweight', dtype='int8') - min_data = mx.sym.Variable(name='min_data') - max_data = mx.sym.Variable(name='max_data') - min_weight = mx.sym.Variable(name='min_weight') - max_weight = mx.sym.Variable(name='max_weight') - quantized_conv = mx.sym.contrib.quantized_conv(data=qdata, weight=qweight, min_data=min_data, - max_data=max_data, min_weight=min_weight, - max_weight=max_weight, kernel=kernel, - num_filter=num_filter, pad=pad, stride=stride, - dilate=dilate, no_bias=no_bias) - qarg_names = quantized_conv.list_arguments() - type_dict = None - if not no_bias: - type_dict = {qarg_names[2]: 'int8'} - conv_exe_int8 = quantized_conv.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null') - conv_exe_int8.arg_dict[qarg_names[0]][:] = conv_exe_fp32.arg_dict[arg_names[0]].astype(qdtype) - conv_exe_int8.arg_dict[qarg_names[1]][:] = conv_exe_fp32.arg_dict[arg_names[1]].astype('int8') - quantized_range = 127.0 - if no_bias: - conv_exe_int8.arg_dict[qarg_names[2]][:] = -quantized_range - conv_exe_int8.arg_dict[qarg_names[3]][:] = quantized_range - conv_exe_int8.arg_dict[qarg_names[4]][:] = -quantized_range - conv_exe_int8.arg_dict[qarg_names[5]][:] = quantized_range - else: - conv_exe_int8.arg_dict[qarg_names[2]][:] = conv_exe_fp32.arg_dict[arg_names[2]].astype('int8') - conv_exe_int8.arg_dict[qarg_names[3]][:] = -quantized_range - conv_exe_int8.arg_dict[qarg_names[4]][:] = quantized_range - conv_exe_int8.arg_dict[qarg_names[5]][:] = -quantized_range - conv_exe_int8.arg_dict[qarg_names[6]][:] = quantized_range - conv_exe_int8.arg_dict[qarg_names[7]][:] = -quantized_range - conv_exe_int8.arg_dict[qarg_names[8]][:] = quantized_range - qoutput, min_range, max_range = conv_exe_int8.forward() - - if no_bias: - assert_almost_equal(output.asnumpy(), qoutput.asnumpy(), atol = 1) - else: - # with adding bias, accuracy loss should not be greater than one - diff = mx.nd.abs(output - qoutput.astype(output.dtype)) - cond = mx.nd.lesser(2, diff).sum().asscalar() - assert cond == 0 - - for qdtype in ['int8', 'uint8']: - check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), (1, 1), True, qdtype) - check_quantized_conv((3, 4, 28, 28), (3, 3), 128, (1, 1), (1, 1), (1, 1), False, qdtype) - check_quantized_conv((1, 3, 4, 28, 28), (1, 3, 3), 128, (1, 1, 1), (1, 1, 1), (1, 1, 1), False, qdtype) - check_quantized_conv((1, 3, 4, 28, 28), (1, 3, 3), 128, (1, 1, 1), (1, 1, 1), (1, 1, 1), True, qdtype) - check_quantized_conv((1, 3, 4, 28, 28), (1, 3, 3), 128, (1, 1, 1), (1, 1, 1), (2, 2, 2), False, qdtype) - check_quantized_conv((1, 3, 4, 28, 28), (1, 3, 3), 128, (1, 1, 1), (1, 1, 1), (2, 2, 2), True, qdtype) - - -@with_seed() -def test_quantized_elemwise_add(): - def check_quantized_elemwise_add(data_shape, qtype): - if is_test_for_native_cpu(): - print('skipped testing quantized_elemwise_add for native cpu since it is not supported yet') - return - elif qtype != 'uint8' and qtype != 'int8': - print('skipped testing quantized_elemwise_add for not supported data type') - return - 
elif is_test_for_gpu(): - print('skipped testing quantized_elemwise_add for gpu since it is not supported yet') - return - - dataA = mx.sym.Variable(name='dataA', shape=data_shape, dtype='float32') - dataB = mx.sym.Variable(name='dataB', shape=data_shape, dtype='float32') - elemwise_add_fp32 = mx.sym.elemwise_add(dataA, dataB) - arg_names = elemwise_add_fp32.list_arguments() - elemwise_add_fp32_exe = elemwise_add_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') - if qtype == 'uint8': - data_low = 0.0 - data_high = 255.0 - else: - data_low = -127.0 - data_high = 127.0 - - dataA_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32') - dataB_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32') - elemwise_add_fp32_exe.arg_dict[arg_names[0]][:] = dataA_val - - elemwise_add_fp32_exe.arg_dict[arg_names[1]][:] = dataB_val - - output = elemwise_add_fp32_exe.forward()[0] - - qdataA = mx.sym.Variable(name='qdataA', shape=data_shape, dtype=qtype) - qdataB = mx.sym.Variable(name='qdataB', shape=data_shape, dtype=qtype) - min_dataA = mx.sym.Variable(name='min_dataA') - max_dataA = mx.sym.Variable(name='max_dataA') - min_dataB = mx.sym.Variable(name='min_dataB') - max_dataB = mx.sym.Variable(name='max_dataB') - quantized_elemwise_add = mx.sym.contrib.quantized_elemwise_add(qdataA, qdataB, min_dataA, max_dataA, min_dataB, max_dataB) - elemwise_add_int8_exe = quantized_elemwise_add.simple_bind(ctx=mx.current_context(), grad_req='null') - qarg_names = quantized_elemwise_add.list_arguments() - elemwise_add_int8_exe.arg_dict[qarg_names[0]][:] = elemwise_add_fp32_exe.arg_dict[arg_names[0]].astype(qtype) - elemwise_add_int8_exe.arg_dict[qarg_names[1]][:] = elemwise_add_fp32_exe.arg_dict[arg_names[1]].astype(qtype) - quantized_range = 127.0 - elemwise_add_int8_exe.arg_dict[qarg_names[2]][:] = data_low - elemwise_add_int8_exe.arg_dict[qarg_names[3]][:] = data_high - elemwise_add_int8_exe.arg_dict[qarg_names[4]][:] = data_low - elemwise_add_int8_exe.arg_dict[qarg_names[5]][:] = data_high - qoutput, min_range, max_range = elemwise_add_int8_exe.forward() - - int8_rslt = qoutput.astype(output.dtype)*max_range/0x7fffffff - diff = mx.nd.abs(output - int8_rslt) - cond = mx.nd.lesser(2, diff).sum().asscalar() - assert cond == 0 - - for qtype in ['int8', 'uint8']: - check_quantized_elemwise_add((4, 6), qtype) - check_quantized_elemwise_add((13, 74, 52), qtype) - check_quantized_elemwise_add((3, 4, 56, 56), qtype) - check_quantized_elemwise_add((32, 56, 64, 11), qtype) - -@with_seed() -def test_quantized_elemwise_mul(): - def check_quantized_elemwise_mul(data_shape, qtype): - if is_test_for_native_cpu(): - print('skipped testing quantized_elemwise_mul for native cpu since it is not supported yet') - return - elif qtype != 'int8': - print('skipped testing quantized_elemwise_mul for not supported data type') - return - elif is_test_for_gpu(): - print('skipped testing quantized_elemwise_mul for gpu since it is not supported yet') - return - - dataA = mx.sym.Variable(name='dataA', shape=data_shape, dtype='float32') - dataB = mx.sym.Variable(name='dataB', shape=data_shape, dtype='float32') - elemwise_mul_fp32 = mx.sym.elemwise_mul(dataA, dataB) - arg_names = elemwise_mul_fp32.list_arguments() - elemwise_mul_fp32_exe = elemwise_mul_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') - if qtype == 'uint8': - data_low = 0.0 - data_high = 255.0 - else: - data_low = -127.0 - data_high = 127.0 - - dataA_val = 
mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32') - dataB_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32') - elemwise_mul_fp32_exe.arg_dict[arg_names[0]][:] = dataA_val - - elemwise_mul_fp32_exe.arg_dict[arg_names[1]][:] = dataB_val - - output = elemwise_mul_fp32_exe.forward()[0] - - qdataA = mx.sym.Variable(name='qdataA', shape=data_shape, dtype=qtype) - qdataB = mx.sym.Variable(name='qdataB', shape=data_shape, dtype=qtype) - min_dataA = mx.sym.Variable(name='min_dataA') - max_dataA = mx.sym.Variable(name='max_dataA') - min_dataB = mx.sym.Variable(name='min_dataB') - max_dataB = mx.sym.Variable(name='max_dataB') - quantized_elemwise_mul = mx.sym.contrib.quantized_elemwise_mul(qdataA, qdataB, min_dataA, max_dataA, min_dataB, max_dataB) - elemwise_mul_int8_exe = quantized_elemwise_mul.simple_bind(ctx=mx.current_context(), grad_req='null') - qarg_names = quantized_elemwise_mul.list_arguments() - elemwise_mul_int8_exe.arg_dict[qarg_names[0]][:] = elemwise_mul_fp32_exe.arg_dict[arg_names[0]].astype(qtype) - elemwise_mul_int8_exe.arg_dict[qarg_names[1]][:] = elemwise_mul_fp32_exe.arg_dict[arg_names[1]].astype(qtype) - quantized_range = 127.0 - elemwise_mul_int8_exe.arg_dict[qarg_names[2]][:] = data_low - elemwise_mul_int8_exe.arg_dict[qarg_names[3]][:] = data_high - elemwise_mul_int8_exe.arg_dict[qarg_names[4]][:] = data_low - elemwise_mul_int8_exe.arg_dict[qarg_names[5]][:] = data_high - qoutput, min_range, max_range = elemwise_mul_int8_exe.forward() - - fp32_rslt = output.asnumpy() - int8_rslt = qoutput.astype(output.dtype) - assert_almost_equal(fp32_rslt, int8_rslt, atol = 1e-4) - - for qtype in ['int8', 'uint8']: - check_quantized_elemwise_mul((4, 6), qtype) - check_quantized_elemwise_mul((13, 74, 52), qtype) - check_quantized_elemwise_mul((3, 4, 56, 56), qtype) - check_quantized_elemwise_mul((32, 56, 64, 11), qtype) - -@with_seed() -def test_quantized_pooling(): - def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool, qdtype, convention='valid'): - if is_test_for_native_cpu(): - print('skipped testing quantized_pooling for native cpu since it is not supported yet') - return - elif qdtype == 'uint8' and is_test_for_gpu(): - print('skipped testing quantized_pooling for gpu uint8 since it is not supported yet') - return - elif is_test_for_gpu() and len(data_shape) != 4: - print('skipped testing quantized_pooling for gpu 5d layout since it is not supported yet') - return - - data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') - pooling_fp32 = mx.sym.Pooling(data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, global_pool=global_pool, cudnn_off=False, - pooling_convention=convention) - arg_shapes, _, _ = pooling_fp32.infer_shape(data=data_shape) - arg_names = pooling_fp32.list_arguments() - pooling_fp32_exe = pooling_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') - if qdtype == 'uint8': - data_low = 0.0 - data_high = 127.0 - else: - data_low = -127.0 - data_high = 127.0 - pooling_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high, - shape=data_shape).astype('int32') - output = pooling_fp32_exe.forward()[0] - - qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype) - min_data = mx.sym.Variable(name='min_data') - max_data = mx.sym.Variable(name='max_data') - quantized_pooling = mx.sym.contrib.quantized_pooling(data=qdata, min_data=min_data, - max_data=max_data, kernel=kernel, - pad=pad, 
stride=stride, pool_type=pool_type, - global_pool=global_pool, - pooling_convention=convention) - pooling_int8_exe = quantized_pooling.simple_bind(ctx=mx.current_context(), grad_req='null') - qarg_names = quantized_pooling.list_arguments() - pooling_int8_exe.arg_dict[qarg_names[0]][:] = pooling_fp32_exe.arg_dict[arg_names[0]].astype(qdtype) - quantized_range = 127.0 - pooling_int8_exe.arg_dict[qarg_names[1]][:] = -quantized_range - pooling_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range - qoutput, min_range, max_range = pooling_int8_exe.forward() - - if pool_type == 'max': - assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) - elif pool_type == 'avg': # for avg pooling, fp32 and int8 may be different due to rounding errors - diff = mx.nd.abs(output - qoutput.astype(output.dtype)) - cond = mx.nd.lesser(2, diff).sum().asscalar() - assert cond == 0 - - for qdtype in ['int8', 'uint8']: - check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), False, qdtype) - check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), True, qdtype) - check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), False, qdtype) - check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), True, qdtype) - check_quantized_pooling((3, 4, 3, 56, 56), (1, 3, 3), 'max', (0, 0, 0), (1, 2, 2), False, qdtype) - check_quantized_pooling((3, 4, 3, 56, 56), (1, 3, 3), 'max', (0, 0, 0), (1, 2, 2), True, qdtype) - check_quantized_pooling((3, 512, 3, 7, 7), (1, 7, 7), 'avg', (0, 0, 0), (1, 2, 2), False, qdtype) - check_quantized_pooling((3, 512, 3, 7, 7), (1, 7, 7), 'avg', (0, 0, 0), (1, 2, 2), True, qdtype) - - check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), False, qdtype, 'full') - check_quantized_pooling((3, 4, 56, 56), (3, 3), 'max', (0, 0), (2, 2), True, qdtype, 'full') - check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), False, qdtype, 'full') - check_quantized_pooling((3, 512, 7, 7), (7, 7), 'avg', (0, 0), (1, 1), True, qdtype, 'full') - check_quantized_pooling((3, 4, 3, 56, 56), (1, 3, 3), 'max', (0, 0, 0), (1, 2, 2), False, qdtype, 'full') - check_quantized_pooling((3, 4, 3, 56, 56), (1, 3, 3), 'max', (0, 0, 0), (1, 2, 2), True, qdtype, 'full') - check_quantized_pooling((3, 512, 3, 7, 7), (1, 7, 7), 'avg', (0, 0, 0), (1, 2, 2), False, qdtype, 'full') - check_quantized_pooling((3, 512, 3, 7, 7), (1, 7, 7), 'avg', (0, 0, 0), (1, 2, 2), True, qdtype, 'full') - - -@with_seed() -def test_quantized_fc(): - def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True): - if is_test_for_native_cpu(): - hasMKL = False - for key in os.environ.keys(): - if operator.eq(key, "BUILD_TAG"): - if os.environ['BUILD_TAG'].find("MKL") != -1: - hasMKL = True - break - if hasMKL == False: - print('skipped testing quantized_fc on cpu since s8u8s32 is only supported by MKL BLAS library') - return - elif qdtype == 'uint8' and is_test_for_gpu(): - print('skipped testing quantized_fc for gpu uint8 since it is not supported yet') - return - - def maxabs(a, b): - return mx.nd.maximum(mx.nd.abs(a), mx.nd.abs(b)) - - data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') - fc_fp32 = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, no_bias=no_bias, flatten=flatten) - arg_shapes, _, _ = fc_fp32.infer_shape(data=data_shape) - arg_names = fc_fp32.list_arguments() - fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') - int8_range = 127.0 - if qdtype == 'uint8': - data_low = 0.0 - data_high = 
63.0 - quantized_range = 255.0 - else: - data_low = -63.0 - data_high = 63.0 - quantized_range = 127.0 - - data = mx.nd.random.uniform(low=data_low, high=data_high, - shape=data_shape).astype('int32') - weight = mx.nd.random.uniform(low=data_low, high=data_high, - shape=arg_shapes[1]).astype('int32') - fc_fp32_exe.arg_dict[arg_names[0]][:] = data - fc_fp32_exe.arg_dict[arg_names[1]][:] = weight - - data_min = mx.nd.min(data).astype('float32') - data_max = mx.nd.max(data).astype('float32') - weight_min = mx.nd.min(weight).astype('float32') - weight_max = mx.nd.max(weight).astype('float32') - data_range = maxabs(data_min, data_max) - weight_range = maxabs(weight_min, weight_max) - - if not no_bias: - bias = mx.nd.random.uniform(low=data_low, high=data_high, - shape=arg_shapes[2]).astype('int32') - bias_min = mx.nd.min(bias).astype('float32') - bias_max = mx.nd.max(bias).astype('float32') - bias_range = maxabs(bias_min, bias_max) - - bias_scale = int8_range / bias_range - data_scale = quantized_range / data_range - weight_scale = int8_range / weight_range - bias_int32_rescale = data_scale * weight_scale / bias_scale - new_bias = mx.nd.cast(bias, dtype='float32') * bias_int32_rescale - fc_fp32_exe.arg_dict[arg_names[2]][:] = new_bias.astype('int32') - - output = fc_fp32_exe.forward()[0] - - qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype) - fc_int8 = mx.sym.contrib.quantized_fully_connected(data=qdata, num_hidden=num_hidden, - no_bias=no_bias, flatten=flatten) - qarg_names = fc_int8.list_arguments() - type_dict = {qarg_names[1]: 'int8'} - if not no_bias: - type_dict.update({qarg_names[2]: 'int8'}) - fc_int8_exe = fc_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null') - fc_int8_exe.arg_dict[qarg_names[0]][:] = fc_fp32_exe.arg_dict[arg_names[0]].astype(qdtype) - fc_int8_exe.arg_dict[qarg_names[1]][:] = fc_fp32_exe.arg_dict[arg_names[1]].astype('int8') - if no_bias: - fc_int8_exe.arg_dict[qarg_names[2]][:] = -data_range - fc_int8_exe.arg_dict[qarg_names[3]][:] = data_range - fc_int8_exe.arg_dict[qarg_names[4]][:] = -weight_range - fc_int8_exe.arg_dict[qarg_names[5]][:] = weight_range - else: - fc_int8_exe.arg_dict[qarg_names[2]][:] = bias.astype('int8') - fc_int8_exe.arg_dict[qarg_names[3]][:] = -data_range - fc_int8_exe.arg_dict[qarg_names[4]][:] = data_range - fc_int8_exe.arg_dict[qarg_names[5]][:] = -weight_range - fc_int8_exe.arg_dict[qarg_names[6]][:] = weight_range - fc_int8_exe.arg_dict[qarg_names[7]][:] = -bias_range - fc_int8_exe.arg_dict[qarg_names[8]][:] = bias_range - qoutput, min_range, max_range = fc_int8_exe.forward() - - if no_bias: - assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) - else: - # with adding bias, accuracy loss should not be greater than one - diff = mx.nd.abs(output - qoutput.astype(output.dtype)) - cond = mx.nd.lesser(2, diff).sum().asscalar() - assert cond == 0 - - for qdtype in ['int8', 'uint8']: - if is_test_for_mkldnn(): - check_quantized_fc((32, 512, 2), 100, True, qdtype, flatten=False) - check_quantized_fc((32, 512, 2), 100, False, qdtype, flatten=False) - check_quantized_fc((32, 512, 2, 2), 100, True, qdtype, flatten=False) - check_quantized_fc((32, 512, 2, 2), 100, False, qdtype, flatten=False) - check_quantized_fc((32, 512, 2, 2), 100, True, qdtype) - check_quantized_fc((32, 111, 2, 2), 100, True, qdtype) - check_quantized_fc((32, 512, 2, 2), 100, False, qdtype) - check_quantized_fc((32, 111, 2, 2), 100, False, qdtype) - check_quantized_fc((256, 2048, 2, 2), 800, False, qdtype) - 
check_quantized_fc((256, 111, 2, 2), 800, False, qdtype) - check_quantized_fc((256, 2048, 2, 2), 800, True, qdtype) - check_quantized_fc((256, 111, 2, 2), 800, True, qdtype) - -@with_seed() -def test_quantized_embedding(): - def check_quantized_embedding(data_shape, input_dim, output_dim): - if is_test_for_gpu(): - print('skipped testing test_quantized_embedding for gpu since it is not supported yet') - return - - def maxabs(a, b): - return mx.nd.maximum(mx.nd.abs(a), mx.nd.abs(b)) - - data0 = mx.sym.Variable(name='data', shape=data_shape, dtype='int32') - embedding_fp32 = mx.sym.Embedding(data=data0, input_dim=input_dim, output_dim=output_dim) - arg_shapes, _, _ = embedding_fp32.infer_shape(data=data_shape) - arg_names = embedding_fp32.list_arguments() - embedding_fp32_exe = embedding_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') - int8_range = 127.0 - data = mx.nd.random.uniform(low=0, high=input_dim, - shape=arg_shapes[0]).astype('int32') - weight = mx.nd.random.uniform(low=-int8_range, high=int8_range, - shape=arg_shapes[1]).astype('int32') - embedding_fp32_exe.arg_dict[arg_names[0]][:] = data - embedding_fp32_exe.arg_dict[arg_names[1]][:] = weight - - weight_min = mx.nd.min(weight).astype('float32') - weight_max = mx.nd.max(weight).astype('float32') - weight_range = maxabs(weight_min, weight_max) - - output = embedding_fp32_exe.forward()[0] - - embedding_int8 = mx.sym.contrib.quantized_embedding(data=data0, input_dim=input_dim, output_dim=output_dim) - qarg_names = embedding_int8.list_arguments() - type_dict = {qarg_names[1]: 'int8'} - embedding_int8_exe = embedding_int8.simple_bind(ctx=mx.current_context(), type_dict=type_dict, grad_req='null') - embedding_int8_exe.arg_dict[qarg_names[0]][:] = embedding_fp32_exe.arg_dict[arg_names[0]] - embedding_int8_exe.arg_dict[qarg_names[1]][:] = embedding_fp32_exe.arg_dict[arg_names[1]].astype('int8') - embedding_int8_exe.arg_dict[qarg_names[2]][:] = -weight_range - embedding_int8_exe.arg_dict[qarg_names[3]][:] = weight_range - qoutput, min_range, max_range = embedding_int8_exe.forward() - - assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) - - check_quantized_embedding((1,), 1000, 256) - check_quantized_embedding((1,), 1024, 512) - check_quantized_embedding((32,), 1000, 256) - check_quantized_embedding((32,), 1024, 512) - -@with_seed() -def test_quantized_flatten(): - def check_quantized_flatten(shape, qdtype): - if qdtype == 'uint8': - data_low = 0.0 - data_high = 127.0 - else: - data_low = -127.0 - data_high = 127.0 - qdata = mx.nd.random.uniform(low=data_low, high=data_high, shape=shape).astype(qdtype) - min_data = mx.nd.array([-1023.343], dtype='float32') - max_data = mx.nd.array([2343.324275], dtype='float32') - qoutput, min_output, max_output = mx.nd.contrib.quantized_flatten(qdata, min_data, max_data) - assert qoutput.ndim == 2 - assert qoutput.shape[0] == qdata.shape[0] - assert qoutput.shape[1] == np.prod(qdata.shape[1:]) - assert same(qdata.asnumpy().flatten(), qoutput.asnumpy().flatten()) - assert same(min_data.asnumpy(), min_output.asnumpy()) - assert same(max_data.asnumpy(), max_output.asnumpy()) - - for qdtype in ['int8', 'uint8']: - check_quantized_flatten((10,), qdtype) - check_quantized_flatten((10, 15), qdtype) - check_quantized_flatten((10, 15, 18), qdtype) - check_quantized_flatten((3, 4, 23, 23), qdtype) - -@with_seed() -def test_quantized_act(): - def check_quantized_act(data_shape, qdtype): - if is_test_for_native_cpu(): - print('skipped testing quantized_act for native cpu since it is not supported 
yet') - return - elif qdtype == 'int8' and is_test_for_mkldnn(): - print('skipped testing quantized_act for mkldnn cpu int8 since it is not supported yet') - return - elif is_test_for_gpu(): - print('skipped testing quantized_act for gpu since it is not supported yet') - return - data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') - act_fp32 = mx.sym.Activation(data=data, act_type='relu', name='relu') - arg_shapes, _, _ = act_fp32.infer_shape(data=data_shape) - arg_names = act_fp32.list_arguments() - act_fp32_exe = act_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') - if qdtype == 'uint8': - data_low = 0.0 - data_high = 127.0 - else: - data_low = -127.0 - data_high = 127.0 - - act_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, - high=data_high, shape=data_shape).astype(qdtype) - output = act_fp32_exe.forward()[0] - - qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype) - min_data = mx.sym.Variable(name='min_data') - max_data = mx.sym.Variable(name='max_data') - quantized_act = mx.sym.contrib.quantized_act(data=qdata, min_data=min_data, max_data=max_data, act_type='relu') - act_int8_exe = quantized_act.simple_bind(ctx=mx.current_context(), grad_req='null') - qarg_names = quantized_act.list_arguments() - - act_int8_exe.arg_dict[qarg_names[0]][:] = act_fp32_exe.arg_dict[arg_names[0]].astype(qdtype) - quantized_range_min = mx.nd.min(act_int8_exe.arg_dict[qarg_names[0]][:]) - quantized_range_max = mx.nd.max(act_int8_exe.arg_dict[qarg_names[0]][:]) - act_int8_exe.arg_dict[qarg_names[1]][:] = quantized_range_min.astype(qdtype) - act_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range_max.astype(qdtype) - qoutput, min_range, max_range = act_int8_exe.forward() - - assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) - assert_almost_equal(min_range.asscalar(), quantized_range_min.asscalar()) - assert_almost_equal(max_range.asscalar(), quantized_range_max.asscalar()) - - for qdtype in ['int8', 'uint8']: - check_quantized_act((10,), qdtype) - check_quantized_act((10, 15), qdtype) - check_quantized_act((10, 15, 18), qdtype) - check_quantized_act((3, 4, 23, 23), qdtype) - -@with_seed() -def test_quantize_params(): - if is_test_for_native_cpu(): - print('skipped testing quantized_params for native cpu since it is not supported yet') - return - - data = mx.sym.Variable('data') - conv = mx.sym.Convolution(data, kernel=(1, 1), num_filter=2048, name='conv') - sym = mx.sym.BatchNorm(data=conv, eps=2e-05, fix_gamma=False, momentum=0.9, use_global_stats=False, name='bn') - offline_params = [name for name in sym.list_arguments() - if not name.startswith('data') and not name.endswith('label')] - params = {} - for name in offline_params: - params[name] = mx.nd.uniform(shape=(2, 2)) - qsym, _ = mx.contrib.quant._quantize_symbol(sym, ctx=mx.current_context(), - offline_params=offline_params, quantize_mode='full') - qparams = mx.contrib.quant._quantize_params(qsym, params, th_dict = {}) - param_names = params.keys() - qparam_names = qparams.keys() - for name in qparam_names: - if name.startswith('bn'): - assert name in param_names - elif name.startswith('conv'): - assert name not in param_names - assert name.find('quantize') != -1 - -@with_seed() -def test_smooth_distribution(): - assert_exception(lambda: mx.contrib.quant._smooth_distribution(np.zeros((2,)), eps=1e-3), ValueError) - dirac_delta = np.zeros((5,)) - dirac_delta[2] = 1 - smooth_dirac_delta = dirac_delta.copy() - smooth_dirac_delta += 1e-3 - smooth_dirac_delta[2] -= 5e-3 - 
assert_almost_equal(mx.contrib.quant._smooth_distribution(dirac_delta, eps=1e-3), smooth_dirac_delta) - - -@with_seed() -def test_optimal_threshold_adversarial_case(): - # The worst case for the optimal_threshold function is when the values are concentrated - # at one edge: [0, 0, ..., 1000]. (histogram) - # We want to make sure that the optimal threshold in this case is the max. - hist = [] - hist_edges = [] - min_val = -2 - max_val = 2 - for i in range(0, 998): - hist.append(0) - for i in range(0, 999): - hist_edges.append((max_val - min_val) / 999 * i + min_val) - hist.append(1000) - hist_edges.append(max_val) - hist_data = (hist, hist_edges, min_val, max_val, max_val) - for dtype in ['uint8', 'int8', 'auto']: - res = mx.contrib.quant._get_optimal_threshold(hist_data, dtype, num_quantized_bins=5) - # The threshold should be 2. - print (res) - assert abs(res[2] - 2) < 1e-5 - - -@with_seed() -def test_get_optimal_thresholds(): - # Given an ndarray with elements following a uniform distribution, the optimal threshold - # for quantizing the ndarray should be either abs(min(nd)) or abs(max(nd)). - def get_threshold(nd): - min_nd = mx.nd.min(nd) - max_nd = mx.nd.max(nd) - return mx.nd.maximum(mx.nd.abs(min_nd), mx.nd.abs(max_nd)).asnumpy() - - for dtype in ['uint8', 'int8', 'auto']: - nd = mx.nd.uniform(low=-10.532, high=11.3432, shape=(8, 3, 23, 23), dtype=np.float64) - expected_threshold = get_threshold(nd) - arr = nd.asnumpy() - min_range = np.min(arr) - max_range = np.max(arr) - th = max(abs(min_range), abs(max_range)) - hist, hist_edges = np.histogram(arr, bins=8001, range=(-th, th)) - hist_dict = {'layer1' : (hist, hist_edges, min_range, max_range, th)} - th_dict = mx.contrib.quant._get_optimal_thresholds(hist_dict, dtype) - assert 'layer1' in th_dict - assert_almost_equal(np.array([th_dict['layer1'][1]]), expected_threshold, rtol=1e-2, atol=1e-4) diff --git a/tests/python/quantization_gpu/test_quantization_gpu.py b/tests/python/quantization_gpu/test_quantization_gpu.py deleted file mode 100644 index 0f14fa1ac961..000000000000 --- a/tests/python/quantization_gpu/test_quantization_gpu.py +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
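For reference, the deleted test_get_optimal_thresholds above asserts that, for a roughly uniform tensor, the calibration search in mx.contrib.quant lands near the naive symmetric max-abs threshold. A numpy-only sketch of that baseline and of the int8 rounding rule mirrored from the deleted test_quantize_float32_to_int8 (shapes and ranges here are illustrative, not from the tests):

import numpy as np

def max_abs_threshold(arr):
    # Symmetric calibration: one scale over [-th, th], centered at zero.
    return max(abs(arr.min()), abs(arr.max()))

arr = np.random.uniform(-10.5, 11.3, size=(8, 3, 23, 23)).astype(np.float32)
th = max_abs_threshold(arr)
scale = 127.0 / th   # float32 -> int8 scale
q = (np.sign(arr) * np.minimum(np.abs(arr) * scale + 0.5, 127.0)).astype(np.int8)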
-import os -import sys -import mxnet as mx - - -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.insert(0, os.path.join(curr_path, '../quantization')) -from mxnet.test_utils import set_default_context -from test_quantization import * - -set_default_context(mx.gpu(0)) diff --git a/tests/python/tensorrt/test_cvnets.py b/tests/python/tensorrt/test_cvnets.py deleted file mode 100644 index cd090c5e2f5c..000000000000 --- a/tests/python/tensorrt/test_cvnets.py +++ /dev/null @@ -1,170 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import gc -import mxnet as mx -import numpy as np - -from mxnet import gluon -from time import time - -from mxnet.gluon.data.vision import transforms - - -def get_classif_model(model_name, use_tensorrt, ctx=mx.gpu(0), batch_size=128): - mx.contrib.tensorrt.set_use_fp16(False) - h, w = 32, 32 - model_url = "https://raw.githubusercontent.com/dmlc/web-data/221ce5b7c6d5b0777a1e3471f7f03ff98da90a0a/gluoncv/models" - param_file = "{}-0000.params".format(model_name) - symbol_file = "{}-symbol.json".format(model_name) - mx.test_utils.download("{}/{}".format(model_url, param_file), fname=param_file, overwrite=True) - mx.test_utils.download("{}/{}".format(model_url, symbol_file), fname=symbol_file, overwrite=True) - net = gluon.SymbolBlock.imports(symbol_file, ['data'], param_file) - net.hybridize() - net.forward(mx.nd.zeros((batch_size, 3, h, w))) - net.export(model_name) - _sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 0) - if use_tensorrt: - sym = _sym.get_backend_symbol('TensorRT') - arg_params, aux_params = mx.contrib.tensorrt.init_tensorrt_params(sym, arg_params, - aux_params) - else: - sym = _sym - executor = sym.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), - softmax_label=(batch_size,), - grad_req='null', force_rebind=True) - executor.copy_params_from(arg_params, aux_params) - return executor - - -def cifar10_infer(model_name, use_tensorrt, num_workers, ctx=mx.gpu(0), batch_size=128): - executor = get_classif_model(model_name, use_tensorrt, ctx, batch_size) - - num_ex = 10000 - all_preds = np.zeros([num_ex, 10]) - - all_label_test = np.zeros(num_ex) - - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]) - ]) - - data_loader = lambda: gluon.data.DataLoader( - gluon.data.vision.CIFAR10(train=False).transform_first(transform_test), - batch_size=batch_size, shuffle=False, num_workers=num_workers) - - val_data = data_loader() - - for idx, (data, label) in enumerate(val_data): - # Skip last batch if it's undersized. 
- if data.shape[0] < batch_size: - continue - offset = idx * batch_size - all_label_test[offset:offset + batch_size] = label.asnumpy() - - # warm-up, but don't use result - executor.forward(is_train=False, data=data) - executor.outputs[0].wait_to_read() - - gc.collect() - val_data = data_loader() - example_ct = 0 - start = time() - - # if use_tensorrt: - for idx, (data, label) in enumerate(val_data): - # Skip last batch if it's undersized. - if data.shape[0] < batch_size: - continue - executor.forward(is_train=False, data=data) - preds = executor.outputs[0].asnumpy() - offset = idx * batch_size - all_preds[offset:offset + batch_size, :] = preds[:batch_size] - example_ct += batch_size - - all_preds = np.argmax(all_preds, axis=1) - matches = (all_preds[:example_ct] == all_label_test[:example_ct]).sum() - duration = time() - start - - return duration, 100.0 * matches / example_ct - - -def run_experiment_for(model_name, batch_size, num_workers): - print("\n===========================================") - print("Model: %s" % model_name) - print("===========================================") - print("*** Running inference using pure MXNet ***\n") - mx_duration, mx_pct = cifar10_infer(model_name=model_name, batch_size=batch_size, - num_workers=num_workers, use_tensorrt=False) - print("\nMXNet: time elapsed: %.3fs, accuracy: %.2f%%" % (mx_duration, mx_pct)) - print("\n*** Running inference using MXNet + TensorRT ***\n") - trt_duration, trt_pct = cifar10_infer(model_name=model_name, batch_size=batch_size, - num_workers=num_workers, use_tensorrt=True) - print("TensorRT: time elapsed: %.3fs, accuracy: %.2f%%" % (trt_duration, trt_pct)) - speedup = mx_duration / trt_duration - print("TensorRT speed-up (not counting compilation): %.2fx" % speedup) - - acc_diff = abs(mx_pct - trt_pct) - print("Absolute accuracy difference: %f" % acc_diff) - return speedup, acc_diff - - -def test_tensorrt_on_cifar_resnets(batch_size=32, tolerance=0.1, num_workers=1): - original_use_fp16 = mx.contrib.tensorrt.get_use_fp16() - try: - models = [ - 'cifar_resnet20_v1', - 'cifar_resnet56_v1', - 'cifar_resnet110_v1', - 'cifar_resnet20_v2', - 'cifar_resnet56_v2', - 'cifar_resnet110_v2', - 'cifar_wideresnet16_10' - ] - - num_models = len(models) - - speedups = np.zeros(num_models, dtype=np.float32) - acc_diffs = np.zeros(num_models, dtype=np.float32) - - test_start = time() - - for idx, model in enumerate(models): - speedup, acc_diff = run_experiment_for(model, batch_size, num_workers) - speedups[idx] = speedup - acc_diffs[idx] = acc_diff - assert acc_diff < tolerance, "Accuracy difference between MXNet and TensorRT > %.2f%% for model %s" % ( - tolerance, model) - - print("Perf and correctness checks run on the following models:") - print(models) - mean_speedup = np.mean(speedups) - std_speedup = np.std(speedups) - print("\nSpeedups:") - print(speedups) - print("Speedup range: [%.2f, %.2f]" % (np.min(speedups), np.max(speedups))) - print("Mean speedup: %.2f" % mean_speedup) - print("St. dev. of speedups: %.2f" % std_speedup) - print("\nAcc. 
differences: %s" % str(acc_diffs)) - - test_duration = time() - test_start - - print("Test duration: %.2f seconds" % test_duration) - finally: - mx.contrib.tensorrt.set_use_fp16(original_use_fp16) - diff --git a/tests/python/tensorrt/test_ops.py b/tests/python/tensorrt/test_ops.py deleted file mode 100644 index 7c50c589c967..000000000000 --- a/tests/python/tensorrt/test_ops.py +++ /dev/null @@ -1,445 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import numpy as np -from itertools import product -import copy - -from numpy.testing import assert_allclose - -import sys -import os -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.insert(0, os.path.join(curr_path, '../unittest')) -from common import setup_module, with_seed -import pytest - -def check_unsupported_single_sym(sym): - wrapped_sym = mx.sym.Group([mx.sym.identity(s) for s in sym]) - trt_sym = wrapped_sym.get_backend_symbol('TensorRT') - assert len(wrapped_sym.get_internals()) == len(trt_sym.get_internals()) - -def check_single_sym(sym, data_shapes, arg_params_shapes=None, aux_params_shapes=None, - rtol_fp32=1e-5, atol_fp32=0., rtol_fp16=1e-3, atol_fp16=0.): - if arg_params_shapes is None: - arg_params_shapes = {} - if aux_params_shapes is None: - aux_params_shapes = {} - for i in range(3): - data = {k: mx.nd.array(np.random.rand(*v) + 0.01, dtype='float32', ctx=mx.cpu()) - for k, v in data_shapes.items()} - arg_params = {k: mx.nd.array(np.random.rand(*v) + 0.01, dtype='float32', ctx=mx.cpu()) - for k, v in arg_params_shapes.items()} - aux_params = {k: mx.nd.array(np.random.rand(*v) + 0.01, dtype='float32', ctx=mx.cpu()) - for k, v in aux_params_shapes.items()} - wrapped_sym = mx.sym.Group([mx.sym.identity(s) for s in sym]) - - # Test FP32 MXNet Native - shapes = {} - shapes.update(data_shapes) - shapes.update(arg_params_shapes) - shapes.update(aux_params_shapes) - orig_executor = wrapped_sym.simple_bind(ctx=mx.gpu(0), grad_req='null', - force_rebind=True, **shapes) - orig_executor.copy_params_from(arg_params, aux_params) - orig_executor.forward(is_train=False, **data) - orig_outputs = [arr.asnumpy() for arr in orig_executor.outputs] - - # Test FP32 MXNet-TRT - mx.contrib.tensorrt.set_use_fp16(False) - trt_sym = wrapped_sym.get_backend_symbol('TensorRT') - assert len(trt_sym.get_internals()) < len(wrapped_sym.get_internals()) - remaining_arg_params, remaining_aux_params = \ - mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params) - shapes = {} - shapes.update(data_shapes) - shapes.update({k: v.shape for k, v in remaining_arg_params.items()}) - shapes.update({k: v.shape for k, v in remaining_aux_params.items()}) - trt_fp32_executor = trt_sym.simple_bind(ctx=mx.gpu(0), grad_req='null', - force_rebind=True, **shapes) - 
trt_fp32_executor.copy_params_from(remaining_arg_params, remaining_aux_params) - trt_fp32_executor.forward(is_train=False, **data) - trt_fp32_outputs = [arr.asnumpy() for arr in trt_fp32_executor.outputs] - - # Test FP16 MXNet-TRT - mx.contrib.tensorrt.set_use_fp16(True) - data = {k: v.astype('float16') for k, v in data.items()} - arg_params = {k: v.astype('float16') for k, v in arg_params.items()} - aux_params = {k: v.astype('float16') for k, v in aux_params.items()} - trt_sym = wrapped_sym.get_backend_symbol('TensorRT') - assert len(trt_sym.get_internals()) < len(wrapped_sym.get_internals()) - remaining_arg_params, remaining_aux_params = \ - mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params) - shapes = {} - shapes.update(data_shapes) - shapes.update({k: v.shape for k, v in remaining_arg_params.items()}) - shapes.update({k: v.shape for k, v in remaining_aux_params.items()}) - - trt_fp16_executor = trt_sym.simple_bind(ctx=mx.gpu(0), - type_dict={k: 'float16' for k in shapes.keys()}, - grad_req='null', force_rebind=True, **shapes) - trt_fp16_executor.copy_params_from(remaining_arg_params, remaining_aux_params) - trt_fp16_executor.forward(is_train=False, **data) - trt_fp16_outputs = [arr.asnumpy() for arr in trt_fp16_executor.outputs] - - for j, (orig, fp16, fp32) in enumerate(zip(orig_outputs, trt_fp16_outputs, trt_fp32_outputs)): - abs_orig = abs(orig) - diff32 = abs(fp32 - orig) - diff16 = abs(fp16.astype('float32') - orig) - _atol32 = diff32 - rtol_fp32 * abs_orig - _atol16 = diff16 - rtol_fp16 * abs_orig - print("{}: diff32({:.2E}) | diff16({:.2E}) | atol32({:.2E}) | atol16({:.2E}) | orig.min({:.2E})".format( - j, diff32.max(), diff16.max(), _atol32.max(), _atol16.max(), abs_orig.min())) - assert_allclose(fp32, orig, rtol=rtol_fp32, atol=atol_fp32) - assert_allclose(fp16, orig, rtol=rtol_fp16, atol=atol_fp16) - -@with_seed() -def test_noop(): - data = mx.sym.Variable('data') - check_unsupported_single_sym(data) - - -@with_seed() -def test_identity(): - data = mx.sym.Variable('data') - sym = mx.sym.identity(data) - check_single_sym(sym, data_shapes={'data': (8,3,32,32)}, - rtol_fp32=0., atol_fp32=0., rtol_fp16=1e-3, atol_fp16=1e-7) - - -@with_seed() -@pytest.mark.parametrize('kernel', [(3, 3), (1, 1), (3, 1)]) -@pytest.mark.parametrize('stride', [(1, 1), (2, 2), (2, 1)]) -@pytest.mark.parametrize('pad', [(1, 1), (0, 0), (1, 0)]) -@pytest.mark.parametrize('group', [1, 2]) -@pytest.mark.parametrize('layout', ['NCHW', 'NHWC']) -@pytest.mark.parametrize('no_bias', [True, False]) -@pytest.mark.parametrize('op', [mx.sym.Convolution, mx.sym.Deconvolution]) -def test_conv_deconv_2d(op, kernel, stride, pad, group, layout, no_bias): - data = mx.sym.Variable('data') - weight = mx.sym.Variable('weight') - data_shape = (8,3,16,16) - num_filter = 7 - if stride[0] > kernel[0] or stride[1] > kernel[1]: # doesn't make any sense - return - if kernel == (3, 3) and stride == (1, 1): - atol_fp32 = 1e-5 - rtol_fp32 = 1e-5 - atol_fp16 = 1e-3 - rtol_fp16 = 1e-2 - else: - atol_fp32 = 1e-5 - rtol_fp32 = 1e-5 - atol_fp16 = 1e-3 - rtol_fp16 = 1e-2 - if op == mx.sym.Convolution: - weight_shape = (num_filter, data_shape[1]) + kernel - else: - weight_shape = (data_shape[1], num_filter) + kernel - print("kernel: {} | stride: {} | pad: {} | group: {} | layout: {} | no_bias: {}".format( - kernel, stride, pad, group, layout, no_bias)) - kwargs = {'weight': weight, 'kernel': kernel, 'stride': stride, 'layout': layout, - 'pad': pad, 'num_filter': num_filter, 'no_bias': no_bias} - arg_params_shapes = 
{'weight': weight_shape} - if not no_bias: - arg_params_shapes['bias'] = (num_filter,) - bias = mx.sym.Variable('bias') - kwargs['bias'] = bias - sym = op(data, **kwargs) - - if layout == 'NCHW': - check_single_sym(sym, {'data': data_shape}, - arg_params_shapes, - rtol_fp32=rtol_fp32, atol_fp32=atol_fp32, - rtol_fp16=rtol_fp16, atol_fp16=atol_fp16) - else: - check_unsupported_single_sym(sym) - -@with_seed() -def test_fully_connected(): # TODO(cfujitsang): take care of flatten option - data = mx.sym.Variable('data') - weight = mx.sym.Variable('weight') - bias = mx.sym.Variable('bias') - data_shape = (8,64) - num_hidden = 7 - weight_shape = (num_hidden, data_shape[1]) - bias_shape = (num_hidden,) - sym = mx.sym.FullyConnected(data, weight=weight, bias=bias, no_bias=False, - num_hidden=num_hidden) - check_single_sym(sym, {'data': data_shape}, {'weight': weight_shape, 'bias': bias_shape}, - rtol_fp16=5e-3, atol_fp16=0.) - sym = mx.sym.FullyConnected(data, weight=weight, no_bias=True, num_hidden=num_hidden) - check_unsupported_single_sym(sym) - - -@with_seed() -def test_relu(): - data = mx.sym.Variable('data') - sym = mx.sym.relu(data) - for data_shape in [(10, 32), (10, 3, 32), (10, 3, 32, 32), (10, 3, 7, 32, 32)]: - check_single_sym(sym, {'data': data_shape}, rtol_fp32=0., atol_fp32=0., - rtol_fp16=1e-3, atol_fp16=1e-7) - - -@with_seed() -def test_activation(): - data = mx.sym.Variable('data') - for act_type in ['relu', 'sigmoid', 'tanh']: - sym = mx.sym.Activation(data, act_type=act_type) - for data_shape in [(10, 32), (10, 3, 32), (10, 3, 32, 32), (10,3,7,32,32)]: - check_single_sym(sym, {'data': data_shape}, rtol_fp32=0., atol_fp32=0., - rtol_fp16=1e-3, atol_fp16=1e-7) - for act_type in ['softrelu', 'softsign']: - sym = mx.sym.Activation(data, act_type=act_type) - check_unsupported_single_sym(sym) - - -@with_seed() -def test_pooling2d(): - data = mx.sym.Variable('data') - data_shape = (4, 3, 32,32) - for pool_type in ['max', 'avg', 'lp', 'sum']: - if pool_type == 'max': - rtol_fp32 = 1e-6 - atol_fp32 = 0. - rtol_fp16 = 1e-3 - atol_fp16 = 0. - else: - rtol_fp32 = 5e-6 - atol_fp32 = 0. - rtol_fp16 = 1e-3 - atol_fp16 = 0. - for layout in ['NHWC', 'NCHW']: - for (stride, pad, kernel, count_include_pad, pooling_convention) \ - in product([(2,2), (2,1)], [(0,0), (1,1)], [(2,2), (3,2)], - [True, False], ['valid', 'full']): - print("pool_type: {} | layout: {} | stride: {} | pad: {} | ".format( - pool_type, layout, stride, pad) + - "kernel: {} | count_include_pad: {} | pooling_convention: {}".format( - kernel, count_include_pad, pooling_convention)) - sym = mx.sym.Pooling(data, kernel=kernel, pool_type=pool_type, stride=stride, - pad=pad, layout=layout, count_include_pad=count_include_pad, - pooling_convention=pooling_convention) - if (layout == 'NHWC') or \ - pool_type not in ('max', 'avg') or \ - pooling_convention != 'valid' or \ - (pool_type == 'avg' and count_include_pad): - check_unsupported_single_sym(sym) - else: - check_single_sym(sym, {'data': data_shape}, - rtol_fp32=rtol_fp32, atol_fp32=atol_fp32, - rtol_fp16=rtol_fp16, atol_fp16=atol_fp16) - print("pool_type: {} | layout: {} | global_pool".format(pool_type, layout)) - sym = mx.sym.Pooling(data, global_pool=True, pool_type=pool_type, layout=layout) - if layout == 'NHWC' or pool_type not in ('max', 'avg'): - check_unsupported_single_sym(sym) - else: - if pool_type == 'max': - rtol_fp32 = 0. - atol_fp32 = 0. - rtol_fp16 = 1e-3 - atol_fp16 = 0. - else: - rtol_fp32 = 1e-5 - atol_fp32 = 0. - rtol_fp16 = 1e-3 - atol_fp16 = 0. 
- check_single_sym(sym, {'data': data_shape}, rtol_fp32=rtol_fp32, - atol_fp32=atol_fp32, rtol_fp16=rtol_fp16, atol_fp16=atol_fp16) - -def check_batch_norm(sym, data_shapes, arg_params_shapes=None, aux_params_shapes=None, - rtol_fp32=1e-5, atol_fp32=1e-7, rtol_fp16=1e-2, atol_fp16=1e-3): - if arg_params_shapes is None: - arg_params_shapes = {} - if aux_params_shapes is None: - aux_params_shapes = {} - for i in range(3): - data = { - 'data': mx.nd.array(np.random.rand(*data_shapes['data']) + 0.01, - dtype='float32', ctx=mx.cpu()) - } - arg_params = { - 'gamma': mx.nd.array(np.random.rand(*arg_params_shapes['gamma']) * 0.1 + 1., - dtype='float32', ctx=mx.cpu()), - 'beta': mx.nd.array(np.random.rand(*arg_params_shapes['beta']), - dtype='float32', ctx=mx.cpu()) - } - aux_params = { - 'moving_mean': mx.nd.array( - 0.45 + np.random.rand(*aux_params_shapes['moving_mean']) * 0.1 + 0.01, - dtype='float32', ctx=mx.cpu()), - 'moving_var': mx.nd.array( - 0.95 + np.random.rand(*aux_params_shapes['moving_var']) * 0.1, - dtype='float32', ctx=mx.cpu()) - } - wrapped_sym = mx.sym.Group([mx.sym.identity(s) for s in sym]) - - # Test FP32 MXNet Native - shapes = {} - shapes.update(data_shapes) - shapes.update(arg_params_shapes) - shapes.update(aux_params_shapes) - orig_executor = wrapped_sym.simple_bind(ctx=mx.gpu(0), grad_req='null', - force_rebind=True, **shapes) - orig_executor.copy_params_from(arg_params, aux_params) - orig_executor.forward(is_train=False, **data) - orig_outputs = [arr.asnumpy() for arr in orig_executor.outputs] - - # Test FP32 MXNet-TRT - mx.contrib.tensorrt.set_use_fp16(False) - trt_sym = wrapped_sym.get_backend_symbol('TensorRT') - assert len(trt_sym.get_internals()) < len(wrapped_sym.get_internals()) - remaining_arg_params, remaining_aux_params = \ - mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params) - shapes = {} - shapes.update(data_shapes) - shapes.update({k: v.shape for k, v in remaining_arg_params.items()}) - shapes.update({k: v.shape for k, v in remaining_aux_params.items()}) - trt_fp32_executor = trt_sym.simple_bind(ctx=mx.gpu(0), grad_req='null', - force_rebind=True, **shapes) - trt_fp32_executor.copy_params_from(remaining_arg_params, remaining_aux_params) - trt_fp32_executor.forward(is_train=False, **data) - trt_fp32_outputs = [arr.asnumpy() for arr in trt_fp32_executor.outputs] - - # Test FP16 MXNet-TRT - mx.contrib.tensorrt.set_use_fp16(True) - data = {k: v.astype('float16') for k, v in data.items()} - arg_params = {k: v.astype('float32') for k, v in arg_params.items()} - aux_params = {k: v.astype('float32') for k, v in aux_params.items()} - trt_sym = wrapped_sym.get_backend_symbol('TensorRT') - remaining_arg_params, remaining_aux_params = \ - mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params) - shapes = {} - shapes.update(data_shapes) - shapes.update({k: v.shape for k, v in remaining_arg_params.items()}) - shapes.update({k: v.shape for k, v in remaining_aux_params.items()}) - - trt_fp16_executor = trt_sym.simple_bind(ctx=mx.gpu(0), - type_dict={k: 'float16' for k in shapes.keys()}, - grad_req='null', force_rebind=True, **shapes) - trt_fp16_executor.copy_params_from(remaining_arg_params, remaining_aux_params) - trt_fp16_executor.forward(is_train=False, **data) - trt_fp16_outputs = [arr.asnumpy() for arr in trt_fp16_executor.outputs] - - - for j, (orig, fp16, fp32) in enumerate(zip(orig_outputs, - trt_fp16_outputs, - trt_fp32_outputs)): - abs_orig = abs(orig) - diff32 = abs(fp32 - orig) - diff16 = abs(fp16.astype('float32') - 
orig) - _atol32 = diff32 - rtol_fp32 * abs_orig - _atol16 = diff16 - rtol_fp16 * abs_orig - print("{}: diff32({:.2E}) | diff16({:.2E}) | atol32({:.2E}) | atol16({:.2E}) | orig.min({:.2E})".format( - j, diff32.max(), diff16.max(), _atol32.max(), _atol16.max(), abs_orig.min())) - assert_allclose(fp32, orig, rtol=rtol_fp32, atol=atol_fp32) - assert_allclose(fp16.astype('float32'), orig, rtol=rtol_fp16, atol=atol_fp16) - -@with_seed() -def test_batch_norm(): - data = mx.sym.Variable('data') - gamma = mx.sym.Variable('gamma') - beta = mx.sym.Variable('beta') - moving_mean = mx.sym.Variable('moving_mean') - moving_var = mx.sym.Variable('moving_var') - data_shape = (4,3,32,32) - gamma_shape = (3,) - beta_shape = (3,) - moving_mean_shape = (3,) - moving_var_shape = (3,) - for fix_gamma in [True, False]: - for use_global_stats in [True, False]: - for axis in [0, 1, 2, 3]: - sym = mx.sym.BatchNorm(data, gamma=gamma, beta=beta, moving_mean=moving_mean, - fix_gamma=fix_gamma, moving_var=moving_var, momentum=0.9, - axis=axis, use_global_stats=use_global_stats, eps=1e-5) - if axis == 1: - check_batch_norm(sym, - {'data': data_shape}, {'gamma': gamma_shape, 'beta': beta_shape}, - {'moving_mean': moving_mean_shape, 'moving_var': moving_var_shape}, - atol_fp32=2e-7) - else: - check_unsupported_single_sym(sym) - - -@with_seed() -def test_clip(): - data = mx.sym.Variable('data') - sym = mx.sym.clip(data, 0.25, 0.75) - for data_shape in [(10, 32), (10, 3, 32), (10, 3, 32, 32), (10,3,7,32,32)]: - check_single_sym(sym, {'data': data_shape}, - rtol_fp32=0., atol_fp32=0., - rtol_fp16=1e-3, atol_fp16=0.) - - -@with_seed() -def test_concat(): - lhs = mx.sym.Variable('lhs') - rhs = mx.sym.Variable('rhs') - shape = [3, 5, 7, 9] - lhs_shape = tuple(shape) - for axis in range(1, 4): - sym = mx.sym.concat(lhs, rhs, dim=axis) - rhs_shape = copy.copy(shape) - rhs_shape[axis] = 1 - rhs_shape = tuple(rhs_shape) - check_single_sym(sym, {'lhs': lhs_shape, 'rhs': rhs_shape}, - rtol_fp32=0., atol_fp32=0., rtol_fp16=1e-3, atol_fp16=1e-7) - - -@with_seed() -def test_elemwise_ops(): - lhs = mx.sym.Variable('lhs') - rhs = mx.sym.Variable('rhs') - shape = (3, 5, 7, 9) - lhs_shape = tuple(shape) - sym = mx.sym.elemwise_add(lhs, rhs) - check_single_sym(sym, {'lhs': shape, 'rhs': shape}, - rtol_fp32=0., atol_fp32=0.) - - sym = mx.sym.elemwise_sub(lhs, rhs) - # TODO(cfujitsang): is atol_fp16 ok ? 
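[Aside on the tolerance arithmetic these deleted checks rely on, including the atol_fp16 the TODO above asks about: numpy's assert_allclose passes elementwise when |actual - desired| <= atol + rtol * |desired|, so a nonzero atol_fp16 is what absorbs FP16 error on outputs near zero, where the relative term vanishes. A minimal self-contained illustration with made-up values:

import numpy as np
from numpy.testing import assert_allclose

orig = np.array([0.0, 1.0, 100.0], dtype=np.float32)       # FP32 reference
fp16 = np.array([5e-8, 1.0005, 100.05], dtype=np.float32)  # upcast FP16 result
# Element 0 is covered only by atol (rtol * |0.0| == 0); the others by rtol.
assert_allclose(fp16, orig, rtol=1e-3, atol=1e-7)]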
- check_single_sym(sym, {'lhs': shape, 'rhs': shape}, - rtol_fp32=0., atol_fp32=0., rtol_fp16=1e-3, atol_fp16=1e-3) - - sym = mx.sym.elemwise_mul(lhs, rhs) - check_single_sym(sym, {'lhs': shape, 'rhs': shape}, - rtol_fp32=0., atol_fp32=0., rtol_fp16=5e-3, atol_fp16=1e-7) - -@with_seed() -def test_flatten(): - data = mx.sym.Variable('data') - sym = mx.sym.flatten(data) - for data_shape in [(3, 5, 7), (3, 5, 7, 9), (3, 5, 7, 9, 11)]: - check_single_sym(sym, {'data': data_shape}, - rtol_fp32=0., atol_fp32=0., atol_fp16=1e-7) - -@with_seed() -def test_dropout(): - data = mx.sym.Variable('data') - for data_shape in [(3, 5), (3, 5, 7), (3, 5, 7, 9)]: - for mode in ['training', 'always']: - sym = mx.sym.Dropout(data, p=0.7, mode=mode) - if mode == 'training': - check_single_sym(sym, {'data': data_shape}, - rtol_fp32=0., atol_fp32=0., atol_fp16=1e-7) - else: - check_unsupported_single_sym(sym) - sym = mx.sym.Dropout(data, p=0.7, mode=mode, axes=(0,)) - check_unsupported_single_sym(sym) - diff --git a/tests/python/tensorrt/test_resnet18.py b/tests/python/tensorrt/test_resnet18.py deleted file mode 100644 index e146423e257d..000000000000 --- a/tests/python/tensorrt/test_resnet18.py +++ /dev/null @@ -1,71 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from mxnet.gluon.model_zoo import vision -from mxnet.test_utils import assert_almost_equal -import mxnet as mx -import numpy as np -import os - -batch_shape = (1, 3, 224, 224) -url = 'https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/cat.jpg?raw=true' -model_file_name = 'resnet18_v2_trt_test' - -def get_image(image_url): - fname = mx.test_utils.download(image_url, fname=image_url.split('/')[-1].split('?')[0]) - img = mx.image.imread(fname) - img = mx.image.imresize(img, 224, 224) # Resize - img = img.transpose((2, 0, 1)) # Channel first - img = img.expand_dims(axis=0) # Batchify - img = mx.nd.cast(img, dtype=np.float32) - return img / 255.0 - -def test_tensorrt_resnet18_feature_vect(): - print("downloading sample input") - input_data = get_image(url) - gluon_resnet18 = vision.resnet18_v2(pretrained=True) - gluon_resnet18.hybridize() - gluon_resnet18.forward(input_data) - gluon_resnet18.export(model_file_name) - sym, arg_params, aux_params = mx.model.load_checkpoint(model_file_name, 0) - - executor = sym.simple_bind(ctx=mx.gpu(), data=batch_shape, - grad_req='null', force_rebind=True) - executor.copy_params_from(arg_params, aux_params) - y = executor.forward(is_train=False, data=input_data) - trt_sym = sym.get_backend_symbol('TensorRT') - arg_params, aux_params = mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params) - original_precision_value = mx.contrib.tensorrt.get_use_fp16() - try: - mx.contrib.tensorrt.set_use_fp16(True) - executor = trt_sym.simple_bind(ctx=mx.gpu(), data=batch_shape, - grad_req='null', force_rebind=True) - executor.copy_params_from(arg_params, aux_params) - y_trt = executor.forward(is_train=False, data=input_data) - mx.contrib.tensorrt.set_use_fp16(False) - executor = trt_sym.simple_bind(ctx=mx.gpu(), data=batch_shape, - grad_req='null', force_rebind=True) - executor.copy_params_from(arg_params, aux_params) - y_trt_fp32 = executor.forward(is_train=False, data=input_data) - no_trt_output = y[0].asnumpy()[0] - trt_output = y_trt[0].asnumpy()[0] - trt_fp32_output = y_trt_fp32[0].asnumpy()[0] - assert_almost_equal(no_trt_output, trt_output, 1e-1, 1e-2) - assert_almost_equal(no_trt_output, trt_fp32_output, 1e-4, 1e-4) - finally: - mx.contrib.tensorrt.set_use_fp16(original_precision_value) - diff --git a/tests/python/unittest/onnx/README.md b/tests/python/unittest/onnx/README.md index d8f58cba3d5c..908c8f435ff4 100644 --- a/tests/python/unittest/onnx/README.md +++ b/tests/python/unittest/onnx/README.md @@ -30,4 +30,4 @@ any operator. for "both", import and export, "import" alone, or "export" alone. * `test_models.py` - custom tests for models * `test_node.py` - custom tests for operators. These tests are written independent of ONNX tests, in case -ONNX doesn't have tests yet or for MXNet specific operators. \ No newline at end of file +ONNX doesn't have tests yet or for MXNet specific operators. 
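[The test-suite hunks that follow migrate callers from the public Symbol.bind / Symbol.simple_bind executor API to the underscore-prefixed internal equivalents. A minimal sketch of the pattern, assuming a development build where the rename has landed (the hunks show the signatures unchanged, only the names gaining a leading underscore):

import mxnet as mx
import numpy as np

x = mx.sym.Variable('x')
y = 2 * x
args = {'x': mx.nd.array(np.ones((2, 3)))}

# previously: exe = y.bind(mx.cpu(), args=args)
exe = y._bind(mx.cpu(), args=args)
print(exe.forward(is_train=False)[0].asnumpy())

# previously: exe = y.simple_bind(mx.cpu(), x=(2, 3))
exe = y._simple_bind(mx.cpu(), x=(2, 3))]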
diff --git a/tests/python/unittest/onnx/backend_rep.py b/tests/python/unittest/onnx/backend_rep.py index be6bc88d9d70..597b1c701877 100644 --- a/tests/python/unittest/onnx/backend_rep.py +++ b/tests/python/unittest/onnx/backend_rep.py @@ -80,7 +80,7 @@ def run(self, inputs, **kwargs): data_forward.append(mx.nd.array(val)) args = dict(zip(data_names, data_forward)) - exe = self.symbol.bind(ctx, args=args, aux_states=self.aux_params) + exe = self.symbol._bind(ctx, args=args, aux_states=self.aux_params) exe.forward(is_train=False) result = [] for output in exe.outputs: diff --git a/tests/python/unittest/onnx/test_node.py b/tests/python/unittest/onnx/test_node.py index 1097cf74c642..00e557c306f5 100644 --- a/tests/python/unittest/onnx/test_node.py +++ b/tests/python/unittest/onnx/test_node.py @@ -203,9 +203,10 @@ def test_exports(self): ("test_expand", "Expand", mx.sym.broadcast_to, (2,1,3,1), {'shape': (2,1,3,1)}), ("test_tile", "Tile", mx.sym.tile, (2,1,3,1), {'reps': (2,3)}), ("test_topk", "TopK", mx.sym.topk, (2, 10, 2), {'k': 3, 'axis': 1, 'ret_typ': 'both', 'dtype': np.int64}), - ("test_slice_axis", "Slice", mx.sym.slice_axis, (2, 10, 2), {'begin': 3, 'end': 7, 'axis': 1}), - ("test_LSTM", "LSTM", mx.gluon.rnn.LSTM, (3,1,2), {'hidden_size': 3}), - ("test_BiLSTM", "LSTM", mx.gluon.rnn.LSTM, (3,1,2), {'hidden_size': 3, 'bidirectional': True}), + ("test_slice_axis", "Slice", mx.sym.slice_axis, (2, 10, 2), {'begin': 3, 'end': 7, 'axis': 1}) + # https://github.com/apache/incubator-mxnet/issues/18596 + # ("test_LSTM", "LSTM", mx.gluon.rnn.LSTM, (3,1,2), {'hidden_size': 3}), + # ("test_BiLSTM", "LSTM", mx.gluon.rnn.LSTM, (3,1,2), {'hidden_size': 3, 'bidirectional': True}), ] if __name__ == '__main__': diff --git a/tests/python/unittest/test_contrib_control_flow.py b/tests/python/unittest/test_contrib_control_flow.py index 4de075c8fad2..962ce6239115 100644 --- a/tests/python/unittest/test_contrib_control_flow.py +++ b/tests/python/unittest/test_contrib_control_flow.py @@ -16,6 +16,7 @@ # under the License. 
import copy +import pytest import numpy as np import mxnet as mx from mxnet import gluon @@ -218,7 +219,7 @@ def _zeros_like_dict(name_list): args_names = ["FreeVar" + str(i) for i, _ in enumerate(free_var_shapes)] \ + ["LoopVar" + str(i) for i, _ in enumerate(loop_var_shapes) if i >= loop_var_start] args_grad = None if not is_train else _zeros_like_dict(x for x in args_names) - executor = loop_result_sym.bind( + executor = loop_result_sym._bind( ctx=default_context(), args=_copy_args_dict(loop_result_sym.list_inputs()), args_grad=args_grad, @@ -251,6 +252,7 @@ def _zeros_like_dict(name_list): @with_seed() +@pytest.mark.skip(reason="Bug in while loop op, tracked at incubator-mxnet/issues/18575") def test_while_loop_for_foreach(): def make_true_cond(): @@ -876,7 +878,7 @@ def _get_sym_result(is_train, args, args_grad, out_grad): mx.sym.var("sc"), ] result_sym = mx.sym.Group(make_loop(i, j, x_sum, sc)) - executor = result_sym.bind( + executor = result_sym._bind( ctx=default_context(), args=args, args_grad=args_grad, @@ -958,7 +960,7 @@ def _get_symbolic_result(out_grads): outputs_sym = _as_list(outputs_sym) outputs_sym = [x * 2 for x in outputs_sym] outputs_sym = mx.sym.Group(outputs_sym) - executor = outputs_sym.bind( + executor = outputs_sym._bind( ctx=default_context(), args={name: _args_dict[name].copy() for name in outputs_sym.list_inputs()}, args_grad=None if not is_train else _merge_dict( @@ -1166,9 +1168,9 @@ def verify_foreach(step, in_syms, state_syms, free_syms, i = i + 1 if is_train: - e = out.bind(ctx=default_context(), args=arg_dict, args_grad=arg_grad_dict) + e = out._bind(ctx=default_context(), args=arg_dict, args_grad=arg_grad_dict) else: - e = out.bind(ctx=default_context(), args=arg_dict) + e = out._bind(ctx=default_context(), args=arg_dict) # the inputs to forward and backward are the same so forward and backward # should always return the same outputs. 
for i in range(num_iters): @@ -1410,6 +1412,7 @@ def step13(in1, states, free): def step14(in1, states, free): return (in1 + free[0], []) frees = [mx.nd.random.uniform(shape=(2))] + out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)], []] verify_foreach(step14, v3, [], [v4], arrs, [], frees, out_grads) verify_foreach(step14, v3, [], [v4], arrs, [], frees, out_grads, False) def step15(in1, states, free): @@ -1469,7 +1472,7 @@ def step_nd(in1, states): state = mx.nd.arange(2) data_grad = mx.nd.empty(data.shape) state_grad = mx.nd.empty(state.shape) - e = out.bind(ctx=default_context(), args={'v1':data, 'v2':state}, + e = out._bind(ctx=default_context(), args={'v1':data, 'v2':state}, args_grad={'v1':data_grad, 'v2':state_grad}) e.forward(is_train=True) out_grads = [] diff --git a/tests/python/unittest/test_contrib_operator.py b/tests/python/unittest/test_contrib_operator.py index 6f8b415c648e..37a02a7ce14b 100644 --- a/tests/python/unittest/test_contrib_operator.py +++ b/tests/python/unittest/test_contrib_operator.py @@ -45,7 +45,7 @@ def test_box_nms_backward(data, grad, expected, thresh=0.5, valid=0, topk=-1, co op = mx.contrib.sym.box_nms(in_var, overlap_thresh=thresh, valid_thresh=valid, topk=topk, coord_start=coord, score_index=score, id_index=cid, background_id=bid, force_suppress=force, in_format=in_format, out_format=out_format) - exe = op.bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) + exe = op._bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) exe.forward(is_train=True) exe.backward(mx.nd.array(grad)) assert_almost_equal(arr_grad.asnumpy(), expected) diff --git a/tests/python/unittest/test_deferred_compute.py b/tests/python/unittest/test_deferred_compute.py index 18ed0b4c5103..c2441dc54835 100644 --- a/tests/python/unittest/test_deferred_compute.py +++ b/tests/python/unittest/test_deferred_compute.py @@ -93,7 +93,7 @@ def _assert_dc(setup, compute, mode='all', setup_is_deterministic=True, numpy=Tr xs = setup(nd=nd) args = {name: x for name, x in zip(xs_names, xs)} - ys_sym = sym.bind(mx.context.current_context(), args=args).forward() + ys_sym = sym._bind(mx.context.current_context(), args=args).forward() ys_sym_np = [y.asnumpy() for y in ys_sym] _all_same(ys_np, ys_sym_np) diff --git a/tests/python/unittest/test_exc_handling.py b/tests/python/unittest/test_exc_handling.py index f544ab5d6510..72e21272a1c2 100644 --- a/tests/python/unittest/test_exc_handling.py +++ b/tests/python/unittest/test_exc_handling.py @@ -58,7 +58,7 @@ def symbolic(exec_backward=True, waitall=True): 'y': mx.nd.random.normal(0, 1, x_shape, ctx=default_context()), 'z': mx.nd.random.normal(0, 1, z_shape, ctx=default_context())} arr_grad = {'x': mx.nd.empty(x_shape), 'y': mx.nd.empty(x_shape), 'z': mx.nd.empty(z_shape)} - exec1 = out.bind(ctx=default_context(), args=arr, args_grad=arr_grad) + exec1 = out._bind(ctx=default_context(), args=arr, args_grad=arr_grad) outputs = exec1.forward() if exec_backward: exec1.backward() diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py index 300e4b2590c8..0e142bf5b05a 100644 --- a/tests/python/unittest/test_executor.py +++ b/tests/python/unittest/test_executor.py @@ -39,15 +39,15 @@ def check_bind_with_uniform(uf, gf, dim, sf=None, lshape=None, rshape=None): rhs_arr = mx.nd.array(np.random.uniform(-1, 1, rshape)) lhs_grad = mx.nd.empty(lshape) rhs_grad = mx.nd.empty(rshape) - executor = ret.bind(mx.Context('cpu'), + executor = ret._bind(mx.Context('cpu'), args=[lhs_arr, rhs_arr], 
args_grad=[lhs_grad, rhs_grad]) - exec3 = ret.bind(mx.Context('cpu'), + exec3 = ret._bind(mx.Context('cpu'), args=[lhs_arr, rhs_arr]) - exec4 = ret.bind(mx.Context('cpu'), + exec4 = ret._bind(mx.Context('cpu'), args={'rhs': rhs_arr, 'lhs': lhs_arr}, args_grad={'lhs': lhs_grad, 'rhs': rhs_grad}) @@ -141,26 +141,34 @@ def test_reshape(): x = mx.sym.Variable('x') y = mx.sym.FullyConnected(x, num_hidden=4) - exe = y.simple_bind(mx.cpu(), x=(5,4), grad_req='null') + exe = y._simple_bind(mx.cpu(), x=(5,4), grad_req='null') exe.arg_arrays[0][:] = 1 exe.arg_arrays[1][:] = mx.nd.ones((4,4)) exe.arg_arrays[2][:] = 0 - new_exe = exe.reshape(x=(3,4)) - new_exe.forward(is_train=False) + exe.forward(is_train=False) # test sub exec forward - assert np.all(new_exe.outputs[0].asnumpy() == 4) + assert np.all(exe.outputs[0].asnumpy() == 4) # test shared memory assert np.all(exe.outputs[0].asnumpy()[:3] == 4) # test base exec forward exe.forward(is_train=False) assert np.all(exe.outputs[0].asnumpy() == 4) - # test sharing ndarray depending on new_shape - new_exe = exe.reshape(allow_up_sizing=True, x=(6,4)) # data ndarray is not shared between exe and new_exe - new_exe.arg_arrays[0][:] = 0 - assert np.all(exe.arg_arrays[0].asnumpy() == 1) + exe.arg_arrays[0][:] = 0 # weight ndarray is shared between exe and new_exe - assert np.all(new_exe.arg_arrays[1].asnumpy() == 1) + assert np.all(exe.arg_arrays[1].asnumpy() == 1) +@with_seed() +def test_cached_op_init(): + def check_init(static_alloc, static_shape): + out = mx.sym.zeros((3,3)) + flags = [('static_alloc', static_alloc), ('static_shape', static_shape)] + exe = mx.ndarray.CachedOp(out, flags) + z = exe(None, default_ctx=mx.cpu()) + assert np.all(z.asnumpy() == 0) + + check_init(False, False) + check_init(True, False) + check_init(True, True) diff --git a/tests/python/unittest/test_extensions.py b/tests/python/unittest/test_extensions.py index 2e21f927737b..52f999571e13 100644 --- a/tests/python/unittest/test_extensions.py +++ b/tests/python/unittest/test_extensions.py @@ -72,9 +72,9 @@ def test_custom_op(): in_grad2 = [mx.nd.empty((dim_n,dim_k),ctx=mx.cpu()),mx.nd.empty((dim_k,dim_m),ctx=mx.cpu())] in_grad_base = [mx.nd.empty((dim_n,dim_k),ctx=mx.cpu()),mx.nd.empty((dim_k,dim_m),ctx=mx.cpu())] - exe1 = c.bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad1) - exe2 = d.bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad2) - exe_base = base.bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad_base) + exe1 = c._bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad1) + exe2 = d._bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad2) + exe_base = base._bind(ctx=mx.cpu(),args={'s':mat1,'t':mat2},args_grad=in_grad_base) out1 = exe1.forward() out2 = exe2.forward() @@ -132,13 +132,13 @@ def test_subgraph(): args = {'a':mx.nd.ones((3,2),ctx=mx.cpu()), 'b':mx.nd.ones((3,2),ctx=mx.cpu())} # baseline - regular execution in MXNet - exe = sym.bind(ctx=mx.cpu(), args=args) + exe = sym._bind(ctx=mx.cpu(), args=args) out = exe.forward() # without propogating shapes/types, passing a custom option to subgraph prop "myOpt" # should not create subgraph since subgraph prop requires type info mysym1 = sym.optimize_for("myProp", myOpt='yello') - exe1 = mysym1.bind(ctx=mx.cpu(), args=args) + exe1 = mysym1._bind(ctx=mx.cpu(), args=args) out1 = exe1.forward() # check that result matches one executed by MXNet assert_almost_equal(out[0].asnumpy(), out1[0].asnumpy(), rtol=1e-3, atol=1e-3) @@ -146,14 +146,14 @@ def test_subgraph(): # with propogating 
shapes/types, rejecting subgraph # this tests creating the subgraph and having the subgraph prop reject it mysym2 = sym.optimize_for("myProp", args, reject=True) - exe2 = mysym2.bind(ctx=mx.cpu(), args=args) + exe2 = mysym2._bind(ctx=mx.cpu(), args=args) out2 = exe2.forward() # check that result matches one executed by MXNet assert_almost_equal(out[0].asnumpy(), out2[0].asnumpy(), rtol=1e-3, atol=1e-3) # with propogating shapes/types mysym3 = sym.optimize_for("myProp",args) - exe3 = mysym3.bind(ctx=mx.cpu(), args=args) + exe3 = mysym3._bind(ctx=mx.cpu(), args=args) out3 = exe3.forward() # check that result matches one executed by MXNet assert_almost_equal(out[0].asnumpy(), out3[0].asnumpy(), rtol=1e-3, atol=1e-3) diff --git a/tests/python/unittest/test_gluon_contrib.py b/tests/python/unittest/test_gluon_contrib.py index d7356575b927..33ea1e495e91 100644 --- a/tests/python/unittest/test_gluon_contrib.py +++ b/tests/python/unittest/test_gluon_contrib.py @@ -22,7 +22,7 @@ from mxnet.gluon import contrib from mxnet.gluon import nn from mxnet.gluon.contrib.nn import ( - Concurrent, HybridConcurrent, Identity, SparseEmbedding, PixelShuffle1D, + Concurrent, HybridConcurrent, Identity, PixelShuffle1D, PixelShuffle2D, PixelShuffle3D) from mxnet.test_utils import almost_equal, default_context, assert_almost_equal, assert_allclose from common import setup_module, with_seed, teardown_module @@ -195,18 +195,6 @@ def test_identity(): x = mx.nd.random.uniform(shape=(128, 33, 64)) assert_almost_equal(model(x), x) -@with_seed() -def test_sparse_embedding(): - layer = SparseEmbedding(10, 100) - layer.initialize() - trainer = mx.gluon.Trainer(layer.collect_params(), 'sgd') - x = mx.nd.array([3,4,2,0,1]) - with mx.autograd.record(): - y = layer(x) - y.backward() - assert (layer.weight.grad().asnumpy()[:5] == 1).all() - assert (layer.weight.grad().asnumpy()[5:] == 0).all() - def test_pixelshuffle1d(): nchan = 2 up_x = 2 diff --git a/tests/python/unittest/test_infer_type.py b/tests/python/unittest/test_infer_type.py index 286556a006c4..f103e2957819 100644 --- a/tests/python/unittest/test_infer_type.py +++ b/tests/python/unittest/test_infer_type.py @@ -31,6 +31,7 @@ def test_infer_multiout_op(): y = mx.nd.split(data, axis=0, num_outputs=2) y[0].backward() assert data.grad.dtype == np.float64 + mx.nd.waitall() @with_seed() def test_infer_multiout_op2(): diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py index d0751e3cb094..61c26b3fa36a 100644 --- a/tests/python/unittest/test_io.py +++ b/tests/python/unittest/test_io.py @@ -64,7 +64,7 @@ def test_MNISTIter(tmpdir): train_dataiter.iter_next() label_1 = train_dataiter.getlabel().asnumpy().flatten() assert(sum(label_0 - label_1) == 0) - + mx.nd.waitall() def test_Cifar10Rec(tmpdir): path = str(tmpdir) diff --git a/tests/python/unittest/test_memory_opt.py b/tests/python/unittest/test_memory_opt.py index af671fe468e7..0cc217ffff47 100644 --- a/tests/python/unittest/test_memory_opt.py +++ b/tests/python/unittest/test_memory_opt.py @@ -67,18 +67,7 @@ def test_rnn_cell(): y = mx.sym.FullyConnected(y, num_hidden=num_hidden) tmp = mx.sym._internal._plus(x, y) z = mx.sym.Activation(tmp, act_type='tanh') - exec = z.simple_bind(mx.cpu(), 'write', x=(num_hidden,), y=(num_hidden,)) - exec_debug_str = exec.debug_str().split('\n') - op_checklist = 0 - for i, line in enumerate(exec_debug_str): - if "Op:elemwise_add" in line: - op_checklist += 1 - assert exec_debug_str[i + 5] == "\t__mirror_stage__=0" - if "Op:Activation" in line: - op_checklist += 1 - 
assert exec_debug_str[i + 4] == "\t__mirror_stage__=0" - assert op_checklist == 2, \ - "Not all operator nodes have been verified on the mirror stage" + exec = z._simple_bind(mx.cpu(), 'write', x=(num_hidden,), y=(num_hidden,)) @memory_opt_env_check @@ -101,19 +90,7 @@ def test_mlp_attn(): name="activation%d"%i)) in_arg_shapes["y_t%d"%i] = (1, num_hidden,) z = mx.sym.Group(z) - exec = z.simple_bind(mx.cpu(), 'write', **in_arg_shapes) - exec_debug_str = exec.debug_str().split('\n') - op_checklist = 0 - for i, line in enumerate(exec_debug_str): - for t in range(num_steps): - if line == "Op:broadcast_add, Name=broadcast_add%d"%t: - op_checklist += 1 - assert exec_debug_str[i + 5] == "\t__mirror_stage__=1" - if line == "Op:Activation, Name=activation%d"%t: - op_checklist += 1 - assert exec_debug_str[i + 4] == "\t__mirror_stage__=1" - assert op_checklist == 2 * num_steps, \ - "Not all operator nodes have been verified on the mirror stage" + exec = z._simple_bind(mx.cpu(), 'write', **in_arg_shapes) @memory_opt_env_check @@ -126,45 +103,7 @@ def test_fc(): y = mx.sym.Activation(x, act_type='tanh', name='y') z = mx.sym.Activation(y, act_type='tanh', name='z') z = mx.sym.FullyConnected(z, num_hidden=num_hidden) - exec = z.simple_bind(mx.cpu(), 'write', x=(num_hidden,)) - exec_debug_str = exec.debug_str().split('\n') - op_checklist = 0 - for i, line in enumerate(exec_debug_str): - if line == "Op:Activation, Name=y": - op_checklist += 1 - assert exec_debug_str[i + 4] == "\t__mirror_stage__=0" - if line == "Op:Activation, Name=z": - op_checklist += 1 - assert exec_debug_str[i + 4] == "\t__mirror_stage__=1" - if "Op:FullyConnected" in line: - op_checklist += 1 - assert exec_debug_str[i + 6] == "\t__mirror_stage__=0" - if "Op:_backward_FullyConnected" in line: - op_checklist += 1 - assert exec_debug_str[i + 3] == "\targ[1]=z_mirror(0)" - assert op_checklist == 4, \ - "Not all operator nodes have been verified on the mirror stage" - - -def grep_exec_memory_consumption(exec): - # Grep the memory consumption (in MB) from the executor debug string. - # - # It is important to note that, due to various reasons, the memory - # consumption reported by the executor debug string might be very different - # when compared with the real numbers reported by nvidia-smi. These reasons - # include: - # - Allocations by the CUDA Library (e.g., cuDNN, cuBLAS) - # - Fragmentation (of the MXNet Memory Allocator and cudaMalloc) - exec_debug_str = exec.debug_str().split('\n') - - import re # We will be using regular expressions for grepping the model - # memory consumption. - alloc_line_pattern = re.compile("Total \d+ MB allocated") - for line in exec_debug_str: - if alloc_line_pattern.match(line) is not None: - return int(line.split()[1]) - assert False, "Unable to gerp the memory consumption numbers from the executor " \ - "debug string: %s" % exec_debug_str + exec = z._simple_bind(mx.cpu(), 'write', x=(num_hidden,)) if __name__ == "__main__": diff --git a/tests/python/unittest/test_model_parallel.py b/tests/python/unittest/test_model_parallel.py deleted file mode 100644 index 6036caf0a5ba..000000000000 --- a/tests/python/unittest/test_model_parallel.py +++ /dev/null @@ -1,87 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -import mxnet as mx -from mxnet.test_utils import * - -def reldiff(a, b): - diff = np.sum(np.abs(a - b)) - norm = np.sum(np.abs(a)) - if diff == 0: - return 0 - reldiff = diff / norm - return reldiff - -def test_chain(ctx1=mx.cpu(0), ctx2=mx.cpu(1), dtype=np.float32): - n = 2 - data1 = mx.sym.Variable('data1', dtype=dtype) - data2 = mx.sym.Variable('data2', dtype=dtype) - data3 = mx.sym.Variable('data3', dtype=dtype) - with mx.AttrScope(ctx_group='dev1'): - net = data1 + data2 - net = net * dtype(3) - - with mx.AttrScope(ctx_group='dev2'): - net = net + data3 - - arr = [] - arr_grad = [] - shape = (4, 5) - with mx.Context(ctx1): - for i in range(n): - arr.append(mx.nd.empty(shape, dtype=dtype)) - arr_grad.append(mx.nd.empty(shape, dtype=dtype)) - with mx.Context(ctx2): - arr.append(mx.nd.empty(shape, dtype=dtype)) - arr_grad.append(mx.nd.empty(shape, dtype=dtype)) - - exec1 = net.bind(ctx1, - args=arr, - args_grad=arr_grad, - group2ctx={'dev1': ctx1, 'dev2': ctx2}) - arr[0][:] = dtype(1) - arr[1][:] = dtype(2) - arr[2][:] = dtype(3) - arr2 = [a.copyto(ctx1) for a in arr] - arr_grad2 = [a.copyto(ctx1) for a in arr_grad] - exec2 = net.bind(ctx1, - args=arr2, - args_grad=arr_grad2) - - # Show the execution plan that involves copynode - print(exec1.debug_str()) - exec1.forward(is_train=True) - exec2.forward(is_train=True) - assert reldiff(exec1.outputs[0].asnumpy(), exec2.outputs[0].asnumpy()) < 1e-6 - out_grad = mx.nd.empty(shape, ctx1) - out_grad[:] = dtype(1) - exec1.backward([out_grad]) - exec2.backward([out_grad.copyto(ctx1)]) - for a, b in zip(arr_grad, arr_grad2): - assert reldiff(a.asnumpy(), b.asnumpy()) < 1e-6 - -def test_chain_type_device(): - ctx_pairs = [(mx.cpu(0), mx.cpu(1))] - if default_context().device_type == 'gpu': - ctx_pairs = ctx_pairs + [(mx.gpu(0), mx.gpu(0)), (mx.cpu(0), mx.gpu(0)), (mx.gpu(0), mx.cpu(0))] - for ctx1, ctx2 in ctx_pairs: - for dtype in [np.float16, np.float32, np.float64]: - test_chain(ctx1, ctx2, dtype) - -if __name__ == '__main__': - test_chain_type_device() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 18dca753f432..5e46a68a7a00 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -60,7 +60,7 @@ def test_rnn_with_new_param(): bind_dict['rnn_state_cell'] = mx.ndarray.zeros( shape=(num_layers * directions, batch_size, state_size)) - ex = sym.bind(default_context(), bind_dict) + ex = sym._bind(default_context(), bind_dict) ex.forward(is_train=True) ex01 = ex.output_dict['rnn_output'].asnumpy() ex.forward(is_train=False) @@ -85,7 +85,7 @@ def test_lstm_dropout(): T, N, I, H = 300, 20, 800, 800 rnn = mx.sym.RNN(data=X, parameters=Params, state=HX, state_cell=CX, state_size=H, num_layers=5, mode='lstm', p=0.5, state_outputs=True, name='LSTM') - exe = rnn.simple_bind(ctx=mx.cpu(), x=(T, N, I)) + exe = rnn._simple_bind(ctx=mx.cpu(), x=(T, N, I)) out = 
exe.forward(is_train=True) out[0].wait_to_read() @@ -98,7 +98,7 @@ def test_gru_dropout(): T, N, I, H = 300, 20, 800, 800 rnn = mx.sym.RNN(data=X, parameters=Params, state=HX, state_size=H, num_layers=5, mode='gru', p=0.5, state_outputs=True, name='GRU') - exe = rnn.simple_bind(ctx=mx.cpu(), x=(T, N, I)) + exe = rnn._simple_bind(ctx=mx.cpu(), x=(T, N, I)) out = exe.forward(is_train=True) out[0].wait_to_read() @@ -111,7 +111,7 @@ def test_rnntanh_dropout(): T, N, I, H = 300, 20, 800, 800 rnn = mx.sym.RNN(data=X, parameters=Params, state=HX, state_size=H, num_layers=5, mode='rnn_tanh', p=0.5, state_outputs=True, name='RNN_TANH') - exe = rnn.simple_bind(ctx=mx.cpu(), x=(T, N, I)) + exe = rnn._simple_bind(ctx=mx.cpu(), x=(T, N, I)) out = exe.forward(is_train=True) out[0].wait_to_read() @@ -124,7 +124,7 @@ def test_rnnrelu_dropout(): T, N, I, H = 300, 20, 800, 800 rnn = mx.sym.RNN(data=X, parameters=Params, state=HX, state_size=H, num_layers=5, mode='rnn_relu', p=0.5, state_outputs=True, name='RNN_RELU') - exe = rnn.simple_bind(ctx=mx.cpu(), x=(T, N, I)) + exe = rnn._simple_bind(ctx=mx.cpu(), x=(T, N, I)) out = exe.forward(is_train=True) out[0].wait_to_read() @@ -150,7 +150,7 @@ def test_RNN_float64(): args_grad = explicit_grad grad_req = 'write' - ex = sym.bind(default_context(), + ex = sym._bind(default_context(), { 'in': mx.nd.ones([2, 1, 2], dtype=dtype), 'par': mx.nd.ones([12], dtype=dtype), @@ -177,7 +177,7 @@ def check_elementwise_sum_with_shape(shape, n): arr_grad = [mx.nd.empty(shape) for i in range(n)] for i in range(n): arr[i][:] = np.random.uniform(-10, 10, shape) - exec1 = out.bind(default_context(), + exec1 = out._bind(default_context(), args=arr, args_grad=arr_grad) @@ -231,7 +231,7 @@ def check_concat_with_shape(shapes, dimension, skip_second): args = out.list_arguments() arg_shapes, out_shapes, aux_shapes = out.infer_shape(**dict(zip(args, shapes))) out_grad = mx.nd.empty(out_shapes[0]) - exec1 = out.bind(default_context(), + exec1 = out._bind(default_context(), args=arr, args_grad=dict_grad) exec1.forward(is_train=True) @@ -325,9 +325,9 @@ def check_slice_channel(data_ndim, axis, num_outputs, squeeze_axis): out_grads_npy = [np.random.normal(size=out_ele_shape) for i in range(num_outputs)] data = mx.sym.Variable('data') sym = mx.sym.SliceChannel(data=data, num_outputs=num_outputs, axis=axis, squeeze_axis=squeeze_axis) - exe = sym.simple_bind(ctx=default_context(), data=data_npy.shape) - assert len(exe.outputs) == num_outputs + exe = sym._simple_bind(ctx=default_context(), data=data_npy.shape) outputs = exe.forward(is_train=True, data=data_npy) + assert len(exe.outputs) == num_outputs for i in range(num_outputs): gt = data_npy.take(np.arange(i * shape[axis]/num_outputs, (i+1) * shape[axis]/num_outputs).astype(np.int), axis=axis) @@ -336,7 +336,8 @@ def check_slice_channel(data_ndim, axis, num_outputs, squeeze_axis): else: assert_almost_equal(outputs[i], gt) # test backward - exe.backward(out_grads=[mx.nd.array(ele, ctx=default_context()) for ele in out_grads_npy]) + ograd = [mx.nd.array(ele, dtype=outputs[i].dtype) for i, ele in enumerate(out_grads_npy)] + exe.backward(out_grads=ograd) if squeeze_axis: assert_almost_equal(exe.grad_arrays[0], np.concatenate([np.expand_dims(ele, axis=axis) for ele in out_grads_npy], @@ -358,7 +359,7 @@ def test_python_op(): x = mx.ndarray.ones((10))*10 dx = mx.ndarray.zeros((10)) dy = mx.ndarray.ones((10)) - exec1 = s.bind(default_context(), args=[x], args_grad = {'X': dx}) + exec1 = s._bind(default_context(), args=[x], args_grad = {'X': dx}) 
exec1.forward(is_train=True) assert_almost_equal(x, exec1.outputs[0]) exec1.backward(dy) @@ -374,7 +375,7 @@ def test_swapaxes(): arr_data = mx.nd.array(data_tmp) swap0 = mx.symbol.SwapAxis(data=data, dim1=0, dim2=2) swap = mx.symbol.SwapAxis(data=swap0, dim1=1, dim2=2) - exe_c = swap.bind(default_context(), args=[arr_data]) + exe_c = swap._bind(default_context(), args=[arr_data]) exe_c.forward(is_train=True) out = exe_c.outputs[0] @@ -395,7 +396,7 @@ def test_swapaxes(): data_mx = mx.nd.array(data_np, dtype=data_np.dtype) ret_np = np.swapaxes(data_np, axis1=axis1, axis2=axis2) ret_mx = mx.symbol.SwapAxis(data, dim1=axis1, dim2=axis2) - exe_c = ret_mx.bind(default_context(), args=[data_mx]) + exe_c = ret_mx._bind(default_context(), args=[data_mx]) exe_c.forward(is_train=True) out = exe_c.outputs[0] assert_almost_equal(out, ret_np) @@ -546,7 +547,7 @@ def fleaky_relu_grad(grad, x, y, act_type, slope=0.25): if dtype is not np.float16: check_numeric_gradient(y, [xa], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype) check_symbolic_forward(y, [xa], [ya], rtol=rtol, atol=atol, dtype=dtype) - check_symbolic_backward(y, [xa], [np.ones(shape)], [ga], rtol=rtol, atol=atol, dtype=dtype) + check_symbolic_backward(y, [xa], [np.ones(shape, dtype=dtype)], [ga], rtol=rtol, atol=atol, dtype=dtype) # NOTE(haojin2): Skipping the numeric check tests for float16 data type due to precision issues, @@ -607,9 +608,10 @@ def fprelu_grad(x, y, gamma): check_numeric_gradient(y, [xa, gam], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype) check_numeric_gradient(y, [xa, gam_full], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype) check_symbolic_forward(y, [xa, gam], [ya], rtol=rtol, atol=atol, dtype=dtype) - check_symbolic_backward(y, [xa, gam], [np.ones(shape), np.ones(gam.shape)], [g_xa, g_gam], rtol=rtol, atol=atol, dtype=dtype) + check_symbolic_backward(y, [xa, gam], [np.ones(ya.shape, dtype=dtype)], + [g_xa, g_gam], rtol=rtol, atol=atol, dtype=dtype) check_symbolic_forward(y, [xa, gam_full], [ya_full], rtol=rtol, atol=atol, dtype=dtype) - check_symbolic_backward(y, [xa, gam_full], [np.ones(shape), np.ones(gam_full.shape)], + check_symbolic_backward(y, [xa, gam_full], [np.ones(ya_full.shape, dtype=dtype)], [g_xa_full, g_gam_full], rtol=rtol, atol=atol, dtype=dtype) @with_seed() @@ -640,7 +642,7 @@ def fselu_grad(grad, x, y): ga = fselu_grad(np.ones(shape).astype(dtype), xa, ya) check_numeric_gradient(y, [xa], numeric_eps=eps, rtol=rtol, atol=atol, dtype=dtype) check_symbolic_forward(y, [xa], [ya], rtol=rtol, atol=atol, dtype=dtype) - check_symbolic_backward(y, [xa], [np.ones(shape)], [ga], rtol=rtol, atol=atol, dtype=dtype) + check_symbolic_backward(y, [xa], [np.ones(shape, dtype=dtype)], [ga], rtol=rtol, atol=atol, dtype=dtype) @with_seed() @@ -699,7 +701,7 @@ def test_shape_array(): xg = mx.nd.empty(xa.shape) ya = np.shape(xa) yg = mx.nd.ones(ya) - exe = y.bind(ctx=default_context(), args={'x': xa}, + exe = y._bind(ctx=default_context(), args={'x': xa}, args_grad={'x': xg}) exe.forward(is_train=True) exe.backward([yg]) @@ -717,7 +719,7 @@ def test_size_array(): xg = mx.nd.empty(xa.shape) ya = np.size(xa) yg = mx.nd.ones(ya) - exe = y.bind(ctx=default_context(), args={'x': xa}, + exe = y._bind(ctx=default_context(), args={'x': xa}, args_grad={'x': xg}) exe.forward(is_train=True) exe.backward([yg]) @@ -781,15 +783,15 @@ def _inner_test(forward_gt, logic_sym, x_shape, y_shape, test_scalar=True): z = logic_sym(x, y) x_npy = np.random.randint(0, 4, size=x_shape).astype(np.float32) y_npy = np.random.randint(0, 
4, size=y_shape).astype(np.float32) - exe = z.simple_bind(ctx=default_context(), x=x_shape, y=y_shape) + exe = z._simple_bind(ctx=default_context(), x=x_shape, y=y_shape) mx_out = exe.forward(is_train=True, x=x_npy, y=y_npy)[0] assert_almost_equal(mx_out, forward_gt(x_npy, y_npy)) exe.backward() if test_scalar: z_lscalar = logic_sym(1, y) z_rscalar = logic_sym(x, 1) - exe_lscalar = z_lscalar.simple_bind(ctx=default_context(), y=y_shape) - exe_rscalar = z_rscalar.simple_bind(ctx=default_context(), x=x_shape) + exe_lscalar = z_lscalar._simple_bind(ctx=default_context(), y=y_shape) + exe_rscalar = z_rscalar._simple_bind(ctx=default_context(), x=x_shape) mx_lscalar_out = exe_lscalar.forward(is_train=True, y=y_npy)[0] mx_rscalar_out = exe_rscalar.forward(is_train=True, x=x_npy)[0] assert_almost_equal(mx_lscalar_out, forward_gt(1, y_npy)) @@ -841,7 +843,7 @@ def reference(a, dtype): assert_almost_equal(mx_out, reference(xa, dtype=xa.dtype)) x = mx.sym.Variable('x') y = mx.sym.logical_not(data=x) - exe = y.simple_bind(ctx=default_context(), x=shape) + exe = y._simple_bind(ctx=default_context(), x=shape) sym_out = exe.forward(is_train=True, x=mx_xa)[0] assert_almost_equal(sym_out, reference(xa, dtype=xa.dtype)) @@ -854,7 +856,7 @@ def test_embedding(): data = mx.sym.Variable("data") embed = mx.sym.Embedding(data=data, input_dim=in_dim, output_dim=out_dim, name="embed") - exe_test = embed.simple_bind(default_context(), grad_req={'data': 'null', 'embed_weight': 'write'}, data=(batch,)) + exe_test = embed._simple_bind(default_context(), grad_req={'data': 'null', 'embed_weight': 'write'}, data=(batch,)) arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays)) grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays)) np_data = np.random.randint(low=0, high=in_dim, size=batch) @@ -890,7 +892,7 @@ def test_binary_op_duplicate_input(): out_grad = mx.nd.empty(shape) out_grad[:] = 1 square = data * data - exe_square = square.bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_square = square._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) exe_square.forward(is_train=True) assert_almost_equal(exe_square.outputs[0], data_tmp * data_tmp) exe_square.backward(out_grad) @@ -908,7 +910,7 @@ def test_sign(): arr_grad[:]=3 test = mx.sym.sign(data) - exe_test = test.bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_test = test._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = np.sign(data_tmp) @@ -933,7 +935,7 @@ def test_round_ceil_floor(): arr_grad[:]= 2 test = mx.sym.round(data) + mx.sym.ceil(data) + mx.sym.floor(data) - exe_test = test.bind(default_context(), args=[arr_data]) + exe_test = test._bind(default_context(), args=[arr_data]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = np.round(data_tmp) + np.ceil(data_tmp) + np.floor(data_tmp) @@ -947,7 +949,7 @@ def test_trunc(): data = mx.symbol.Variable('data') test = mx.sym.trunc(data) - exe_test = test.bind(default_context(), args=[arr_data]) + exe_test = test._bind(default_context(), args=[arr_data]) exe_test.forward(is_train=True) out = exe_test.outputs[0] # 'trunc' is sensitive to the precision of the calculation. Force numpy to match mxnet's float32. 
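[The float32 caveat in the comment closing the hunk above is easy to reproduce standalone: a double that is not yet an integer can round up to exactly 1.0 when narrowed to float32, flipping the result of trunc. Illustrative values only:

import numpy as np

x64 = np.array([0.999999975], dtype=np.float64)
x32 = x64.astype(np.float32)   # nearest float32 is exactly 1.0
print(np.trunc(x64))           # [0.]
print(np.trunc(x32))           # [1.]]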
@@ -968,7 +970,7 @@ def test_rsqrt_cos_sin(): arr_grad[:]=3 test = mx.sym.rsqrt(data) + mx.sym.cos(data) + mx.sym.sin(data) - exe_test = test.bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_test = test._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = 1/ np.sqrt(data_tmp) + np.cos(data_tmp) + np.sin(data_tmp) @@ -999,7 +1001,7 @@ def test_maximum_minimum(): arr_grad2 = mx.nd.empty(shape) test = mx.sym.maximum(data1,data2) + mx.sym.minimum(data1,data2) - exe_test = test.bind(default_context(), args=[arr_data1,arr_data2], args_grad=[arr_grad1,arr_grad2]) + exe_test = test._bind(default_context(), args=[arr_data1,arr_data2], args_grad=[arr_grad1,arr_grad2]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = np.maximum(data_tmp1,data_tmp2) + np.minimum(data_tmp1,data_tmp2) @@ -1031,7 +1033,7 @@ def test_maximum_minimum_scalar(): arr_grad1 = mx.nd.empty(shape) test = mx.sym.maximum(data1,3) + mx.sym.maximum(9,data1) + mx.sym.minimum(5,data1) + mx.sym.minimum(data1,4) - exe_test = test.bind(default_context(), args=[arr_data1], args_grad=[arr_grad1]) + exe_test = test._bind(default_context(), args=[arr_data1], args_grad=[arr_grad1]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = np.maximum(data_tmp1,3) + np.maximum(9,data_tmp1) + np.minimum(5,data_tmp1) + np.minimum(data_tmp1,4) @@ -1063,7 +1065,7 @@ def test_abs(): arr_grad[:]=3 test = mx.sym.abs(data) - exe_test = test.bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_test = test._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = abs(data_tmp) @@ -1104,7 +1106,7 @@ def check_deconvolution_forward_backward(input_shape, num_filter, kernel, stride (num_filter, input_shape[1]) + kernel, ctx=mx.cpu()).copyto(default_context()) args_grad = [mx.nd.empty(s) for s in arg_shapes] - exe = deconv.bind(default_context(), args=args, args_grad=args_grad) + exe = deconv._bind(default_context(), args=args, args_grad=args_grad) exe.forward(is_train=True) out = exe.outputs[0] exe.backward(out_grad) @@ -1112,7 +1114,7 @@ def check_deconvolution_forward_backward(input_shape, num_filter, kernel, stride args_grad_addto_npy = [np.random.normal(size=s) for s in arg_shapes] args_grad_addto = [mx.nd.array(ele) for ele in args_grad_addto_npy] - exe = deconv.bind(default_context(), args=args, args_grad=args_grad_addto, grad_req="add") + exe = deconv._bind(default_context(), args=args, args_grad=args_grad_addto, grad_req="add") exe.forward(is_train=True) out = exe.outputs[0].asnumpy() exe.backward(out_grad) @@ -1146,7 +1148,7 @@ def check_deconvolution_gradient(input_shape, num_filter, pad): mx.random.normal(0, 1,(num_filter, input_shape[1]) + kernel, ctx=mx.cpu()).copyto(default_context()) conv_args_grad = [mx.nd.zeros(conv_data.shape), mx.nd.zeros((num_filter, input_shape[1]) + kernel)] - exe_conv = conv.bind(default_context(), args=conv_args, args_grad=conv_args_grad) + exe_conv = conv._bind(default_context(), args=conv_args, args_grad=conv_args_grad) exe_conv.forward(is_train=True) conv_out_grad = mx.random.normal(0, 2, exe_conv.outputs[0].shape, ctx=mx.cpu()).copyto(default_context()) exe_conv.backward(conv_out_grad) @@ -1161,13 +1163,13 @@ def check_deconvolution_gradient(input_shape, num_filter, pad): np.random.normal(size=(num_filter, input_shape[1]) + kernel)] deconv_addto_args_grad = [mx.nd.array(deconv_addto_args_grad_npy[0]), 
mx.nd.array(deconv_addto_args_grad_npy[1])] - exe_deconv = deconv.bind(default_context(), args=deconv_args, args_grad=deconv_args_grad) + exe_deconv = deconv._bind(default_context(), args=deconv_args, args_grad=deconv_args_grad) exe_deconv.forward(is_train=True) deconv_out_grad = conv_data[:] exe_deconv.backward(deconv_out_grad) assert_almost_equal(conv_args_grad[1], deconv_args_grad[1], rtol=1e-3, atol=1e-2) # Test AddTo - exe_deconv_addto = deconv.bind(default_context(), args=deconv_args, + exe_deconv_addto = deconv._bind(default_context(), args=deconv_args, args_grad=deconv_addto_args_grad, grad_req="add") exe_deconv_addto.forward(is_train=True) @@ -1301,7 +1303,7 @@ def check_deconvolution_forward_with_bias(shape=(1, 16, 5, 5), num_filter=32, nu w = mx.sym.Variable('w') input_data = mx.random.uniform(-5, 5, shape, ctx=mx.cpu()) y = mx.sym.Deconvolution(data=x, weight=w, num_filter=num_filter, num_group=num_group, kernel=kernel, no_bias=False, pad=pad) - exe = y.simple_bind(ctx=mx.cpu(), x=shape, grad_req='null') + exe = y._simple_bind(ctx=mx.cpu(), x=shape, grad_req='null') exe.arg_arrays[0][:] = np.random.normal(size=exe.arg_arrays[0].shape) exe.arg_arrays[1][:] = np.random.normal(size=exe.arg_arrays[1].shape) @@ -1320,7 +1322,7 @@ def check_nearest_upsampling_with_shape(shapes, scale, root_scale): arr_grad = {'arg_%d'%i: mx.nd.zeros(shape) for i, shape in zip(range(len(shapes)), shapes)} up = mx.sym.UpSampling(*[mx.sym.Variable('arg_%d'%i) for i in range(len(shapes))], sample_type='nearest', scale=root_scale) - exe = up.bind(default_context(), args=arr, args_grad=arr_grad) + exe = up._bind(default_context(), args=arr, args_grad=arr_grad) exe.forward(is_train=True) exe.backward(exe.outputs) for k in range(len(shapes)): @@ -1348,7 +1350,7 @@ def _init_bilinear(arr, f): 'weight': mx.nd.array(_init_bilinear(mx.ndarray.empty(arg_shapes[1]).asnumpy(), root_scale))} arr_grad = [mx.nd.empty(s) for s in arg_shapes] - exe = up.bind(default_context(), args=arr, args_grad=arr_grad) + exe = up._bind(default_context(), args=arr, args_grad=arr_grad) exe.forward(is_train=True) out = exe.outputs[0].asnumpy() exe.backward(exe.outputs) @@ -1696,7 +1698,7 @@ def np_groupnorm_grad(ograd, data, gamma, beta, mean, std, num_groups, eps): np_beta.astype(dtype), np_mean, np_std, num_groups, eps) - check_symbolic_backward(mx_sym, [mx_data, mx_gamma, mx_beta], [mx.nd.array(np_ograd)], + check_symbolic_backward(mx_sym, [mx_data, mx_gamma, mx_beta], [mx.nd.array(np_ograd, dtype=np_ograd.dtype)], [np_data_grad, np_gamma_grad, np_beta_grad], rtol=1e-2 if dtype == np.float16 else 1e-3, atol=5e-2 if dtype == np.float16 else 1e-4, dtype=dtype) @@ -1721,8 +1723,8 @@ def test_convolution_grouping(): num_filter=num_filter//num_group, kernel=kernel) for i in range(num_group)]) - exe1 = y1.simple_bind(default_context(), x=shape) - exe2 = y2.simple_bind(default_context(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,)) + exe1 = y1._simple_bind(default_context(), x=shape) + exe2 = y2._simple_bind(default_context(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,)) for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays): arr1[:] = np.float32(np.random.normal(size=arr1.shape)) arr2[:] = arr1 @@ -1765,8 +1767,8 @@ def test_depthwise_convolution(): for i in range(num_group)]) dev = default_context() - exe1 = y1.simple_bind(dev, x=shape) - exe2 = y2.simple_bind(dev, x=shape, w=(num_filter, shape[1]//num_group)+kernel, + exe1 = y1._simple_bind(dev, x=shape) + exe2 = 
y2._simple_bind(dev, x=shape, w=(num_filter, shape[1]//num_group)+kernel, b=(num_filter,)) for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays): arr1[:] = np.random.normal(size=arr1.shape) @@ -1829,7 +1831,7 @@ def test_convolution_independent_gradients(): grad_req1 = [req_kind] * 3 grad_req1 = dict(zip(var_names, grad_req1)) - exe1 = conv.bind(ctx, args1, args_grad=grad1, grad_req=grad_req1) + exe1 = conv._bind(ctx, args1, args_grad=grad1, grad_req=grad_req1) exe1.forward(is_train=True) exe1.backward(exe1.outputs[0]) @@ -1841,7 +1843,7 @@ def test_convolution_independent_gradients(): 'w': mx.nd.zeros(shape=w_shape, ctx=ctx), 'b': mx.nd.zeros(shape=(num_filter, ), ctx=ctx) if not no_bias else None} grad_req2 = {"x": x_req, "w": w_req, "b": b_req} - exe2 = conv.bind(ctx, args2, args_grad=grad2, grad_req=grad_req2) + exe2 = conv._bind(ctx, args2, args_grad=grad2, grad_req=grad_req2) exe2.forward(is_train=True) np.testing.assert_allclose(exe1.outputs[0].asnumpy(), @@ -1940,7 +1942,7 @@ def check_binary_op_forward(symbol, baseline, gen_data, rtol=1e-3, atol=1e-5, mx sample_num = 200 for i in range(sample_num): d = gen_data(i) - y = symbol.bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}) + y = symbol._bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}) y.forward(is_train=True) y = y.outputs[0].asnumpy() x = baseline(d[0], d[1]).astype(y.dtype) @@ -2004,10 +2006,10 @@ def reduce_op(shape, x): x_2 = reduce_op(d[1].shape, baseline_grad2) y_1 = mx.nd.empty(d[0].shape) y_2 = mx.nd.empty(d[1].shape) - y = symbol.bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}, + y = symbol._bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}, args_grad=[y_1, y_2]) - y.forward(is_train=True) - y.backward([mx.nd.array(out)]) + o = y.forward(is_train=True) + y.backward([mx.nd.array(out, dtype=o[0].dtype)]) assert_allclose(y_1.asnumpy(), x_1, rtol=rtol, atol=atol) assert_allclose(y_2.asnumpy(), x_2, rtol=rtol, atol=atol) @@ -2208,7 +2210,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), in_img = mx.symbol.Variable('input') net = mx.symbol.Convolution(in_img, num_filter=1,kernel=kernel_shape, dilate=dil, no_bias="true", name='test_convolution') net.list_arguments() - be = net.bind(default_context(), args={ 'input' : spike_img, 'test_convolution_weight' : kernel_weights}, + be = net._bind(default_context(), args={ 'input' : spike_img, 'test_convolution_weight' : kernel_weights}, args_grad={'input' : spike_img2, 'test_convolution_weight' : kernel_weights2 } ) be.forward(True) out_o = be.outputs[0].asnumpy() @@ -2227,7 +2229,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), # Now check whether the input gradient was computed correctly input_grad = mx.nd.array(vgrad) - be = net.bind(default_context(), args={ 'input' : input_grad, 'test_convolution_weight' : kernel_weights}) + be = net._bind(default_context(), args={ 'input' : input_grad, 'test_convolution_weight' : kernel_weights}) be.forward(True) out_o = be.outputs[0].asnumpy() assert_allclose(out_o[center],np.prod(kernel_shape),atol=1e-5) @@ -2240,7 +2242,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), white_in = mx.nd.ones(shape=data_shape) white_in2 = mx.nd.ones(shape=data_shape) - be = net.bind(default_context(), args={ 'input' : white_in, 'test_convolution_weight' : rnd_kernel}, + be = net._bind(default_context(), args={ 'input' : white_in, 
'test_convolution_weight' : rnd_kernel}, args_grad={'input' : white_in2, 'test_convolution_weight' : rnd_kernel2 } ) be.forward(True) @@ -2250,7 +2252,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), dkernel = mx.nd.array(rnd_kernel_s + kernel_gradient) - be = net.bind(default_context(), args={ 'input' : white_in, 'test_convolution_weight' : dkernel}) + be = net._bind(default_context(), args={ 'input' : white_in, 'test_convolution_weight' : dkernel}) be.forward(True) out = be.outputs[0].asnumpy() @@ -2316,7 +2318,7 @@ def test_reshape_new(src_shape, shape_args, reverse, dst_shape): str(dst_shape), str(output_shape[0])) dat_npy = np.random.rand(*src_shape) grad_npy = np.random.rand(*dst_shape) - exe = net.simple_bind(default_context(), data=src_shape) + exe = net._simple_bind(default_context(), data=src_shape) exe.arg_dict['data'][:] = dat_npy exe.forward(is_train=True) assert np.square(exe.outputs[0].asnumpy() - dat_npy.reshape(dst_shape)).mean() < 1E-7, \ @@ -2354,7 +2356,7 @@ def test_reshape_old(): # Test for Flatten data = mx.sym.Variable("data") net = mx.sym.Flatten(data) - exe = net.simple_bind(ctx=default_context(), data=(5, 4, 3, 7)) + exe = net._simple_bind(ctx=default_context(), data=(5, 4, 3, 7)) data_npy = np.random.normal(size=(5, 4, 3, 7)) out_grad_npy = np.random.normal(size=(5, 4 * 3 * 7)) outputs = exe.forward(is_train=True, data=data_npy)[0].asnumpy() @@ -2381,7 +2383,7 @@ def test_reshape_like_new(lhs_shape, rhs_shape, lbeg, lend, rbeg, rend, dst_shap rhs_npy = np.random.rand(*rhs_shape) grad_npy = np.random.rand(*dst_shape) - exe = net.simple_bind(default_context(), lhs=lhs_shape, rhs=rhs_shape) + exe = net._simple_bind(default_context(), lhs=lhs_shape, rhs=rhs_shape) exe.arg_dict['lhs'][:] = lhs_npy exe.arg_dict['rhs'][:] = rhs_npy exe.forward(is_train=True) @@ -2472,7 +2474,7 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym, outdata=sum_groundtruth, axis=axes, keepdims=keepdims, keepdim_shape=keepdim_shape) - net = b.bind(default_context(), args={'a': mx.nd.array(dat_npy)}, + net = b._bind(default_context(), args={'a': mx.nd.array(dat_npy)}, args_grad={'a': grad_nd}) net.forward(is_train=True) @@ -2556,7 +2558,7 @@ def test_broadcasting_ele(sym_bcast): outgrad_npy = np.random.rand(*target_shape) grad_groundtruth = np_reduce(outgrad_npy, axis=axis, keepdims=True, numpy_reduce_func=np.sum) - net = sym_bcast.bind(default_context(), args={'a': mx.nd.array(dat_npy)}, + net = sym_bcast._bind(default_context(), args={'a': mx.nd.array(dat_npy)}, args_grad={'a': grad_nd}) net.forward(is_train=True) assert (net.outputs[0].shape == target_shape).all() @@ -2705,7 +2707,7 @@ def test_slice_axis(): Y = mx.symbol.slice_axis(data=X, axis=t, begin=b, end=e) xgrad = mx.nd.empty(x.shape) - exec1 = Y.bind(default_context(), args = [x], args_grad = {'X': xgrad}) + exec1 = Y._bind(default_context(), args = [x], args_grad = {'X': xgrad}) exec1.forward(is_train=True) y = exec1.outputs[0] assert_allclose(x.asnumpy()[idx], y.asnumpy()) @@ -2716,7 +2718,7 @@ def test_slice_axis(): assert_allclose(xx, xgrad.asnumpy()) x_grad_npy = np.random.normal(size=x.shape) xgrad = mx.nd.array(x_grad_npy) - exec2 = Y.bind(default_context(), args=[x], args_grad={'X': xgrad}, grad_req="add") + exec2 = Y._bind(default_context(), args=[x], args_grad={'X': xgrad}, grad_req="add") exec2.forward(is_train=True) exec2.backward([exec2.outputs[0]]) xx = np.zeros(shape=x.shape, dtype=np.float32) @@ -2752,7 +2754,7 @@ def test_slice_like(): xgrad = 
mx.nd.empty(x.shape) xgrad1 = mx.nd.empty(x1.shape) - exec1 = Y.bind(default_context(), args = [x, x1], + exec1 = Y._bind(default_context(), args = [x, x1], args_grad = {'X': xgrad, 'X1': xgrad1}) exec1.forward(is_train=True) y = exec1.outputs[0] @@ -2842,7 +2844,7 @@ def test_stn(): args['loc_fc_weight'] = mx.nd.zeros((6, num_filter*data_shape[2]*data_shape[3]), ctx=dev) args['loc_fc_bias'] = mx.nd.array([0.5, 0, 0, 0, 0.5, 0], ctx=dev) grad_grad = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes] - exe = stn.bind(dev, args=args, args_grad=grad_grad) + exe = stn._bind(dev, args=args, args_grad=grad_grad) exe.forward(is_train=True) out = exe.outputs[0] # check forward @@ -2887,7 +2889,7 @@ def test_stn_valid_sampling(): 'data': mx.nd.array(np.zeros_like(data_array)), 'loc': mx.nd.array(np.zeros_like(loc_array)) } - executor = stn.bind( + executor = stn._bind( ctx=default_context(), args={'data': mx.nd.array(data_array), 'loc': mx.nd.array(loc_array)}, @@ -2935,7 +2937,7 @@ def test_dot(): a = mx.sym.Variable('a', dtype=data_type) b = mx.sym.Variable('b', dtype=data_type) c = mx.sym.dot(a, b) - exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape) + exe = c._simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape) outputs = exe.forward(is_train=True, a=a_npy, b=b_npy) assert_almost_equal(outputs[0], c_npy, rtol=tol, atol=tol) exe.backward(out_grads=[mx.nd.array(ograd_npy, mx.cpu()).astype(data_type)]) @@ -3017,9 +3019,9 @@ def test_batch_dot(): b_npy = np.transpose(b_npy, axes=(0, 2, 1)) bgrad_npy = np.transpose(bgrad_npy, axes=(0, 2, 1)) b_init_grad_npy = np.transpose(b_init_grad_npy, axes=(0, 2, 1)) - exe = c.simple_bind(ctx=ctx, + exe = c._simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape, grad_req='write') - exe_add = c.simple_bind(ctx=ctx, + exe_add = c._simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape, grad_req='add') exe_add.grad_dict['a'][:] = a_init_grad_npy exe_add.grad_dict['b'][:] = b_init_grad_npy @@ -3027,7 +3029,7 @@ def test_batch_dot(): assert_almost_equal(outputs[0], c_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) - exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)]) + exe.backward(out_grads=[mx.nd.array(ograd_npy, dtype=outputs[0].dtype, ctx=exe._ctx)]) assert_almost_equal(exe.grad_dict['a'], agrad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) @@ -3035,7 +3037,7 @@ def test_batch_dot(): rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) exe_add.forward(is_train=True, a=a_npy, b=b_npy) - exe_add.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)]) + exe_add.backward(out_grads=[mx.nd.array(ograd_npy, dtype=exe_add.outputs[0].dtype, ctx=exe._ctx)]) assert_almost_equal(exe_add.grad_dict['a'], agrad_npy + a_init_grad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, @@ -3162,7 +3164,7 @@ def unittest_correlation(data_shape,kernel_size,max_displacement,stride1,stride2 net1 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply) net2 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply ) - exe1 = net1.simple_bind(default_context(),img1=img1.shape,img2=img1.shape) + exe1 = net1._simple_bind(default_context(),img1=img1.shape,img2=img1.shape) exe1.arg_dict['img1'][:] = img1 exe1.arg_dict['img2'][:] = img2 @@ -3252,7 +3254,7 @@ def check_pad_with_shape(shape, xpu, pad_width, mode, dtype="float64"): np_out = np.pad(x.asnumpy(), 
pad_grouped, mode) # mxnet result grad = mx.nd.empty(shape, ctx = xpu, dtype=dtype) - exec1 = Y.bind(xpu, args = [x], args_grad = {'X': grad}) + exec1 = Y._bind(xpu, args = [x], args_grad = {'X': grad}) exec1.forward(is_train=True) out = exec1.outputs[0] # compare numpy + mxnet @@ -3309,7 +3311,7 @@ def check_instance_norm_with_shape(shape, xpu): beta = mx.random.normal(0, 1, shape[1], ctx=mx.cpu()).copyto(xpu) np_out = np_instance_norm(x.asnumpy(), gamma.asnumpy(), beta.asnumpy(), eps) - exec1 = Y.bind(xpu, args = {'X':x, 'G':gamma, 'B':beta}) + exec1 = Y._bind(xpu, args = {'X':x, 'G':gamma, 'B':beta}) exec1.forward(is_train=False) out = exec1.outputs[0] assert_almost_equal(out, np_out, rtol=1e-4, atol=1e-4) @@ -3350,7 +3352,7 @@ def check_l2_normalization(in_shape, mode, dtype, norm_eps=1e-10): np_out = np.multiply(in_data, np_norm.reshape(s)) else: raise RuntimeError('Unknown l2 normalization mode') - exe = out.simple_bind(ctx=ctx, data=in_data.shape) + exe = out._simple_bind(ctx=ctx, data=in_data.shape) output = exe.forward(is_train=True, data=in_data) # compare numpy + mxnet assert_almost_equal(exe.outputs[0], np_out, rtol=1e-2 if dtype is 'float16' else 1e-5, atol=1e-5) @@ -3411,7 +3413,7 @@ def npy_layer_norm_grad(data, gamma, out_grad, axis, eps): gamma_s = mx.symbol.Variable('gamma') beta_s = mx.symbol.Variable('beta') out_s = mx.symbol.LayerNorm(data=data_s, gamma=gamma_s, beta=beta_s, axis=axis, eps=eps) - exe = out_s.simple_bind(ctx, data=in_shape) + exe = out_s._simple_bind(ctx, data=in_shape) exe.arg_dict['data'][:] = data exe.arg_dict['gamma'][:] = gamma exe.arg_dict['beta'][:] = beta @@ -3428,7 +3430,7 @@ def npy_layer_norm_grad(data, gamma, out_grad, axis, eps): if npy_grad_check: # Test for grad_req = write out_grad = np.random.normal(0, 1, in_shape).astype(dtype) - exe = out_s.simple_bind(ctx, data=in_shape, grad_req='write') + exe = out_s._simple_bind(ctx, data=in_shape, grad_req='write') exe.arg_dict['data'][:] = data exe.arg_dict['gamma'][:] = gamma exe.arg_dict['beta'][:] = beta @@ -3445,7 +3447,7 @@ def npy_layer_norm_grad(data, gamma, out_grad, axis, eps): init_data_grad = np.random.normal(0, 1, in_shape).astype(dtype) init_gamma_grad = np.random.normal(0, 1, (in_shape[axis],)).astype(dtype) init_beta_grad = np.random.normal(0, 1, (in_shape[axis],)).astype(dtype) - exe = out_s.simple_bind(ctx, data=in_shape, grad_req='add') + exe = out_s._simple_bind(ctx, data=in_shape, grad_req='add') exe.arg_dict['data'][:] = data exe.arg_dict['gamma'][:] = gamma exe.arg_dict['beta'][:] = beta @@ -3711,9 +3713,9 @@ def test_wrapper(arr, xpu, sequence_length=None, use_sequence_length=False): rev = mx.sym.SequenceReverse(data=seq, sequence_length=seq_len, use_sequence_length=use_sequence_length) # MxNet symbol execution if sequence_length: - bound = rev.bind(xpu, {'seq': mx.nd.array(arr), 'seq_len': mx.nd.array(sequence_length)}) + bound = rev._bind(xpu, {'seq': mx.nd.array(arr), 'seq_len': mx.nd.array(sequence_length)}) else: - bound = rev.bind(xpu, {'seq': mx.nd.array(arr)}) + bound = rev._bind(xpu, {'seq': mx.nd.array(arr)}) fwd = bound.forward() return fwd[0].asnumpy() @@ -3754,7 +3756,7 @@ def mathematical_core_binary(name, arr_grad2 = mx.nd.empty(shape) test = forward_mxnet_call(data1, data2) - exe_test = test.bind(default_context(), args=[arr_data1, arr_data2], args_grad=[arr_grad1, arr_grad2]) + exe_test = test._bind(default_context(), args=[arr_data1, arr_data2], args_grad=[arr_grad1, arr_grad2]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = 
forward_numpy_call(data_tmp1, data_tmp2) @@ -3784,7 +3786,7 @@ def mathematical_core(name, forward_mxnet_call, forward_numpy_call, backward_num arr_grad[:] = 3 test = forward_mxnet_call(data) - exe_test = test.bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_test = test._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = forward_numpy_call(data_tmp) @@ -3832,7 +3834,7 @@ def rounding(name, forward_mxnet_call, forward_numpy_call, data_init=5., grad_in arr_data = mx.nd.array(data_tmp) test = forward_mxnet_call(data) - exe_test = test.bind(default_context(), args=[arr_data]) + exe_test = test._bind(default_context(), args=[arr_data]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = forward_numpy_call(data_tmp) @@ -3954,23 +3956,12 @@ def test_clip(): check_symbolic_backward(test, [data_tmp], [np.ones(shape)], [np.where(data_tmp <= 0.6, [1], [0]) * np.where(data_tmp >= -0.6, [1], [0])]) - # Test monitor on symbol using clip - - def simple_callback(name, arr): - pass - - exe = test.simple_bind(ctx=mx.current_context(), data=shape) - exe.set_monitor_callback(simple_callback, monitor_all=True) - exe.forward(is_train=True) - exe.backward(out_grads=mx.nd.ones(shape)) - mx.nd.waitall() - @with_seed() def test_init(): def test_basic_val_init(sym_func, np_func, shape, dtype): x = sym_func(shape=shape, dtype=dtype) - exe = x.bind(default_context(), args=[], args_grad=[]) + exe = x._bind(default_context(), args=[], args_grad=[]) exe.forward(is_train=True) assert_almost_equal(exe.outputs[0], np_func(shape=shape, dtype=dtype)) assert exe.outputs[0].asnumpy().dtype == dtype @@ -3994,7 +3985,7 @@ def test_arange(): def test_arange_inferstop(): s = mx.sym.arange(start=0, stop=None, infer_range=True) s = mx.sym.elemwise_add(s, mx.sym.zeros(shape=[5])) - exe = s.bind(ctx=mx.cpu(), args={}) + exe = s._bind(ctx=mx.cpu(), args={}) exe.forward() assert_almost_equal(exe.outputs[0], np.array([0,1,2,3,4])) @@ -4163,7 +4154,7 @@ def get_large_matrix(): def test_blockgrad(): a = mx.sym.Variable('a') b = mx.sym.BlockGrad(a) - exe = b.simple_bind(ctx=default_context(), a=(10, 10)) + exe = b._simple_bind(ctx=default_context(), a=(10, 10)) a_npy = np.random.rand(10, 10) exe.forward(is_train=True, a=a_npy) assert_almost_equal(exe.outputs[0], a_npy) @@ -4243,8 +4234,8 @@ def grad_helper(grad_in, axis, idx): idx = mx.sym.Variable('indices') idx = mx.sym.BlockGrad(idx) result = mx.sym.take(a=data, indices=idx, axis=axis, mode=mode) - exe = result.simple_bind(default_context(), a=data_shape, - indices=idx_shape, axis=axis, mode=mode) + exe = result._simple_bind(default_context(), a=data_shape, + indices=idx_shape) data_real = np.random.normal(size=data_shape).astype('float32') if out_of_range: idx_real = np.random.randint(low=-data_shape[axis], high=data_shape[axis], size=idx_shape) @@ -4289,7 +4280,7 @@ def test_grid_generator(): for target_shape in test_case: affine_matrix = mx.sym.Variable('affine') grid = mx.sym.GridGenerator(data=affine_matrix,transform_type='affine', target_shape=target_shape) - exe = grid.simple_bind(ctx=default_context(), affine=(1,6), grad_req='write') + exe = grid._simple_bind(ctx=default_context(), affine=(1,6), grad_req='write') # check forward exe.arg_dict['affine'][:] = np.array([[1.0,0,0,0,1.0,0]]) @@ -4311,7 +4302,7 @@ def test_grid_generator(): grad_est = np.dot(out_grad[0].reshape(2,target_shape[0]*target_shape[1]),tmp.T).reshape(1,6) assert_almost_equal(exe.grad_dict['affine'], 
grad_est, rtol=1e-3, atol=1e-5) # check addto - exe = grid.simple_bind(ctx=default_context(), affine=(1,6), grad_req='add') + exe = grid._simple_bind(ctx=default_context(), affine=(1,6), grad_req='add') grid_grad_npy = np.random.normal(size=exe.grad_dict['affine'].shape) exe.grad_dict['affine'][:] = grid_grad_npy exe.arg_dict['affine'][:] = np.array([[1.0, 0, 0, 0, 1.0, 0]]) @@ -4324,7 +4315,7 @@ def test_grid_generator(): for target_shape in test_case: flow = mx.sym.Variable('flow') grid = mx.sym.GridGenerator(data=flow,transform_type='warp', target_shape=target_shape) - exe = grid.simple_bind(ctx=default_context(), flow=(1,2)+target_shape, grad_req='write') + exe = grid._simple_bind(ctx=default_context(), flow=(1,2)+target_shape, grad_req='write') # check forward exe.arg_dict['flow'][:] = np.ones((1,2)+target_shape) exe.forward(is_train=True) @@ -4342,7 +4333,7 @@ def test_grid_generator(): grad_est[0,1] = out_grad[0,1] / ((target_shape[0]-1.0) / 2.0) assert_almost_equal(exe.grad_dict['flow'], grad_est, rtol=1e-3) # check addto - exe_add = grid.simple_bind(ctx=default_context(), flow=(1, 2) + target_shape, grad_req='add') + exe_add = grid._simple_bind(ctx=default_context(), flow=(1, 2) + target_shape, grad_req='add') flow_grad_npy = np.random.normal(size=exe_add.grad_dict['flow'].shape) exe_add.arg_dict['flow'][:] = np.ones((1, 2) + target_shape) exe_add.grad_dict['flow'][:] = flow_grad_npy @@ -4368,12 +4359,12 @@ def test_cast(): for dsttype in [np.float32, np.int32, np.float16]: x = mx.sym.Variable('x', dtype=srctype) y = mx.sym.Cast(x, dtype=dsttype) - exe = y.simple_bind(ctx=default_context(), x=(10, 10)) + exe = y._simple_bind(ctx=default_context(), x=(10, 10)) assert exe.arg_arrays[0].dtype == srctype - assert exe.outputs[0].dtype == dsttype X = np.random.uniform(-10, 10, size=(10, 10)) exe.arg_arrays[0][:] = X exe.forward(is_train=True) + assert exe.outputs[0].dtype == dsttype exe.backward(mx.nd.array(X, dtype=dsttype, ctx=default_context())) assert_almost_equal(exe.outputs[0], X.astype(srctype).astype(dsttype), rtol=1e-3, atol=1e-5) assert_almost_equal(exe.grad_arrays[0], X.astype(dsttype).astype(srctype), rtol=1e-3, atol=1e-5) @@ -4410,10 +4401,10 @@ def check_cast(op, input_np, expected_output): x = mx.sym.Variable('x', dtype=np.float32) sym = op(x, dtype=np.float16) ctx = default_context() - exe = sym.bind(ctx, {'x': mx.nd.array(input_np, dtype=np.float32, ctx=ctx)}) + exe = sym._bind(ctx, {'x': mx.nd.array(input_np, dtype=np.float32, ctx=ctx)}) assert exe.arg_arrays[0].dtype == np.float32 - assert exe.outputs[0].dtype == np.float16 exe.forward(is_train=True) + assert exe.outputs[0].dtype == np.float16 sym_output = exe.outputs[0].asnumpy() for fp32_val, model_fp16_val, np_fp16_val in zip(input_np, sym_output, expected_output): assert (model_fp16_val == np_fp16_val) or \ @@ -4435,7 +4426,7 @@ def test_amp_multicast(): z = mx.sym.Variable('z', dtype=np.float16) ctx = default_context() res = mx.sym.amp_multicast(x, y, z, num_outputs=3) - exe = res.bind(ctx, {'x': mx.nd.random.uniform(shape=(3, 3), dtype=np.float16, ctx=ctx), + exe = res._bind(ctx, {'x': mx.nd.random.uniform(shape=(3, 3), dtype=np.float16, ctx=ctx), 'y': mx.nd.random.uniform(shape=(3, 3), dtype=np.float32, ctx=ctx), 'z': mx.nd.random.uniform(shape=(3, 3), dtype=np.float16, ctx=ctx)}) exe.forward(is_train=True) @@ -4450,7 +4441,7 @@ def check_amp_multicast(input_np, expected_output): z = mx.sym.Variable('z', dtype=np.float16) ctx = default_context() res = mx.sym.amp_multicast(x, y, z, num_outputs=3) - exe = 
res.bind(ctx, {'x': mx.nd.array(input_np, dtype=np.float16, ctx=ctx), + exe = res._bind(ctx, {'x': mx.nd.array(input_np, dtype=np.float16, ctx=ctx), 'y': mx.nd.array(input_np, dtype=np.float32, ctx=ctx), 'z': mx.nd.array(input_np, dtype=np.float16, ctx=ctx)}) exe.forward(is_train=True) @@ -4474,21 +4465,21 @@ def test_all_finite(): inf_arr = mx.nd.array([[np.inf, np.inf]]) z = mx.sym.all_finite(data) ctx = default_context() - exe = z.bind(ctx, {'data': inf_arr}) + exe = z._bind(ctx, {'data': inf_arr}) exe.forward(is_train=False) sym_output = exe.outputs[0].asnumpy() assert sym_output[0] == 0 - exe = z.bind(ctx, {'data': finite_arr}) + exe = z._bind(ctx, {'data': finite_arr}) exe.forward(is_train=False) sym_output = exe.outputs[0].asnumpy() assert sym_output[0] == 1 z = mx.sym.multi_all_finite(data, data2, num_arrays=2) - exe = z.bind(ctx, {'data': finite_arr, 'data2': inf_arr}) + exe = z._bind(ctx, {'data': finite_arr, 'data2': inf_arr}) exe.forward(is_train=False) sym_output = exe.outputs[0].asnumpy() assert sym_output[0] == 0 z = mx.sym.multi_all_finite(data, data2, num_arrays=2) - exe = z.bind(ctx, {'data': finite_arr, 'data2': finite_arr}) + exe = z._bind(ctx, {'data': finite_arr, 'data2': finite_arr}) exe.forward(is_train=False) sym_output = exe.outputs[0].asnumpy() assert sym_output[0] == 1 @@ -4525,7 +4516,7 @@ def test_repeat_backward(axis): arr_grad = mx.nd.empty(shape) repeats = 2 test = mx.sym.repeat(data, repeats=repeats, axis=axis) - exe = test.bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) + exe = test._bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) npout_grad = np.random.randint(0, 10, n1 * n2 * repeats) if axis == 0: npout_grad = npout_grad.reshape(n1 * repeats, n2) @@ -4637,7 +4628,7 @@ def test_tile_backward(): reps2 = 2 reps = (reps1, reps2) test = mx.sym.tile(data, reps=reps) - exe = test.bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) + exe = test._bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) npout_grad = np.random.randint(0, 10, n1 * n2 * reps1 * reps2).reshape(n1 * reps1, n2 * reps2) out_grad = mx.nd.array(npout_grad) exe.backward(out_grad) @@ -4812,11 +4803,11 @@ def test_where_helper(shape, same_shape): condition = mx.sym.Variable('condition') x = mx.sym.Variable('x') y = mx.sym.Variable('y') - grad_in_mx = mx.nd.array(grad_in_np, dtype=np.int32) + grad_in_mx = mx.nd.array(grad_in_np, dtype=np.int) where_sym = mx.sym.where(condition, x, y) # test req='write' - where_exe_write = where_sym.simple_bind(ctx=default_context(), + where_exe_write = where_sym._simple_bind(ctx=default_context(), condition=condition_np.shape, x=x_np.shape, y=y_np.shape, grad_req='write') @@ -4825,7 +4816,7 @@ def test_where_helper(shape, same_shape): x=x_np, y=y_np) assert same(outputs[0].asnumpy(), out_expected) # test backward req='write' - where_exe_write.backward(grad_in_mx) + where_exe_write.backward(grad_in_mx.astype('float32')) assert same(where_exe_write.grad_dict['x'].asnumpy(), grad_expected_x) assert same(where_exe_write.grad_dict['y'].asnumpy(), grad_expected_y) assert same(where_exe_write.grad_dict['condition'].asnumpy(), grad_expected_cond) @@ -4833,7 +4824,7 @@ def test_where_helper(shape, same_shape): # test req='add' x_grad_init = np.random.randint(30, 40, np.prod(shape)).reshape(shape) y_grad_init = np.random.randint(40, 50, np.prod(shape)).reshape(shape) - where_exe_add = where_sym.simple_bind(ctx=default_context(), + where_exe_add = where_sym._simple_bind(ctx=default_context(), 
condition=condition_np.shape, x=x_np.shape, y=y_np.shape, grad_req='add') @@ -4843,7 +4834,8 @@ def test_where_helper(shape, same_shape): outputs = where_exe_add.forward(is_train=True, condition=condition_np, x=x_np, y=y_np) assert same(outputs[0].asnumpy(), out_expected) # test backward req='add' - where_exe_add.backward(grad_in_mx) + where_exe_add.backward(grad_in_mx.astype('float32')) + x_ograd = where_exe_add.grad_dict['x'].asnumpy() y_ograd = where_exe_add.grad_dict['y'].asnumpy() assert same(x_ograd, grad_expected_x+x_grad_init) @@ -4960,7 +4952,7 @@ def test_softmax_with_large_inputs(): def softmax_forward(input_data, true_output): data = mx.sym.Variable('data') out1 = data.softmax(axis=1) - exec1 = out1.bind(default_context(), args={'data': input_data}) + exec1 = out1._bind(default_context(), args={'data': input_data}) exec1.forward()[0].wait_to_read() ndarr = exec1.outputs[0][0][0][0] assert_almost_equal(ndarr, true_output, rtol=1e-5, atol=1e-5) @@ -5098,7 +5090,7 @@ def check_ctc_loss(acts, labels, loss_truth, contrib=False): ctc = mx.sym.ctc_loss(in_var, labels_var) acts_nd = mx.nd.array(acts, ctx=default_context()) labels_nd = mx.nd.array(labels, ctx=default_context()) - exe = ctc.bind(ctx=default_context(), args=[acts_nd, labels_nd]) + exe = ctc._bind(ctx=default_context(), args=[acts_nd, labels_nd]) # test forward with grad calc exe.forward(is_train=True) outTest = exe.outputs[0].copy() @@ -6619,7 +6611,7 @@ def check_dropout_ratio(ratio, shape, cudnn_off=True): # test dropout x = mx.sym.var('data') y = mx.sym.Dropout(x, p=ratio, cudnn_off=cudnn_off) - exe = y.simple_bind(ctx=default_context(), data=shape) + exe = y._simple_bind(ctx=default_context(), data=shape) if ratio == 1: max_value = float('nan') @@ -6651,13 +6643,13 @@ def check_dropout_ratio(ratio, shape, cudnn_off=True): exe.forward(is_train=False) assert (exe.outputs[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all() - exe.backward([mx.nd.ones(shape)], is_train=False) + exe.backward([mx.nd.ones(shape)]) assert (exe.grad_arrays[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all() # test permanent dropout x = mx.sym.var('data') y = mx.sym.Dropout(x, p=ratio, mode='always', cudnn_off=cudnn_off) - exe = y.simple_bind(ctx=default_context(), data=shape) + exe = y._simple_bind(ctx=default_context(), data=shape) exe.arg_arrays[0][:] = 1 exe.forward(is_train=True) @@ -6669,7 +6661,7 @@ def check_dropout_ratio(ratio, shape, cudnn_off=True): exe.forward(is_train=False) assert exe.outputs[0].asnumpy().max() == max_value assert exe.outputs[0].asnumpy().min() == min_value - exe.backward([mx.nd.ones(shape)], is_train=False) + exe.backward([mx.nd.ones(shape)]) assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all() def get_slice(x, axis, idx): @@ -7734,139 +7726,17 @@ def f(x, bins=10, range=None): assert_almost_equal(mx_bins2, np_bins2, rtol=1e-3, atol=1e-5) data = mx.sym.Variable("data") - bins = mx.sym.Variable("bins") histo1 = mx.sym.histogram(a=data, bins=bin_cnt, range=bin_range) histo2 = mx.sym.histogram(a=data, bins=bins) - executor1 = histo1.bind(ctx=default_context(), args={"data" : x}) + executor1 = histo1._bind(ctx=default_context(), args={"data" : x}) executor1.forward(is_train=False) assert_almost_equal(np_histo1, executor1.outputs[0].asnumpy(), 0, 0, ("EXPECTED_histo1", "FORWARD_histo1"), equal_nan=False) - executor2 = histo2.bind(ctx=default_context(), args={"data" : x, "bins" : mx_bins}) + executor2 = histo2._bind(ctx=default_context(), args={"data" : x, "bins" : mx_bins}) 
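# The histogram executors above pass concrete NDArrays through _bind() rather
# than shapes. A minimal sketch of the same pattern with hypothetical data and
# bin settings (relies on this file's module-level `import mxnet as mx`):
_data = mx.sym.Variable("data")
_histo = mx.sym.histogram(a=_data, bins=4, range=(0.0, 8.0))
_ex = _histo._bind(ctx=mx.cpu(), args={"data": mx.nd.array([1.0, 2.0, 2.0, 7.0])})
_ex.forward(is_train=False)  # per-bin counts; no gradient pass needed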
executor2.forward(is_train=False) assert_almost_equal(np_histo2, executor2.outputs[0].asnumpy(), 0, 0, ("EXPECTED_histo2", "FORWARD_histo2"), equal_nan=False) -def test_op_output_names_monitor(): - def check_name(op_sym, expected_names): - output_names = [] - - def get_output_names_callback(name, arr): - output_names.append(py_str(name)) - - op_exe = op_sym.simple_bind(ctx=mx.current_context(), grad_req='null') - op_exe.set_monitor_callback(get_output_names_callback, monitor_all=False) - try: - op_exe.forward() - mx.nd.waitall() - except mx.base.MXNetError: - # skip errors since test is to check output names - pass - for output_name, expected_name in zip(output_names, expected_names): - assert output_name == expected_name - is_windows = sys.platform.startswith('win') - if (is_windows): - # Windows doesn't support set environment variable on the fly, so disable it for now - pass - else: - # Disable subgraph in case subgraph will replace symbol - os.environ['MXNET_SUBGRAPH_BACKEND'] = "NONE" - data = mx.sym.Variable('data', shape=(10, 3, 10, 10)) - conv_sym = mx.sym.Convolution(data, kernel=(2, 2), num_filter=1, name='conv') - check_name(conv_sym, ['conv_output']) - - deconv_sym = mx.sym.Deconvolution(data, kernel=(2, 2), num_filter=1, name='deconv') - check_name(deconv_sym, ['deconv_output']) - - fc_sym = mx.sym.FullyConnected(data, num_hidden=10, name='fc') - check_name(fc_sym, ['fc_output']) - - lrn_sym = mx.sym.LRN(data, nsize=1, name='lrn') - check_name(lrn_sym, ['lrn_output', 'lrn_tmp_norm']) - - act_sym = mx.sym.Activation(data, act_type='relu', name='act') - check_name(act_sym, ['act_output']) - - cc_sym = mx.sym.concat(data, data, dim=0, name='concat') - check_name(cc_sym, ['concat_output']) - - sm_sym = mx.sym.softmax(data, name='softmax') - check_name(sm_sym, ['softmax_output']) - - sa_sym = mx.sym.SoftmaxActivation(data, name='softmax') - check_name(sa_sym, ['softmax_output']) - - us_sym = mx.sym.UpSampling(data, scale=2, sample_type='nearest', - name='upsampling') - check_name(us_sym, ['upsampling_output']) - - us_sym = mx.sym.Pooling(data, kernel=(2, 2), pool_type='avg', - name='pooling') - check_name(us_sym, ['pooling_output']) - del os.environ['MXNET_SUBGRAPH_BACKEND'] - -def test_op_all_names_monitor(): - def check_name(op_sym, expected_names): - output_names = [] - - def get_output_names_callback(name, arr): - output_names.append(py_str(name)) - - op_exe = op_sym.simple_bind(ctx=mx.current_context(), grad_req='null') - op_exe.set_monitor_callback(get_output_names_callback, monitor_all=True) - try: - op_exe.forward() - mx.nd.waitall() - except mx.base.MXNetError: - # skip errors since test is to check all names - pass - for output_name, expected_name in zip(output_names, expected_names): - assert output_name == expected_name - is_windows = sys.platform.startswith('win') - if (is_windows): - # Windows doesn't support set environment variable on the fly, so disable it for now - pass - else: - # Disable subgraph in case subgraph will replace symbol - os.environ['MXNET_SUBGRAPH_BACKEND'] = "NONE" - - data = mx.sym.Variable('data', shape=(10, 3, 10, 10)) - conv_sym = mx.sym.Convolution(data, kernel=(2, 2), num_filter=1, name='conv') - check_name(conv_sym, ['data', 'conv_data', 'conv_weight', 'conv_weight', 'conv_bias', 'conv_bias', 'conv_output']) - - deconv_sym = mx.sym.Deconvolution(data, kernel=(2, 2), num_filter=1, name='deconv') - check_name(deconv_sym, ['data', 'deconv_data', 'deconv_weight', 'deconv_weight', 'deconv_output']) - - fc_sym = mx.sym.FullyConnected(data, 
num_hidden=10, name='fc') - check_name(fc_sym, ['data', 'fc_data', 'fc_weight', 'fc_weight', 'fc_bias', 'fc_bias', 'fc_output']) - - lrn_sym = mx.sym.LRN(data, nsize=1, name='lrn') - check_name(lrn_sym, ['data', 'lrn_data', 'lrn_output', 'lrn_tmp_norm']) - - act_sym = mx.sym.Activation(data, act_type='relu', name='act') - check_name(act_sym, ['data', 'act_input0', 'act_output']) - - cc_sym = mx.sym.concat(data, data, dim=0, name='concat') - check_name(cc_sym, ['data', 'concat_arg0', 'data', 'concat_arg1', 'concat_output']) - - sm_sym = mx.sym.softmax(data, name='softmax') - check_name(sm_sym, ['data', 'softmax_data', 'softmax_output']) - - length = mx.sym.Variable("length", shape=(10, 10, 10)) - sm_sym = mx.sym.softmax(data, length, axis=1, use_length=True, name='softmax') - check_name(sm_sym, ['data', 'softmax_data', 'length', 'softmax_length', 'softmax_output']) - - sa_sym = mx.sym.SoftmaxActivation(data, name='softmax') - check_name(sa_sym, ['data', 'softmax_input0', 'softmax_output']) - - us_sym = mx.sym.UpSampling(data, scale=2, sample_type='nearest', - name='upsampling') - check_name(us_sym, ['data', 'upsampling_arg0', 'upsampling_output']) - - us_sym = mx.sym.Pooling(data, kernel=(2, 2), pool_type='avg', - name='pooling') - check_name(us_sym, ['data', 'pooling_data', 'pooling_output']) - del os.environ['MXNET_SUBGRAPH_BACKEND'] - @with_seed() @pytest.mark.skip(reason="test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/13915") def test_activation(): @@ -8879,7 +8749,7 @@ def test_transpose_infer_shape_back(): o2 = mx.sym.ones(shape=[-1,-1]) t = mx.sym.transpose(o2) b = o1 + t - x = b.bind(mx.cpu(), args={}) + x = b._bind(mx.cpu(), args={}) y = x.forward() assert(y[0].shape == (2,3)) @@ -8889,7 +8759,7 @@ def test_transpose_infer_shape_mixed(): o2 = mx.sym.ones(shape=[3,-1]) t = mx.sym.transpose(o2) b = o1 + t - x = b.bind(mx.cpu(), args={}) + x = b._bind(mx.cpu(), args={}) y = x.forward() assert(y[0].shape == (2,3)) @@ -9023,7 +8893,7 @@ def convert_bias(F, q_bias, k_bias, v_bias, num_heads): num_hidden=out_dim, no_bias=False) output = mx.sym.transpose(output, axes=(1, 0, 2)) output = mx.sym.Group([output, att_score]) - executor = output.simple_bind(ctx=default_context(), + executor = output._simple_bind(ctx=default_context(), qkv=(batch_size, qkv_length, qkv_dim), q_weight=(qkv_units, qkv_dim), q_bias=(qkv_units,), @@ -9039,13 +8909,13 @@ def convert_bias(F, q_bias, k_bias, v_bias, num_heads): 'k_bias': dtype, 'v_bias': dtype, 'sonde': dtype}, - grad_req='write', force_rebind=True) - output_shape = executor.outputs[0].shape - output_grads = np.random.rand(*output_shape).astype(dtype) * 0.1 + grad_req='write') executor.copy_params_from(arg_params, {}) executor.arg_dict['sonde'][:] = 0. 
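# output_shape/output_grads are now computed only after forward() runs (see the
# moved lines below), apparently because the rewritten executor does not expose
# reliable output shapes before the first forward pass. A standalone sketch of
# the safe ordering, with a hypothetical symbol and this file's module-level
# imports:
_sym = mx.sym.FullyConnected(mx.sym.Variable('x'), num_hidden=4)
_exe = _sym._simple_bind(ctx=mx.cpu(), x=(2, 8))
_exe.forward(is_train=True, x=mx.nd.ones((2, 8)))
_og = np.random.rand(*_exe.outputs[0].shape).astype(np.float32)  # shape known only here
_exe.backward([mx.nd.array(_og)])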
executor.arg_dict['sonde'].wait_to_read() executor.forward(is_train=True) + output_shape = executor.outputs[0].shape + output_grads = np.random.rand(*output_shape).astype(dtype) * 0.1 output_opti = executor.outputs[0].asnumpy() att_score_opti = executor.outputs[1].asnumpy() executor.backward([mx.nd.array(output_grads, dtype=dtype), @@ -9088,10 +8958,10 @@ def convert_bias(F, q_bias, k_bias, v_bias, num_heads): output = mx.sym.FullyConnected(weighted_value, weight=out_weight, bias=out_bias, flatten=False, num_hidden=out_dim, no_bias=False) output = mx.sym.Group([output, att_score]) - executor = output.simple_bind(ctx=default_context(), + executor = output._simple_bind(ctx=default_context(), qkv=(batch_size, qkv_length, qkv_dim), type_dict={'qkv': dtype}, - grad_req='write', force_rebind=True) + grad_req='write') executor.copy_params_from(arg_params, {}) executor.arg_dict['sonde'][:] = 0. executor.arg_dict['sonde'].wait_to_read() @@ -9185,7 +9055,7 @@ def convert_bias(F, k_bias, v_bias, num_heads): num_hidden=out_dim, no_bias=False) output = mx.sym.transpose(output, axes=(1, 0, 2)) output = mx.sym.Group([output, att_score]) - executor = output.simple_bind(ctx=default_context(), + executor = output._simple_bind(ctx=default_context(), q=(batch_size, qkv_length, qkv_dim), kv=(batch_size, qkv_length, qkv_dim), q_weight=(qkv_units, qkv_dim), @@ -9207,13 +9077,13 @@ def convert_bias(F, k_bias, v_bias, num_heads): 'out_weight': dtype, 'out_bias': dtype, }, - grad_req='write', force_rebind=True) - output_shape = executor.outputs[0].shape - output_grads = np.random.rand(*output_shape).astype(dtype) * 0.1 + grad_req='write') executor.copy_params_from(arg_params, {}) executor.arg_dict['sonde'][:] = 0. executor.arg_dict['sonde'].wait_to_read() executor.forward(is_train=True) + output_shape = executor.outputs[0].shape + output_grads = np.random.rand(*output_shape).astype(dtype) * 0.1 output_opti = executor.outputs[0].asnumpy() att_score_opti = executor.outputs[1].asnumpy() executor.backward([mx.nd.array(output_grads, dtype=dtype), mx.nd.zeros(att_score_opti.shape, dtype=dtype)]) @@ -9258,12 +9128,12 @@ def convert_bias(F, k_bias, v_bias, num_heads): output = mx.sym.FullyConnected(weighted_value, weight=out_weight, bias=out_bias, flatten=False, num_hidden=out_dim, no_bias=False) output = mx.sym.Group([output, att_score]) - executor = output.simple_bind(ctx=default_context(), + executor = output._simple_bind(ctx=default_context(), q=(batch_size, qkv_length, qkv_dim), kv=(batch_size, qkv_length, qkv_dim), type_dict={'q': dtype, 'kv': dtype}, - grad_req='write', force_rebind=True) + grad_req='write') executor.copy_params_from(arg_params, {}) executor.arg_dict['sonde'][:] = 0. 
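# As in the batch_dot and group-norm hunks earlier, head gradients passed to
# backward() now carry an explicit dtype matching the outputs instead of
# letting mx.nd.array default to float32. A minimal float16 sketch
# (hypothetical shapes, same module-level imports):
_a = mx.sym.Variable('a', dtype=np.float16)
_e = (_a * 2)._simple_bind(ctx=mx.cpu(), a=(2, 2))
_o = _e.forward(is_train=True, a=np.ones((2, 2), dtype=np.float16))[0]
_e.backward([mx.nd.ones(_o.shape, dtype=_o.dtype)])  # dtype-matched head grad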
executor.arg_dict['sonde'].wait_to_read() diff --git a/tests/python/unittest/test_profiler.py b/tests/python/unittest/test_profiler.py index 2bdfb0c7f9d2..3b83daa70e5b 100644 --- a/tests/python/unittest/test_profiler.py +++ b/tests/python/unittest/test_profiler.py @@ -54,7 +54,7 @@ def test_profiler(): B = mx.sym.Variable('B') C = mx.symbol.dot(A, B) - executor = C.simple_bind(mx.cpu(1), 'write', A=(4096, 4096), B=(4096, 4096)) + executor = C._simple_bind(mx.cpu(1), 'write', A=(4096, 4096), B=(4096, 4096)) a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096)) b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096)) @@ -427,8 +427,8 @@ def create_operator(self, ctx, shapes, dtypes): a = mx.symbol.Variable('a') b = mx.symbol.Custom(data=a, op_type='MyAdd1') c = mx.symbol.Custom(data=a, op_type='MyAdd2') - y = b.bind(mx.cpu(), {'a': inp}) - z = c.bind(mx.cpu(), {'a': inp}) + y = b._bind(mx.cpu(), {'a': inp}) + z = c._bind(mx.cpu(), {'a': inp}) yy = y.forward() zz = z.forward() mx.nd.waitall() @@ -476,7 +476,7 @@ def test_gpu_memory_profiler_symbolic(): B = mx.sym.Variable('B') C = mx.symbol.dot(A, B, name='dot') - executor = C.simple_bind(mx.gpu(), 'write', A=(4096, 4096), B=(4096, 4096)) + executor = C._simple_bind(mx.gpu(), 'write', A=(4096, 4096), B=(4096, 4096)) a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096)) b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096)) @@ -496,23 +496,18 @@ def test_gpu_memory_profiler_symbolic(): 'Requested Size' : str(4 * a.size)}, {'Attribute Name' : 'tensordot:in_arg:B', 'Requested Size' : str(4 * b.size)}, - {'Attribute Name' : 'tensordot:arg_grad:A', - 'Requested Size' : str(4 * a.size)}, - {'Attribute Name' : 'tensordot:arg_grad:B', - 'Requested Size' : str(4 * b.size)}, {'Attribute Name' : 'tensordot:dot', - 'Requested Size' : str(4 * c.size)}, - {'Attribute Name' : 'tensordot:dot_head_grad', 'Requested Size' : str(4 * c.size)}] # Sample gpu_memory_profile.csv: # "Attribute Name","Requested Size","Device","Actual Size","Reuse?" 
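# The expected rows change below: the arg_grad/head_grad attributions are gone
# (the new sample records them as plain ":_zeros" allocations), and equal-sized
# blocks may now be flagged as reused (Reuse? == "1"). A sketch of reading such
# a dump back with the same stdlib csv pattern the test uses (hypothetical pid
# in the file name):
#   import csv
#   with open('gpu_memory_profile-pid_1234.csv') as f:
#       for row in csv.DictReader(f):
#           print(row['Attribute Name'], row['Requested Size'], row['Reuse?'])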
- # "tensordot:arg_grad:A","67108864","0","67108864","0" - # "tensordot:arg_grad:B","67108864","0","67108864","0" - # "tensordot:dot","67108864","0","67108864","0" - # "tensordot:dot_head_grad","67108864","0","67108864","0" + # ":_zeros","67108864","0","67108864","0" + # ":_zeros","67108864","0","67108864","0" + # "tensordot:dot","67108864","0","67108864","1" + # "tensordot:dot","67108864","0","67108864","1" # "tensordot:in_arg:A","67108864","0","67108864","0" # "tensordot:in_arg:B","67108864","0","67108864","0" + # "nvml_amend","1074790400","0","1074790400","0" with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()), mode='r') as csv_file: csv_reader = csv.DictReader(csv_file) diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py index ad7669385930..d45ae0f8f89c 100644 --- a/tests/python/unittest/test_random.py +++ b/tests/python/unittest/test_random.py @@ -279,7 +279,7 @@ def check_with_device(device, dtype): Y = symbol(**params) + X x = mx.nd.zeros(shape, dtype=dtype, ctx=device) xgrad = mx.nd.zeros(shape, dtype=dtype, ctx=device) - yexec = Y.bind(device, {'X' : x}, {'X': xgrad}) + yexec = Y._bind(device, {'X' : x}, {'X': xgrad}) mx.random.seed(128) yexec.forward(is_train=True) yexec.backward(yexec.outputs[0]) @@ -311,7 +311,7 @@ def check_with_device(device, dtype): bindings = { 'v1' : mx.nd.array(symbdic['inputs'][0][1]) } if not single_param : bindings.update({ 'v2' : mx.nd.array(symbdic['inputs'][1][1]) }) - yexec = Y.bind(ctx=device, args=bindings) + yexec = Y._bind(ctx=device, args=bindings) yexec.forward() un1 = yexec.outputs[0].copyto(device).asnumpy() params = {} @@ -467,7 +467,7 @@ def test_parallel_random_seed_setting(): Y = mx.sym.random.uniform(**params) + X x = mx.nd.zeros(shape, dtype=dtype, ctx=ctx) xgrad = mx.nd.zeros(shape, dtype=dtype, ctx=ctx) - yexec = Y.bind(ctx, {'X' : x}, {'X': xgrad}) + yexec = Y._bind(ctx, {'X' : x}, {'X': xgrad}) seed = set_seed_variously(seed, num_temp_seeds, seed_to_test) yexec.forward(is_train=True) yexec.backward(yexec.outputs[0]) @@ -512,7 +512,7 @@ def test_random_seed_setting_for_context(): # Check symbolic. `multinomial` uses non-parallel rng. 
P = mx.sym.Variable("P") X = mx.sym.random.multinomial(data=P, shape=num_samples, get_prob=False) - exe = X.bind(ctx, {"P": mx.nd.array(probs, dtype=dtype)}) + exe = X._bind(ctx, {"P": mx.nd.array(probs, dtype=dtype)}) set_seed_variously_for_context(ctx, seed, num_temp_seeds, seed_to_test) exe.forward() samples_sym.append(exe.outputs[0].asnumpy()) @@ -554,7 +554,7 @@ def test_parallel_random_seed_setting_for_context(): Y = mx.sym.random.uniform(**params) + X x = mx.nd.zeros(shape, dtype=dtype) xgrad = mx.nd.zeros(shape, dtype=dtype) - yexec = Y.bind(ctx, {'X' : x}, {'X': xgrad}) + yexec = Y._bind(ctx, {'X' : x}, {'X': xgrad}) set_seed_variously_for_context(ctx, seed, num_temp_seeds, seed_to_test) yexec.forward(is_train=True) yexec.backward(yexec.outputs[0]) @@ -900,7 +900,7 @@ def compute_expected_prob(): true_classes_var = mx.sym.var('true_classes') outputs = mx.sym.contrib.rand_zipfian(true_classes_var, num_sampled, range_max) outputs = mx.sym.Group(outputs) - executor = outputs.bind(mx.context.current_context(), {'true_classes' : true_classes}) + executor = outputs._bind(mx.context.current_context(), {'true_classes' : true_classes}) executor.forward() sampled_classes, exp_cnt_true, exp_cnt_sampled = executor.outputs assert_almost_equal(exp_cnt_sampled, exp_cnt[sampled_classes], rtol=1e-1, atol=1e-2) diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index a88ff2e5a05a..26f6829e61a7 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -156,6 +156,10 @@ def all_zero(var): @with_seed() def test_elemwise_binary_ops(): + # skip testing on GPU because only CPU ops are implemented + if default_context().device_type is 'gpu': + return + def test_elemwise_binary_op(name, lhs_stype, rhs_stype, shape, forward_mxnet_call, forward_numpy_call, backward_numpy_call, lhs_grad_stype, @@ -303,8 +307,7 @@ def test_elemwise_binary_op(name, lhs_stype, rhs_stype, shape, assert igrads_result['lhs'].stype == lhs_grad_stype if rhs_grad_stype is not None: assert igrads_result['rhs'].stype == rhs_grad_stype - - if skip_gradient_check is not True: + if not skip_gradient_check: check_numeric_gradient(test, location, grad_stype_dict=grad_stypes) @@ -331,10 +334,6 @@ def elemwise_mul_stype(lstype, rstype): return 'row_sparse' elif lstype == 'row_sparse' and rstype == 'default': return 'row_sparse' - elif lstype == 'default' and rstype == 'csr': - return 'csr' - elif lstype == 'csr' and rstype == 'default': - return 'csr' else: return 'default' @@ -362,7 +361,6 @@ def check_elemwise_binary_ops(lhs_stype, rhs_stype, shape, verbose=False) if ((lhs_stype is 'default' and rhs_stype is 'row_sparse') or - (lhs_stype is 'default' and rhs_stype is 'csr') or (lhs_stype is 'row_sparse' and rhs_stype is 'row_sparse') and (rhs_density == 0.0)): test_elemwise_binary_op("elemwise_add", lhs_stype, rhs_stype, shape, lambda l, r: mx.sym.sparse.elemwise_add(l, r, out=l), @@ -384,7 +382,6 @@ def check_elemwise_binary_ops(lhs_stype, rhs_stype, shape, force_grad_overlap=force_grad_overlap, lhs_density=lhs_density, rhs_density=rhs_density, verbose=False) - if ((lhs_stype is 'row_sparse' and rhs_stype is 'row_sparse') and (lhs_density == 0.0)): test_elemwise_binary_op("elemwise_add", lhs_stype, rhs_stype, shape, lambda l, r: mx.sym.sparse.elemwise_add(l, r, out=r), @@ -509,26 +506,9 @@ def check_elemwise_binary_ops(lhs_stype, rhs_stype, shape, # Try row_sparse overlaps for force_lr_overlap in [False, True]: for 
force_grad_overlap in [False, True]: - print(" force_lr_overlap={}, force_grad_overlap={}, shape={}". format(force_lr_overlap, force_grad_overlap, shape)) - # Left and right always overlap when one is default storage - # (assuming the row_sparse one has some entries in it) - if force_lr_overlap is False: - check_elemwise_binary_ops('default', 'row_sparse', shape, - lhs_density=lhs_density, - rhs_density=rhs_density, - force_lr_overlap=force_lr_overlap, - force_grad_overlap=force_grad_overlap, - ograd_density=ograd_density) - check_elemwise_binary_ops('row_sparse', 'default', shape, - lhs_density=lhs_density, - rhs_density=rhs_density, - force_lr_overlap=force_lr_overlap, - force_grad_overlap=force_grad_overlap, - ograd_density=ograd_density) - # Back to left-right overlap possiblities check_elemwise_binary_ops('row_sparse', 'row_sparse', shape, lhs_grad_stype='row_sparse', @@ -539,32 +519,6 @@ def check_elemwise_binary_ops(lhs_stype, rhs_stype, shape, force_grad_overlap=force_grad_overlap, ograd_density=ograd_density) - # No overlap flags for CSR - check_elemwise_binary_ops('csr', 'csr', shape, - lhs_grad_stype='csr', - rhs_grad_stype='csr', - lhs_density=lhs_density, - rhs_density=rhs_density, - ograd_density=ograd_density) - check_elemwise_binary_ops('csr', 'csr', shape, - lhs_grad_stype='default', - rhs_grad_stype='default', - lhs_density=lhs_density, - rhs_density=rhs_density, - ograd_density=ograd_density) - check_elemwise_binary_ops('default', 'csr', shape, - lhs_grad_stype='csr', - rhs_grad_stype='csr', - lhs_density=lhs_density, - rhs_density=rhs_density, - ograd_density=ograd_density) - check_elemwise_binary_ops('csr', 'default', shape, - lhs_grad_stype='csr', - rhs_grad_stype='csr', - lhs_density=lhs_density, - rhs_density=rhs_density, - ograd_density=ograd_density) - @with_seed() def test_elemwise_csr_same_zeros(): @@ -680,9 +634,9 @@ def check_sparse_mathematical_core(name, stype, args.append(arr_data) if arr_grad is not None: - exe_test = test.bind(default_context(), args=args, args_grad=[arr_grad]) + exe_test = test._bind(default_context(), args=args, args_grad=[arr_grad]) else: - exe_test = test.bind(default_context(), args=args) + exe_test = test._bind(default_context(), args=args) exe_test.forward(is_train=True) assert exe_test.outputs[0].stype == expected_result_type @@ -1675,7 +1629,7 @@ def test_sparse_square_sum(): dns_data = mx.sym.Variable('data') baseline = mx.sym.sum(mx.sym.square(dns_data), axis=axis, keepdims=keepdim) igrad_expected = mx.nd.empty(dns.shape) - baseline_exec = baseline.bind(default_context(), args=[dns], + baseline_exec = baseline._bind(default_context(), args=[dns], args_grad=[igrad_expected]) baseline_exec.forward(is_train=True) baseline_exec.backward([ret_expected]) @@ -1689,7 +1643,7 @@ def test_sparse_square_sum(): # Need to add one more layer after square_sum to trigger the kernel for ograd # with default stype in square_sum op. 
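# (Aside: the dense square_sum baseline earlier in this hunk can be reproduced
# standalone; hypothetical input values, same module-level imports.)
_dns = mx.nd.array([[1.0, 2.0], [3.0, 4.0]])
_b = mx.sym.sum(mx.sym.square(mx.sym.Variable('data')), axis=0)
_ig = mx.nd.empty(_dns.shape)
_bx = _b._bind(mx.cpu(), args=[_dns], args_grad=[_ig])
_bx.forward(is_train=True)
_bx.backward([mx.nd.ones(_bx.outputs[0].shape)])  # d(sum x^2)/dx == 2x, written into _ig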
baseline1 = baseline + 1 - baseline_exec1 = baseline1.bind(default_context(), args=[dns], + baseline_exec1 = baseline1._bind(default_context(), args=[dns], args_grad=[igrad_expected]) baseline_exec1.forward(is_train=True) baseline_exec1.backward([ret_expected]) @@ -1766,7 +1720,7 @@ def check_sparse_elementwise_sum_with_shape(stypes, shape, n): for stype in stypes: arr.append(rand_ndarray(shape, stype, densities[np.random.randint(0, len(densities))])) - exec1 = out.bind(default_context(), + exec1 = out._bind(default_context(), args=arr, args_grad=arr_grad) exec1.forward(is_train=True) @@ -1798,70 +1752,24 @@ def check_sparse_elementwise_sum_with_shape(stypes, shape, n): check_sparse_elementwise_sum_with_shape(stypes, shape, test_len+1) -@with_seed() -def test_contrib_sparse_embedding(): - ''' test sparse embedding operator ''' - def check_sparse_embedding(in_dim, out_dim, batch, densities, deterministic, weight_stype): - # init executor - data = mx.sym.Variable("data") - weight = mx.sym.Variable("embed_weight", stype=weight_stype) - embed = mx.sym.contrib.SparseEmbedding(data=data, weight=weight, input_dim=in_dim, - output_dim=out_dim, deterministic=deterministic, - name="embed") - grad_req = {'data': 'null', 'embed_weight': 'write'} - exe_test = embed.simple_bind(default_context(), grad_req=grad_req, data=(batch,)) - arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays)) - grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays)) - # init data - np_data = np.random.randint(low=0, high=in_dim, size=batch) - np_onehot = np.zeros((batch, in_dim)).astype(np.float32) - np_onehot[np.arange(batch), np_data] = 1.0 - arg_map["data"][:] = np_data - # init grad - np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape) - grad = mx.nd.zeros(np_grad.shape) - grad[:] = np_grad - # weight - weight = arg_map["embed_weight"] - for density in densities: - # update weight based on density - weight[:] = rand_ndarray(weight.shape, weight_stype, density=density) - # check forward - exe_test.forward(is_train=True) - assert_almost_equal(exe_test.outputs[0].asnumpy(), np.dot(np_onehot, weight.asnumpy()), atol=1e-4) - # check backward - exe_test.backward([grad]) - assert_almost_equal(grad_map["embed_weight"].asnumpy(), np.dot(np_onehot.T, grad.asnumpy()), atol=1e-4) - # run twice to check if the result is deterministic when passing "deterministic=True" to SparseEmbedding - if deterministic: - grad_ref = grad_map["embed_weight"].asnumpy() - exe_test.backward([grad]) - assert_almost_equal(grad_map["embed_weight"].asnumpy(), grad_ref, atol=0, rtol=0) - - densities = [0, 0.5, 1] - in_dim = 50 - out_dim = 3 - batch = 8 - stypes = ['default', 'row_sparse'] - deterministics = [True, False] - for stype in stypes: - for deterministic in deterministics: - check_sparse_embedding(in_dim, out_dim, batch, densities, deterministic, stype) - check_sparse_embedding(in_dim, out_dim, batch, densities, deterministic, stype) - @with_seed() @pytest.mark.serial def test_sparse_embedding(): ''' test sparse embedding operator ''' - def check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad, weight_stype): + def check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad): target_stype = 'row_sparse' if sparse_grad else 'default' # init executor data = mx.sym.Variable("data") - weight = mx.sym.Variable("embed_weight", stype=weight_stype) + weight = mx.sym.Variable("embed_weight") embed = mx.sym.sparse.Embedding(data=data, weight=weight, input_dim=in_dim, sparse_grad=sparse_grad, 
output_dim=out_dim, name='embed') grad_req = {'data': 'null', 'embed_weight': 'write'} - exe_test = embed.simple_bind(default_context(), grad_req=grad_req, data=(batch,)) + args = {'embed_weight': mx.nd.zeros((in_dim, out_dim)), 'data': mx.nd.ones((batch,))} + weight_grad = mx.nd.zeros((in_dim, out_dim)) + if sparse_grad: + weight_grad = weight_grad.tostype('row_sparse') + args_grad = {'embed_weight': weight_grad} + exe_test = embed._bind(default_context(), args=args, args_grad=args_grad, grad_req=grad_req) arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays)) grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays)) # init data @@ -1869,17 +1777,17 @@ def check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad, weigh np_onehot = np.zeros((batch, in_dim)).astype(np.float32) np_onehot[np.arange(batch), np_data] = 1.0 arg_map["data"][:] = np_data - # init grad - np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape) - grad = mx.nd.zeros(np_grad.shape) - grad[:] = np_grad # weight weight = arg_map["embed_weight"] for density in densities: # update weight based on density - weight[:] = rand_ndarray(weight.shape, weight_stype, density=density) + weight[:] = rand_ndarray(weight.shape, 'default', density=density) # check forward exe_test.forward(is_train=True) + # init grad + np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape) + grad = mx.nd.zeros(np_grad.shape) + grad[:] = np_grad assert_almost_equal(exe_test.outputs[0].asnumpy(), np.dot(np_onehot, weight.asnumpy()), atol=1e-4) # check backward exe_test.backward([grad]) @@ -1891,12 +1799,9 @@ def check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad, weigh in_dim = 50 out_dim = 3 batch = 8 - weight_stypes = ['default', 'row_sparse'] sparse_grads = [True, False] - for weight_stype in weight_stypes: - for sparse_grad in sparse_grads: - check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad, weight_stype) - check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad, weight_stype) + for sparse_grad in sparse_grads: + check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad) @with_seed() def test_sparse_broadcast_add_sub(): @@ -2021,7 +1926,7 @@ def check_scatter_ops(name, shape, lhs_stype, rhs_stype, forward_mxnet_call, for location = {'lhs': lhs_nd, 'rhs': rhs_nd} out = forward_mxnet_call(lhs, rhs) - exe_test = out.bind(default_context(), args=location) + exe_test = out._bind(default_context(), args=location) exe_test.forward(is_train=False) out_nd = exe_test.outputs[0] @@ -2238,18 +2143,20 @@ def test_where_helper(shape): grad_in_mx = mx.nd.array(grad_in_np, dtype=np.int32) where_sym = mx.sym.where(condition, x, y) + cond_nd = mx.nd.array(condition_np) + args = {'condition': cond_nd.tostype('csr'), 'x': mx.nd.array(x_np), + 'y' : mx.nd.array(y_np)} + args_grad = {'condition': mx.nd.zeros_like(cond_nd), + 'x': mx.nd.array(x_np).tostype('csr'), 'y' : mx.nd.array(y_np)} # test req='write' - where_exe_write = where_sym.simple_bind(ctx=default_context(), - condition=condition_np.shape, - x=x_np.shape, y=y_np.shape, - grad_req='write') + where_exe_write = where_sym._bind(ctx=default_context(), args=args, + args_grad=args_grad, grad_req='write') + # test forward req='write' - cond_nd = mx.nd.array(condition_np).tostype('csr') - outputs = where_exe_write.forward(is_train=True, \ - condition=cond_nd, x=x_np, y=y_np) + outputs = where_exe_write.forward(is_train=True) assert same(outputs[0].asnumpy(), out_expected) # test backward req='write' - 
-        where_exe_write.backward(grad_in_mx)
+        where_exe_write.backward(grad_in_mx.astype('float32'))
         assert same(where_exe_write.grad_dict['x'].asnumpy(), grad_expected_x)
         assert same(where_exe_write.grad_dict['y'].asnumpy(), grad_expected_y)
         assert same(where_exe_write.grad_dict['condition'].asnumpy(), grad_expected_cond)
@@ -2257,14 +2164,12 @@ def test_where_helper(shape):
         # test req='add'
         x_grad_init = np.random.randint(30, 40, np.prod(shape)).reshape(shape)
         y_grad_init = np.random.randint(40, 50, np.prod(shape)).reshape(shape)
-        where_exe_add = where_sym.simple_bind(ctx=default_context(),
-                                              condition=cond_nd.shape,
-                                              x=x_np.shape, y=y_np.shape,
-                                              grad_req='add')
+        where_exe_add = where_sym._bind(ctx=default_context(), args=args,
+                                        args_grad=args_grad, grad_req='add')
         where_exe_add.grad_dict['x'][:] = x_grad_init
         where_exe_add.grad_dict['y'][:] = y_grad_init
         # test forward req='add'
-        outputs = where_exe_add.forward(is_train=True, condition=cond_nd, x=x_np, y=y_np)
+        outputs = where_exe_add.forward(is_train=True)
         assert same(outputs[0].asnumpy(), out_expected)
 
     def test_where_numeric_gradient(shape):
@@ -2335,10 +2240,9 @@ def test_reshape_backward_fallback():
     out = mx.sym.sparse.dot(x, w_x, name='out_x')
     grad_w_nd = rand_ndarray(w_shape, 'row_sparse')
 
-    executor = out.bind(ctx=ctx, args={"x": x_nd, "w": w_nd},
+    executor = out._bind(ctx=ctx, args={"x": x_nd, "w": w_nd},
                         args_grad={"w": grad_w_nd})
     executor.forward(is_train=True)
     executor.backward(out_x_nd)
 
    assert_almost_equal(grad_w_nd.asnumpy(), expected_grad_nd)
-
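[Editor's note] Every hunk above follows the same migration: the public Symbol.bind/Symbol.simple_bind become the internal Symbol._bind/Symbol._simple_bind, and call sites that used to let simple_bind allocate storage from shapes now pass explicit NDArrays, so the test controls the storage type of each argument and gradient buffer. A minimal sketch of the resulting pattern, mirroring the test code above (dimensions and variable names are illustrative, assuming the 1.x-style symbol API used in this diff):

    import mxnet as mx

    data = mx.sym.Variable('data')
    weight = mx.sym.Variable('embed_weight')
    embed = mx.sym.sparse.Embedding(data=data, weight=weight, input_dim=50,
                                    output_dim=3, sparse_grad=True, name='embed')

    # _bind takes concrete arrays instead of shapes, so the caller decides
    # up front that the weight gradient buffer is row_sparse.
    args = {'data': mx.nd.ones((8,)), 'embed_weight': mx.nd.zeros((50, 3))}
    args_grad = {'embed_weight': mx.nd.zeros((50, 3)).tostype('row_sparse')}
    exe = embed._bind(mx.cpu(), args=args, args_grad=args_grad,
                      grad_req={'data': 'null', 'embed_weight': 'write'})
    exe.forward(is_train=True)
    exe.backward([mx.nd.ones(exe.outputs[0].shape)])

The same trade appears in the where-operator hunks: inputs that were previously fed as keyword arguments to forward() are now fixed at bind time, so forward(is_train=True) takes no data arguments.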
diff --git a/tests/python/unittest/test_subgraph.py b/tests/python/unittest/test_subgraph.py
index bf170bfd5104..0568db88335e 100644
--- a/tests/python/unittest/test_subgraph.py
+++ b/tests/python/unittest/test_subgraph.py
@@ -27,7 +27,6 @@ def make_subgraph(subg, *args):
-    js = subg.tojson()
-    return mx.sym._internal._CachedOp(*args, subgraph=js)
+    return subg
 
 @with_seed()
 @pytest.mark.serial
@@ -119,10 +119,10 @@ def make_subgraph4(stype):
     all_inputs = copy.deepcopy(inputs)
     all_inputs.update(aux_states)
     args_grad = {key : mx.nd.empty(shape=all_inputs[key].shape) for key in all_inputs.keys()}
-    e1 = orig.bind(ctx=default_context(), args=all_inputs, args_grad=args_grad,
+    e1 = orig._bind(ctx=default_context(), args=all_inputs, args_grad=args_grad,
                    aux_states=all_inputs)
     args_grad = {key : mx.nd.empty(shape=all_inputs[key].shape) for key in all_inputs.keys()}
-    e2 = subg.bind(ctx=default_context(), args=all_inputs, args_grad=args_grad,
+    e2 = subg._bind(ctx=default_context(), args=all_inputs, args_grad=args_grad,
                    aux_states=all_inputs)
     e1.forward(is_train=True)
     e2.forward(is_train=True)
@@ -189,7 +189,6 @@ def create_operator(self, ctx, shapes, dtypes):
     b = a + 1
     b = mx.symbol.Custom(data=a, op_type='MyAdd1')
     c = mx.symbol.Custom(data=a, op_type='MyAdd2')
-    b.bind(mx.cpu(), {'a': inp}).forward()
-    c.bind(mx.cpu(), {'a': inp}).forward()
+    b._bind(mx.cpu(), {'a': inp}).forward()
+    c._bind(mx.cpu(), {'a': inp}).forward()
     mx.nd.waitall()
-
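[Editor's note] With the _CachedOp path gone, make_subgraph is now an identity function, so the hunk above also drops the now-dead `js = subg.tojson()` assignment (hence the -27,7 +27,6 count). The surrounding tests still build one executor from the original symbol and one from the returned subgraph and compare them. A sketch of that bind-and-compare scaffolding, under the assumption that `orig` and `subg` are numerically equivalent symbols (helper name and tolerances are illustrative):

    import copy
    import mxnet as mx
    import numpy as np

    def check_equivalent(orig, subg, inputs, ctx=mx.cpu()):
        # Bind both symbols to identical inputs and fresh gradient buffers,
        # then compare the forward outputs elementwise.
        e1 = orig._bind(ctx=ctx, args=copy.deepcopy(inputs),
                        args_grad={k: mx.nd.empty(v.shape) for k, v in inputs.items()})
        e2 = subg._bind(ctx=ctx, args=copy.deepcopy(inputs),
                        args_grad={k: mx.nd.empty(v.shape) for k, v in inputs.items()})
        e1.forward(is_train=True)
        e2.forward(is_train=True)
        for o1, o2 in zip(e1.outputs, e2.outputs):
            np.testing.assert_allclose(o1.asnumpy(), o2.asnumpy(),
                                       rtol=1e-5, atol=1e-6)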
diff --git a/tests/python/unittest/test_subgraph_op.py b/tests/python/unittest/test_subgraph_op.py
index bb693f57f415..c8ddaf84de6d 100644
--- a/tests/python/unittest/test_subgraph_op.py
+++ b/tests/python/unittest/test_subgraph_op.py
@@ -110,7 +110,7 @@ def get_graphs():
 @pytest.mark.parametrize('subgraph_backend', ['default', 'default_v2'])
 @pytest.mark.parametrize('sym,op_names', get_graphs())
 def test_subgraph_exe1(sym, subgraph_backend, op_names):
-    """Use the partitioned sym to simple_bind an executor and compare the outputs
+    """Use the partitioned sym to _simple_bind an executor and compare the outputs
     with those of the original executor"""
     sym, _, _ = sym
     out = SymbolHandle()
@@ -121,8 +121,8 @@ def test_subgraph_exe1(sym, subgraph_backend, op_names):
     assert partitioned_sym.list_inputs() == sym.list_inputs()
     assert partitioned_sym.list_arguments() == sym.list_arguments()
     assert partitioned_sym.list_auxiliary_states() == sym.list_auxiliary_states()
-    exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
-    partitioned_exe = partitioned_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+    exe = sym._simple_bind(ctx=mx.current_context(), grad_req='null')
+    partitioned_exe = partitioned_sym._simple_bind(ctx=mx.current_context(), grad_req='null')
     input_names = sym.list_inputs()
     for name in input_names:
         if name in exe.arg_dict:
@@ -142,14 +142,14 @@ def test_subgraph_exe1(sym, subgraph_backend, op_names):
 @pytest.mark.parametrize('subgraph_backend', ['default', 'default_v2'])
 @pytest.mark.parametrize('sym,op_names', get_graphs())
 def test_subgraph_exe2(sym, subgraph_backend, op_names):
-    """Use env var MXNET_SUBGRAPH_BACKEND=default to trigger graph partitioning in simple_bind
+    """Use env var MXNET_SUBGRAPH_BACKEND=default to trigger graph partitioning in _simple_bind
     and compare results of the partitioned sym and the original sym."""
     def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
         if subgraph_backend is not None:
             os.environ['MXNET_SUBGRAPH_BACKEND'] = subgraph_backend
             check_call(_LIB.MXSetSubgraphPropertyOpNames(c_str(subgraph_backend), mx_uint(len(op_names)),
                                                          c_str_array(op_names)))
-        exe = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+        exe = sym._simple_bind(ctx=mx.current_context(), grad_req='null')
         input_names = sym.list_inputs()
         for name in input_names:
             if name in exe.arg_dict:
@@ -194,8 +194,8 @@ def test_subgraph_exe3(sym, subgraph_backend, op_names):
     arg_shapes, _, aux_shapes = sym.infer_shape()
     arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
     aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
-    exe = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
-    partitioned_exe = partitioned_sym.bind(ctx=mx.current_context(), args=arg_array,
+    exe = sym._bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
+    partitioned_exe = partitioned_sym._bind(ctx=mx.current_context(), args=arg_array,
                                            aux_states=aux_array, grad_req='null')
     exe.forward()
     partitioned_exe.forward()
@@ -221,7 +221,7 @@ def get_executor(sym, subgraph_backend=None, op_names=None, original_exec=None):
         else:
             arg_array = None
             aux_array = None
-        exe = sym.bind(ctx=mx.current_context(),
+        exe = sym._bind(ctx=mx.current_context(),
                        args=arg_array if subgraph_backend is None else original_exec.arg_arrays,
                        aux_states=aux_array if subgraph_backend is None else original_exec.aux_arrays,
                        grad_req='null')
@@ -262,21 +262,21 @@ def copy_inputs_between_executors(exe1, exe2, input_names):
 @pytest.mark.parametrize('sym,op_names', get_graphs())
 def test_subgraph_exe5(sym, subgraph_backend, op_names):
     """Call optimize_for to trigger graph partitioning without infer shapes/types before,
-    then simple_bind and compare results of the partitioned sym and the original sym."""
-    # simple_bind
+    then _simple_bind and compare results of the partitioned sym and the original sym."""
+    # _simple_bind
     sym, _, _ = sym
-    exe1 = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+    exe1 = sym._simple_bind(ctx=mx.current_context(), grad_req='null')
     input_names = sym.list_inputs()
     set_random_inputs(exe1, input_names)
     exe1.forward()
 
-    # partition before simple_bind
+    # partition before _simple_bind
     check_call(_LIB.MXSetSubgraphPropertyOpNamesV2(c_str(subgraph_backend), mx_uint(len(op_names)),
                                                    c_str_array(op_names)))
     part_sym = sym.optimize_for(subgraph_backend)
     check_call(_LIB.MXRemoveSubgraphPropertyOpNamesV2(c_str(subgraph_backend)))
 
-    exe2 = part_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+    exe2 = part_sym._simple_bind(ctx=mx.current_context(), grad_req='null')
     copy_inputs_between_executors(exe1, exe2, input_names)
     exe2.forward()
@@ -290,22 +290,22 @@ def test_subgraph_exe5(sym, subgraph_backend, op_names):
 @pytest.mark.parametrize('subgraph_backend', ['default', 'default_v2'])
 @pytest.mark.parametrize('sym,op_names', get_graphs())
 def test_subgraph_exe6(sym, subgraph_backend, op_names):
-    """Call optimize_for to trigger graph partitioning with shapes/types, then simple_bind
+    """Call optimize_for to trigger graph partitioning with shapes/types, then _simple_bind
     and compare results of the partitioned sym and the original sym."""
-    # simple_bind
+    # _simple_bind
     sym, _, _ = sym
-    exe1 = sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+    exe1 = sym._simple_bind(ctx=mx.current_context(), grad_req='null')
     input_names = sym.list_inputs()
     set_random_inputs(exe1, input_names)
     exe1.forward()
 
-    # infer shape/type before partition before simple_bind
+    # infer shape/type before partition before _simple_bind
     check_call(_LIB.MXSetSubgraphPropertyOpNamesV2(c_str(subgraph_backend), mx_uint(len(op_names)),
                                                    c_str_array(op_names)))
     part_sym = sym.optimize_for(subgraph_backend, exe1.arg_dict, exe1.aux_dict)
     check_call(_LIB.MXRemoveSubgraphPropertyOpNamesV2(c_str(subgraph_backend)))
 
-    exe2 = part_sym.simple_bind(ctx=mx.current_context(), grad_req='null')
+    exe2 = part_sym._simple_bind(ctx=mx.current_context(), grad_req='null')
     copy_inputs_between_executors(exe1, exe2, input_names)
     exe2.forward()
@@ -326,7 +326,7 @@ def test_subgraph_exe7(sym, subgraph_backend, op_names):
     arg_shapes, _, aux_shapes = sym.infer_shape()
     arg_array = [mx.nd.random.uniform(shape=shape) for shape in arg_shapes]
     aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes]
-    exe1 = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
+    exe1 = sym._bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
     exe1.forward()
 
     # partition before bind
@@ -335,7 +335,7 @@ def test_subgraph_exe7(sym, subgraph_backend, op_names):
     part_sym = sym.optimize_for(subgraph_backend)
     check_call(_LIB.MXRemoveSubgraphPropertyOpNamesV2(c_str(subgraph_backend)))
 
-    exe2 = part_sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
+    exe2 = part_sym._bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
     exe2.forward()
 
     # compare outputs
@@ -357,7 +357,7 @@ def test_subgraph_exe8(sym, subgraph_backend, op_names):
     aux_names = sym.list_auxiliary_states()
     arg_dict = {name:mx.nd.random.uniform(shape=shape) for name,shape in zip(arg_names,arg_shapes)}
     aux_dict = {name:mx.nd.random.uniform(shape=shape) for name,shape in zip(aux_names,aux_shapes)}
-    exe1 = sym.bind(ctx=mx.current_context(), args=arg_dict, aux_states=aux_dict, grad_req='null')
+    exe1 = sym._bind(ctx=mx.current_context(), args=arg_dict, aux_states=aux_dict, grad_req='null')
     exe1.forward()
 
     # infer shape/type before partition before bind
@@ -366,7 +366,7 @@ def test_subgraph_exe8(sym, subgraph_backend, op_names):
     part_sym = sym.optimize_for(subgraph_backend, arg_dict, aux_dict)
     check_call(_LIB.MXRemoveSubgraphPropertyOpNamesV2(c_str(subgraph_backend)))
 
-    exe2 = part_sym.bind(ctx=mx.current_context(), args=arg_dict, aux_states=aux_dict, grad_req='null')
+    exe2 = part_sym._bind(ctx=mx.current_context(), args=arg_dict, aux_states=aux_dict, grad_req='null')
     exe2.forward()
 
     # compare outputs
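[Editor's note] The exe1-exe8 variants above all exercise one flow: bind a reference executor, partition the graph (via env var, the C-API subgraph-property calls, or optimize_for, with or without shape/type hints), bind the partitioned symbol the same way, and check that the outputs match. Roughly, assuming a registered 'default' backend and leaving out the _LIB property plumbing the tests use:

    import mxnet as mx
    import numpy as np

    sym = mx.sym.exp(mx.sym.Variable('data') * 2)

    # Reference executor on the unpartitioned graph.
    exe1 = sym._simple_bind(ctx=mx.cpu(), data=(4, 4), grad_req='null')
    exe1.arg_dict['data'][:] = mx.nd.random.uniform(shape=(4, 4))
    exe1.forward()

    # Partition, then bind the partitioned symbol with the same inputs.
    part_sym = sym.optimize_for('default')
    exe2 = part_sym._simple_bind(ctx=mx.cpu(), data=(4, 4), grad_req='null')
    exe2.arg_dict['data'][:] = exe1.arg_dict['data']
    exe2.forward()

    for o1, o2 in zip(exe1.outputs, exe2.outputs):
        np.testing.assert_allclose(o1.asnumpy(), o2.asnumpy(), rtol=1e-5, atol=1e-6)

Passing exe1.arg_dict and exe1.aux_dict to optimize_for, as exe6/exe8 do, additionally runs shape/type inference before partitioning.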
diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py
index 5bfc4be6f324..1c84af0c668e 100644
--- a/tests/python/unittest/test_symbol.py
+++ b/tests/python/unittest/test_symbol.py
@@ -257,52 +257,24 @@ def check_symbol_consistency(sym1, sym2, ctx, skip_grad=False, equal_nan=False):
 def test_blockgrad():
     a = mx.sym.Variable('a')
     b = mx.sym.BlockGrad(2*a)
-    exe = b.simple_bind(ctx=mx.cpu(), a=(10,10))
+    exe = b._simple_bind(ctx=mx.cpu(), a=(10,10))
 
-def test_zero_prop():
-    data = mx.symbol.Variable('data')
-    for i in range(10):
-        data = data * data
-
-    exe = data.simple_bind(ctx=mx.cpu(), data=(10, 3, 256, 256))
-    big = int(re.search(r'Total (\d+) MB allocated', exe.debug_str()).group(1))
-
-    exe = data.simple_bind(ctx=mx.cpu(), data=(10, 3, 256, 256), grad_req='null')
-    small1 = int(re.search(r'Total (\d+) MB allocated', exe.debug_str()).group(1))
-
-    data = mx.sym.stop_gradient(data)
-    exe = data.simple_bind(ctx=mx.cpu(), data=(10, 3, 256, 256))
-    small2 = int(re.search(r'Total (\d+) MB allocated', exe.debug_str()).group(1))
-
-    assert big > small2
-    assert small1 == small2
-
 def test_zero_prop2():
     x = mx.sym.Variable('x')
     idx = mx.sym.Variable('idx')
     y = mx.sym.batch_take(x, idx)
     z = mx.sym.stop_gradient(y)
-    exe = z.simple_bind(ctx=mx.cpu(), x=(10, 10), idx=(10,),
+    exe = z._simple_bind(ctx=mx.cpu(), x=(10, 10), idx=(10,),
                         type_dict={'x': np.float32, 'idx': np.int32})
-    exe.forward()
+    exe.forward(is_train=True)
     exe.backward()
-
-    # The following bind() should throw an exception. We discard the expected stderr
-    # output for this operation only in order to keep the test logs clean.
-    with discard_stderr():
-        try:
-            y.simple_bind(ctx=mx.cpu(), x=(10, 10), idx=(10,),
-                          type_dict={'x': np.float32, 'idx': np.int32})
-        except:
-            return
-
-    assert False
+    mx.nd.waitall()
 
 def test_simple_bind_incomplete_shape_inference_in_one_forward_pass():
     r"""This is a special case that results in shape inference
-    failure after moving simple_bind logic from frontend to backend.
+    failure after moving _simple_bind logic from frontend to backend.
     Added here for testing against the network similar to the following one.
     Network diagram:
@@ -322,7 +294,7 @@ def test_simple_bind_incomplete_shape_inference_in_one_forward_pass():
     fc = mx.sym.FullyConnected(data=data, num_hidden=1, no_bias=True, name='fc')
     modified_weight = mx.sym.abs(fc.get_internals()['fc_weight'])
     net = mx.sym.sum(modified_weight) + mx.sym.sum(fc)
-    net.simple_bind(ctx=mx.cpu(), data=data_shape)
+    net._simple_bind(ctx=mx.cpu(), data=data_shape)
 
 
 def test_simple_bind_gradient_graph_possible_with_cycle():
@@ -336,7 +308,7 @@ def test_simple_bind_gradient_graph_possible_with_cycle():
     for more details."""
     data = mx.symbol.Variable('data')
    res = data + data + data + data + data + data + data + data
-    res.simple_bind(ctx=mx.cpu(), data=(1,))
+    res._simple_bind(ctx=mx.cpu(), data=(1,))
 
 def test_children_same_name():
     a = mx.sym.Variable('data')
@@ -399,10 +371,10 @@ def check_cse_on_symbol(sym, expected_savings, check_data, **kwargs):
     for grad_req in ['write', 'add']:
         type_dict = {inp : dtype for inp in inputs}
         os.environ[env_var_name] = '0'
-        orig_exec = sym.simple_bind(ctx=mx.cpu(0), grad_req=grad_req,
+        orig_exec = sym._simple_bind(ctx=mx.cpu(0), grad_req=grad_req,
                                     type_dict=type_dict, **shapes)
         os.environ[env_var_name] = '1'
-        cse_exec = sym.simple_bind(ctx=mx.cpu(0), grad_req=grad_req,
+        cse_exec = sym._simple_bind(ctx=mx.cpu(0), grad_req=grad_req,
                                    type_dict=type_dict, **shapes)
         fwd_orig = orig_exec.forward(is_train=True, **data)
         out_grads = [mx.nd.ones_like(arr) for arr in fwd_orig]
@@ -501,7 +473,7 @@ def test_infershape_happens_for_all_ops_in_graph():
     try:
         # This should throw an exception as you cannot add arrays
        # with shapes [2,3] and [3,2]
-        e = s3.simple_bind(ctx=mx.cpu(), x=(2,3), grad_req='null')
+        e = s3._simple_bind(ctx=mx.cpu(), x=(2,3), grad_req='null')
     except:
         return
diff --git a/tests/python/unittest/test_thread_local.py b/tests/python/unittest/test_thread_local.py
index 41c7ff5f49c6..521bc834c4f5 100644
--- a/tests/python/unittest/test_thread_local.py
+++ b/tests/python/unittest/test_thread_local.py
@@ -159,7 +159,7 @@ def f():
         b = mx.sym.var("b")
         a_ = mx.nd.ones((2, 2))
         c_ = a_.copy()
-        func1 = (a + b).bind(mx.cpu(), args={'a': a_, 'b': c_})
+        func1 = (a + b)._bind(mx.cpu(), args={'a': a_, 'b': c_})
         func1.forward()[0].wait_to_read()
         status[0] = True
     thread = threading.Thread(target=f)
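[Editor's note] A closing note on the test_symbol.py and test_thread_local.py hunks: besides the renames, test_zero_prop2 now calls forward(is_train=True) so that the subsequent backward() is legal, and finishes with mx.nd.waitall() so any asynchronous engine error surfaces inside this test rather than in a later one. The pattern restated outside the diff, as a sketch against the same 1.x-style API:

    import numpy as np
    import mxnet as mx

    x = mx.sym.Variable('x')
    idx = mx.sym.Variable('idx')
    z = mx.sym.stop_gradient(mx.sym.batch_take(x, idx))

    exe = z._simple_bind(ctx=mx.cpu(), x=(10, 10), idx=(10,),
                         type_dict={'x': np.float32, 'idx': np.int32})
    exe.forward(is_train=True)  # is_train=True records what backward() needs
    exe.backward()
    mx.nd.waitall()             # flush the async engine so errors fail here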