From db91e8e6e4c60ef816d2f2f80b4e4c445be8a7ee Mon Sep 17 00:00:00 2001
From: SparkSnail <shinyang@microsoft.com>
Date: Wed, 25 Dec 2019 15:27:07 +0800
Subject: [PATCH 01/14] Support https in paiHost (#1873)

---
 src/nni_manager/training_service/pai/paiConfig.ts  |  2 +-
 .../training_service/pai/paiJobInfoCollector.ts    |  2 +-
 .../pai/paiK8S/paiK8STrainingService.ts            |  3 ++-
 .../training_service/pai/paiTrainingService.ts     | 14 ++++++++++++--
 .../pai/paiYarn/paiYarnTrainingService.ts          |  4 ++--
 5 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts
index c8f1c414fc..d6c405b0aa 100644
--- a/src/nni_manager/training_service/pai/paiConfig.ts
+++ b/src/nni_manager/training_service/pai/paiConfig.ts
@@ -9,7 +9,7 @@ import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus  } from '../../
 export class PAIClusterConfig {
     public readonly userName: string;
     public readonly passWord?: string;
-    public readonly host: string;
+    public host: string;
     public readonly token?: string;
 
     /**
diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts
index ce50d4cd57..07ec1cda7f 100644
--- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts
+++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts
@@ -52,7 +52,7 @@ export class PAIJobInfoCollector {
         // Rest call to get PAI job info and update status
         // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
         const getJobInfoRequest: request.Options = {
-            uri: `http://${paiClusterConfig.host}/rest-server/api/v1/user/${paiClusterConfig.userName}/jobs/${paiTrialJob.paiJobName}`,
+            uri: `${paiClusterConfig.host}/rest-server/api/v1/user/${paiClusterConfig.userName}/jobs/${paiTrialJob.paiJobName}`,
             method: 'GET',
             json: true,
                headers: {
diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
index 4b0a0f81a2..2888a9f425 100644
--- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
+++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
@@ -68,6 +68,7 @@ class PAIK8STrainingService extends PAITrainingService {
                 } else if(this.paiClusterConfig.token) {
                     this.paiToken = this.paiClusterConfig.token;
                 }
+                this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
                 break;
 
             case TrialConfigMetadataKey.TRIAL_CONFIG:
@@ -257,7 +258,7 @@ class PAIK8STrainingService extends PAITrainingService {
         // Step 3. Submit PAI job via Rest call
         // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
         const submitJobRequest: request.Options = {
-            uri: `http://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`,
+            uri: `${this.paiClusterConfig.host}/rest-server/api/v2/jobs`,
             method: 'POST',
             body: paiJobConfig,
             headers: {
diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts
index 0edbc66966..5709d4678d 100644
--- a/src/nni_manager/training_service/pai/paiTrainingService.ts
+++ b/src/nni_manager/training_service/pai/paiTrainingService.ts
@@ -165,7 +165,7 @@ abstract class PAITrainingService implements TrainingService {
         }
 
         const stopJobRequest: request.Options = {
-            uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\
+            uri: `${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\
 /jobs/${trialJobDetail.paiJobName}/executionType`, 
             method: 'PUT',
             json: true,
@@ -216,6 +216,16 @@ abstract class PAITrainingService implements TrainingService {
         return this.metricsEmitter;
     }
 
+    protected formatPAIHost(host: string): string {
+        // If the user's host starts with 'http://' or 'https://', use the original host;
+        // otherwise format it to 'http://${host}'
+        if (host.startsWith('http://') || host.startsWith('https://')) {
+            return host;
+        } else {
+            return `http://${host}`;
+        }
+    }
+
     protected async statusCheckingLoop(): Promise<void> {
         while (!this.stopping) {
             if(this.paiClusterConfig && this.paiClusterConfig.passWord) {
@@ -259,7 +269,7 @@ abstract class PAITrainingService implements TrainingService {
         }
 
         const authenticationReq: request.Options = {
-            uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/token`,
+            uri: `${this.paiClusterConfig.host}/rest-server/api/v1/token`,
             method: 'POST',
             json: true,
             body: {
diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
index 2106cf145f..6b6f905b72 100644
--- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
+++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
@@ -107,7 +107,7 @@ class PAIYarnTrainingService extends PAITrainingService {
                 } else {
                     throw new Error('pai cluster config format error, please set password or token!');
                 }
-
+                this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
                 break;
 
             case TrialConfigMetadataKey.TRIAL_CONFIG:
@@ -272,7 +272,7 @@ class PAIYarnTrainingService extends PAITrainingService {
         // Step 3. Submit PAI job via Rest call
         // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
         const submitJobRequest: request.Options = {
-            uri: `http://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}/jobs`,
+            uri: `${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}/jobs`,
             method: 'POST',
             json: true,
             body: paiJobConfig,

From 9f40659da07018ca22ab27c5c97726a2cd188852 Mon Sep 17 00:00:00 2001
From: Yuge Zhang <scottyugochang@gmail.com>
Date: Fri, 27 Dec 2019 09:15:23 +0800
Subject: [PATCH 02/14] Fix a few issues related to fixed arc and from-tuner
 arc (#1876)

---
 .../nni/nas/pytorch/classic_nas/mutator.py    | 30 +++++++++++--------
 src/sdk/pynni/nni/nas/pytorch/fixed.py        | 17 ++++-------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py
index 76f15c7646..e1a0b390f6 100644
--- a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py
+++ b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py
@@ -10,7 +10,7 @@
 
 import nni
 from nni.env_vars import trial_env_vars
-from nni.nas.pytorch.mutables import LayerChoice, InputChoice
+from nni.nas.pytorch.mutables import LayerChoice, InputChoice, MutableScope
 from nni.nas.pytorch.mutator import Mutator
 
 logger = logging.getLogger(__name__)
@@ -104,10 +104,11 @@ def _sample_input_choice(self, mutable, idx, value, search_space_item):
         search_space_item : list
             The list for corresponding search space.
         """
+        candidate_repr = search_space_item["candidates"]
         multihot_list = [False] * mutable.n_candidates
         for i, v in zip(idx, value):
-            assert 0 <= i < mutable.n_candidates and search_space_item[i] == v, \
-                "Index '{}' in search space '{}' is not '{}'".format(i, search_space_item, v)
+            assert 0 <= i < mutable.n_candidates and candidate_repr[i] == v, \
+                "Index '{}' in search space '{}' is not '{}'".format(i, candidate_repr, v)
             assert not multihot_list[i], "'{}' is selected twice in '{}', which is not allowed.".format(i, idx)
             multihot_list[i] = True
         return torch.tensor(multihot_list, dtype=torch.bool)  # pylint: disable=not-callable
@@ -121,17 +122,20 @@ def sample_final(self):
                                                                                        self._chosen_arch.keys())
         result = dict()
         for mutable in self.mutables:
-            assert mutable.key in self._chosen_arch, "Expected '{}' in chosen arch, but not found.".format(mutable.key)
-            data = self._chosen_arch[mutable.key]
-            assert isinstance(data, dict) and "_value" in data and "_idx" in data, \
-                "'{}' is not a valid choice.".format(data)
-            value = data["_value"]
-            idx = data["_idx"]
-            search_space_item = self._search_space[mutable.key]["_value"]
+            if isinstance(mutable, (LayerChoice, InputChoice)):
+                assert mutable.key in self._chosen_arch, \
+                    "Expected '{}' in chosen arch, but not found.".format(mutable.key)
+                data = self._chosen_arch[mutable.key]
+                assert isinstance(data, dict) and "_value" in data and "_idx" in data, \
+                    "'{}' is not a valid choice.".format(data)
             if isinstance(mutable, LayerChoice):
-                result[mutable.key] = self._sample_layer_choice(mutable, idx, value, search_space_item)
+                result[mutable.key] = self._sample_layer_choice(mutable, data["_idx"], data["_value"],
+                                                                self._search_space[mutable.key]["_value"])
             elif isinstance(mutable, InputChoice):
-                result[mutable.key] = self._sample_input_choice(mutable, idx, value, search_space_item)
+                result[mutable.key] = self._sample_input_choice(mutable, data["_idx"], data["_value"],
+                                                                self._search_space[mutable.key]["_value"])
+            elif isinstance(mutable, MutableScope):
+                logger.info("Mutable scope '%s' is skipped during parsing choices.", mutable.key)
             else:
                 raise TypeError("Unsupported mutable type: '%s'." % type(mutable))
         return result
@@ -190,6 +194,8 @@ def _generate_search_space(self):
                 search_space[key] = {"_type": INPUT_CHOICE,
                                      "_value": {"candidates": mutable.choose_from,
                                                 "n_chosen": mutable.n_chosen}}
+            elif isinstance(mutable, MutableScope):
+                logger.info("Mutable scope '%s' is skipped during generating search space.", mutable.key)
             else:
                 raise TypeError("Unsupported mutable type: '%s'." % type(mutable))
         return search_space
diff --git a/src/sdk/pynni/nni/nas/pytorch/fixed.py b/src/sdk/pynni/nni/nas/pytorch/fixed.py
index 6840097579..593a60ae44 100644
--- a/src/sdk/pynni/nni/nas/pytorch/fixed.py
+++ b/src/sdk/pynni/nni/nas/pytorch/fixed.py
@@ -41,18 +41,18 @@ def sample_final(self):
         return self._fixed_arc
 
 
-def _encode_tensor(data, device):
+def _encode_tensor(data):
     if isinstance(data, list):
         if all(map(lambda o: isinstance(o, bool), data)):
-            return torch.tensor(data, dtype=torch.bool, device=device)  # pylint: disable=not-callable
+            return torch.tensor(data, dtype=torch.bool)  # pylint: disable=not-callable
         else:
-            return torch.tensor(data, dtype=torch.float, device=device)  # pylint: disable=not-callable
+            return torch.tensor(data, dtype=torch.float)  # pylint: disable=not-callable
     if isinstance(data, dict):
-        return {k: _encode_tensor(v, device) for k, v in data.items()}
+        return {k: _encode_tensor(v) for k, v in data.items()}
     return data
 
 
-def apply_fixed_architecture(model, fixed_arc_path, device=None):
+def apply_fixed_architecture(model, fixed_arc_path):
     """
     Load architecture from `fixed_arc_path` and apply to model.
 
@@ -62,21 +62,16 @@ def apply_fixed_architecture(model, fixed_arc_path, device=None):
         Model with mutables.
     fixed_arc_path : str
         Path to the JSON that stores the architecture.
-    device : torch.device
-        Architecture weights will be transfered to `device`.
 
     Returns
     -------
     FixedArchitecture
     """
 
-    if device is None:
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     if isinstance(fixed_arc_path, str):
         with open(fixed_arc_path, "r") as f:
             fixed_arc = json.load(f)
-    fixed_arc = _encode_tensor(fixed_arc, device)
+    fixed_arc = _encode_tensor(fixed_arc)
     architecture = FixedArchitecture(model, fixed_arc)
-    architecture.to(device)
     architecture.reset()
     return architecture

From c5c0e72a35e47e88e38df608cfa0cf21510f8cd4 Mon Sep 17 00:00:00 2001
From: Cjkkkk <656569648@qq.com>
Date: Fri, 27 Dec 2019 11:21:01 +0800
Subject: [PATCH 03/14] export for default (#1883)

---
 src/sdk/pynni/nni/compression/torch/builtin_quantizers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sdk/pynni/nni/compression/torch/builtin_quantizers.py b/src/sdk/pynni/nni/compression/torch/builtin_quantizers.py
index 2204428574..09f88ee40e 100644
--- a/src/sdk/pynni/nni/compression/torch/builtin_quantizers.py
+++ b/src/sdk/pynni/nni/compression/torch/builtin_quantizers.py
@@ -5,7 +5,7 @@
 import torch
 from .compressor import Quantizer, QuantGrad, QuantType
 
-__all__ = ['NaiveQuantizer', 'QAT_Quantizer', 'DoReFaQuantizer']
+__all__ = ['NaiveQuantizer', 'QAT_Quantizer', 'DoReFaQuantizer', 'BNNQuantizer']
 
 logger = logging.getLogger(__name__)
 

From 9b49245e6ebdfdc6d4ade82c6f9f2333afab1c0d Mon Sep 17 00:00:00 2001
From: Cjkkkk <656569648@qq.com>
Date: Sun, 29 Dec 2019 14:13:02 +0800
Subject: [PATCH 04/14] Doc for quantization (#1881)

---
 docs/en_US/Compressor/Overview.md  | 102 ++++++++++++++++++++++++-----
 docs/en_US/Compressor/Quantizer.md |  65 ++++--------------
 2 files changed, 99 insertions(+), 68 deletions(-)

diff --git a/docs/en_US/Compressor/Overview.md b/docs/en_US/Compressor/Overview.md
index f277de5c0f..b8e2903afb 100644
--- a/docs/en_US/Compressor/Overview.md
+++ b/docs/en_US/Compressor/Overview.md
@@ -1,8 +1,11 @@
 # Compressor
+As larger neural networks with more layers and nodes are considered, reducing their storage and computational cost becomes critical, especially for some real-time applications. Model compression can be used to address this problem. 
 
 We are glad to announce the alpha release for model compression toolkit on top of NNI, it's still in the experiment phase which might evolve based on usage feedback. We'd like to invite you to use, feedback and even contribute.
 
-NNI provides an easy-to-use toolkit to help user design and use compression algorithms. It supports Tensorflow and PyTorch with unified interface. For users to compress their models, they only need to add several lines in their code. There are some popular model compression algorithms built-in in NNI. Users could further use NNI's auto tuning power to find the best compressed model, which is detailed in [Auto Model Compression](./AutoCompression.md). On the other hand, users could easily customize their new compression algorithms using NNI's interface, refer to the tutorial [here](#customize-new-compression-algorithms).
+NNI provides an easy-to-use toolkit to help user design and use compression algorithms. It currently supports PyTorch with unified interface. For users to compress their models, they only need to add several lines in their code. There are some popular model compression algorithms built-in in NNI. Users could further use NNI's auto tuning power to find the best compressed model, which is detailed in [Auto Model Compression](./AutoCompression.md). On the other hand, users could easily customize their new compression algorithms using NNI's interface, refer to the tutorial [here](#customize-new-compression-algorithms).
+
+For a survey of model compression, you can refer to this paper: [Recent Advances in Efficient Computation of Deep Convolutional Neural Networks](https://arxiv.org/pdf/1802.00939.pdf).
 
 ## Supported algorithms
 
@@ -10,6 +13,8 @@ We have provided several compression algorithms, including several pruning and q
 
 **Pruning**
 
+Pruning algorithms compress the original network by removing redundant weights or channels of layers, which can reduce model complexity and address the over-fitting issue.
+
 |Name|Brief Introduction of Algorithm|
 |---|---|
 | [Level Pruner](./Pruner.md#level-pruner) | Pruning the specified ratio on each weight based on absolute values of weights |
@@ -25,11 +30,14 @@ We have provided several compression algorithms, including several pruning and q
 
 **Quantization**
 
+Quantization algorithms compress the original network by reducing the number of bits required to represent weights or activations, which can reduce the computations and the inference time.
+
 |Name|Brief Introduction of Algorithm|
 |---|---|
 | [Naive Quantizer](./Quantizer.md#naive-quantizer) |  Quantize weights to default 8 bits |
 | [QAT Quantizer](./Quantizer.md#qat-quantizer) | Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference. [Reference Paper](http://openaccess.thecvf.com/content_cvpr_2018/papers/Jacob_Quantization_and_Training_CVPR_2018_paper.pdf)|
 | [DoReFa Quantizer](./Quantizer.md#dorefa-quantizer) | DoReFa-Net: Training Low Bitwidth Convolutional Neural Networks with Low Bitwidth Gradients. [Reference Paper](https://arxiv.org/abs/1606.06160)|
+| [BNN Quantizer](./Quantizer.md#bnn-quantizer) | Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1. [Reference Paper](https://arxiv.org/abs/1602.02830)|
 
 ## Usage of built-in compression algorithms
 
@@ -61,17 +69,47 @@ The function call `pruner.compress()` modifies user defined model (in Tensorflow
 When instantiate a compression algorithm, there is `config_list` passed in. We describe how to write this config below.
 
 ### User configuration for a compression algorithm
+When compressing a model, users may want to specify the ratio for sparsity, to specify different ratios for different types of operations, to exclude certain types of operations, or to compress only certain types of operations. For users to express these kinds of requirements, we define a configuration specification. It can be seen as a Python `list` object, where each element is a `dict` object.
+
+The `dict`s in the `list` are applied one by one; that is, the configuration in a later `dict` overwrites the configuration in earlier ones for the operations that fall within the scope of both.
 
-When compressing a model, users may want to specify the ratio for sparsity, to specify different ratios for different types of operations, to exclude certain types of operations, or to compress only a certain types of operations. For users to express these kinds of requirements, we define a configuration specification. It can be seen as a python `list` object, where each element is a `dict` object. In each `dict`, there are some keys commonly supported by NNI compression:
+#### Common keys
+In each `dict`, there are some keys commonly supported by NNI compression:
 
 * __op_types__: This is to specify what types of operations to be compressed. 'default' means following the algorithm's default setting.
 * __op_names__: This is to specify by name what operations to be compressed. If this field is omitted, operations will not be filtered by it.
 * __exclude__: Default is False. If this field is True, it means the operations with specified types and names will be excluded from the compression.
 
-There are also other keys in the `dict`, but they are specific for every compression algorithm. For example, some , some.
+#### Keys for quantization algorithms
+**If you use quantization algorithms, you need to specify more keys. If you use pruning algorithms, you can safely skip these keys**
 
-The `dict`s in the `list` are applied one by one, that is, the configurations in latter `dict` will overwrite the configurations in former ones on the operations that are within the scope of both of them. 
+* __quant_types__ : list of string. 
+
+The types of quantization you want to apply; currently 'weight', 'input' and 'output' are supported. 'weight' means applying the quantization operation
+to the weight parameter of modules. 'input' means applying the quantization operation to the input of the module's forward method. 'output' means applying the quantization operation to the output of the module's forward method, which is often called 'activation' in some papers.
+
+* __quant_bits__ : int or dict of {str : int}
 
+Bit length of quantization. When a dict is given, the key is the quantization type and the value is the bit length for that type, e.g.
+```
+{
+    'quant_bits': {
+        'weight': 8,
+        'output': 4,
+        },
+}
+```
+When the value is an int, all quantization types share the same bit length, e.g.
+```
+{
+    'quant_bits': 8,  # weight and output quantization both use 8 bits
+}
+```
+#### Other keys specified for every compression algorithm
+There are also other keys in the `dict`, but they are specific to each compression algorithm. For example, [Level Pruner](./Pruner.md#level-pruner) requires a `sparsity` key to specify how much of a model should be pruned.
+
+
+#### example
 A simple example of configuration is shown below:
 
 ```python
@@ -183,11 +221,9 @@ Some algorithms may want global information for generating masks, for example, a
 The interface for customizing quantization algorithm is similar to that of pruning algorithms. The only difference is that `calc_mask` is replaced with `quantize_weight`. `quantize_weight` directly returns the quantized weights rather than mask, because for quantization the quantized weights cannot be obtained by applying mask.
 
 ```python
-# This is writing a Quantizer in tensorflow.
-# For writing a Quantizer in PyTorch, you can simply replace
-# nni.compression.tensorflow.Quantizer with
-# nni.compression.torch.Quantizer
-class YourQuantizer(nni.compression.tensorflow.Quantizer):
+from nni.compression.torch.compressor import Quantizer
+
+class YourQuantizer(Quantizer):
     def __init__(self, model, config_list):
         """
         Suggest you to use the NNI defined spec for config
@@ -245,19 +281,55 @@ class YourQuantizer(nni.compression.tensorflow.Quantizer):
 
         return new_input
 
-    # note for pytorch version, there is no sess in input arguments
-    def update_epoch(self, epoch_num, sess):
+    def update_epoch(self, epoch_num):
         pass
 
-    # note for pytorch version, there is no sess in input arguments
-    def step(self, sess):
+    def step(self):
         """
         Can do some processing based on the model or weights binded
         in the func bind_model
         """
         pass
 ```
+#### Customize backward function
+Sometimes a quantization operation needs a customized backward function, such as the [Straight-Through Estimator](https://stackoverflow.com/questions/38361314/the-concept-of-straight-through-estimator-ste). Users can customize a backward function as follows:
+
+```python
+from nni.compression.torch.compressor import Quantizer, QuantGrad, QuantType
+
+class ClipGrad(QuantGrad):
+    @staticmethod
+    def quant_backward(tensor, grad_output, quant_type):
+        """
+        This method should be overridden by subclasses to provide a customized backward function;
+        the default implementation is the Straight-Through Estimator.
+        Parameters
+        ----------
+        tensor : Tensor
+            input of quantization operation
+        grad_output : Tensor
+            gradient of the output of quantization operation
+        quant_type : QuantType
+            the type of quantization, it can be `QuantType.QUANT_INPUT`, `QuantType.QUANT_WEIGHT`, `QuantType.QUANT_OUTPUT`,
+            you can define different behavior for different types.
+        Returns
+        -------
+        tensor
+            gradient of the input of quantization operation
+        """
+
+        # for quant_output function, set grad to zero if the absolute value of tensor is larger than 1
+        if quant_type == QuantType.QUANT_OUTPUT: 
+            grad_output[torch.abs(tensor) > 1] = 0
+        return grad_output
+
 
-### Usage of user customized compression algorithm
+class YourQuantizer(Quantizer):
+    def __init__(self, model, config_list):
+        super().__init__(model, config_list)
+        # set your customized backward function to overwrite default backward function
+        self.quant_grad = ClipGrad
+
+```
 
-__[TODO]__ ...
+If you do not customize `QuantGrad`, the default backward is Straight-Through Estimator. 
\ No newline at end of file
diff --git a/docs/en_US/Compressor/Quantizer.md b/docs/en_US/Compressor/Quantizer.md
index 67791117e1..3308f25c1b 100644
--- a/docs/en_US/Compressor/Quantizer.md
+++ b/docs/en_US/Compressor/Quantizer.md
@@ -6,12 +6,10 @@ We provide Naive Quantizer to quantizer weight to default 8 bits, you can use it
 
 ### Usage
 tensorflow
-```python
-nni.compressors.tensorflow.NaiveQuantizer(model_graph).compress()
+```python nni.compression.tensorflow.NaiveQuantizer(model_graph).compress()
 ```
 pytorch
-```python
-nni.compressors.torch.NaiveQuantizer(model).compress()
+```python nni.compression.torch.NaiveQuantizer(model).compress()
 ```
 
 ***
@@ -29,7 +27,7 @@ You can quantize your model to 8 bits with the code below before your training c
 
 PyTorch code
 ```python
-from nni.compressors.torch import QAT_Quantizer
+from nni.compression.torch import QAT_Quantizer
 model = Mnist()
 
 config_list = [{
@@ -51,22 +49,9 @@ quantizer.compress()
 You can view example for more information
 
 #### User configuration for QAT Quantizer
-* **quant_types:** : list of string
-
-type of quantization you want to apply, currently support 'weight', 'input', 'output'.
-
-* **op_types:** list of string
-
-specify the type of modules that will be quantized. eg. 'Conv2D'
-
-* **op_names:** list of string
+Common configuration needed by compression algorithms can be found at: [Common configuration](./Overview.md#user-configuration-for-a-compression-algorithm)
 
-specify the name of modules that will be quantized. eg. 'conv1'
-
-* **quant_bits:** int or dict of {str : int}
-
-bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
-when the type is int, all quantization types share same bits length.
+Configuration needed by this algorithm:
 
 * **quant_start_step:** int
 
@@ -85,7 +70,7 @@ To implement DoReFa Quantizer, you can add code below before your training code
 
 PyTorch code
 ```python
-from nni.compressors.torch import DoReFaQuantizer
+from nni.compression.torch import DoReFaQuantizer
 config_list = [{ 
     'quant_types': ['weight'],
     'quant_bits': 8, 
@@ -98,22 +83,9 @@ quantizer.compress()
 You can view example for more information
 
 #### User configuration for DoReFa Quantizer
-* **quant_types:** : list of string
-
-type of quantization you want to apply, currently support 'weight', 'input', 'output'.
-
-* **op_types:** list of string
-
-specify the type of modules that will be quantized. eg. 'Conv2D'
-
-* **op_names:** list of string
+Common configuration needed by compression algorithms can be found at: [Common configuration](./Overview.md#user-configuration-for-a-compression-algorithm)
 
-specify the name of modules that will be quantized. eg. 'conv1'
-
-* **quant_bits:** int or dict of {str : int}
-
-bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
-when the type is int, all quantization types share same bits length.
+No additional configuration is needed by this algorithm.
 
 
 ## BNN Quantizer
@@ -130,13 +102,13 @@ from nni.compression.torch import BNNQuantizer
 model = VGG_Cifar10(num_classes=10)
 
 configure_list = [{
-    'quant_types': ['weight'],
     'quant_bits': 1,
+    'quant_types': ['weight'],
     'op_types': ['Conv2d', 'Linear'],
     'op_names': ['features.0', 'features.3', 'features.7', 'features.10', 'features.14', 'features.17', 'classifier.0', 'classifier.3']
 }, {
-    'quant_types': ['output'],
     'quant_bits': 1,
+    'quant_types': ['output'],
     'op_types': ['Hardtanh'],
     'op_names': ['features.6', 'features.9', 'features.13', 'features.16', 'features.20', 'classifier.2', 'classifier.5']
 }]
@@ -148,22 +120,9 @@ model = quantizer.compress()
 You can view example [examples/model_compress/BNN_quantizer_cifar10.py]( https://github.com/microsoft/nni/tree/master/examples/model_compress/BNN_quantizer_cifar10.py) for more information.
 
 #### User configuration for BNN Quantizer
-* **quant_types:** : list of string
-
-type of quantization you want to apply, currently support 'weight', 'input', 'output'.
-
-* **op_types:** list of string
-
-specify the type of modules that will be quantized. eg. 'Conv2D'
-
-* **op_names:** list of string
-
-specify the name of modules that will be quantized. eg. 'conv1'
-
-* **quant_bits:** int or dict of {str : int}
+Common configuration needed by compression algorithms can be found at: [Common configuration](./Overview.md#user-configuration-for-a-compression-algorithm)
 
-bits length of quantization, key is the quantization type, value is the length, eg. {'weight': 8},
-when the type is int, all quantization types share same bits length.
+No additional configuration is needed by this algorithm.
 
 ### Experiment
 We implemented one of the experiments in [Binarized Neural Networks: Training Deep Neural Networks with Weights and Activations Constrained to +1 or -1](https://arxiv.org/abs/1602.02830), we quantized the **VGGNet** for CIFAR-10 in the paper. Our experiments results are as follows:

From 06db4729b00bd2d6f5582fe8528014ffe106953a Mon Sep 17 00:00:00 2001
From: QuanluZhang <Quanlu.Zhang@microsoft.com>
Date: Mon, 30 Dec 2019 07:40:22 +0800
Subject: [PATCH 05/14] refactor code structure of pruning algorithms (#1882)

---
 .../pynni/nni/compression/torch/__init__.py   |   7 +-
 .../torch/activation_rank_filter_pruners.py   | 252 ++++++
 .../nni/compression/torch/builtin_pruners.py  | 741 ------------------
 .../nni/compression/torch/lottery_ticket.py   | 150 ----
 .../pynni/nni/compression/torch/pruners.py    | 383 +++++++++
 .../{builtin_quantizers.py => quantizers.py}  |   0
 .../torch/weight_rank_filter_pruners.py       | 262 +++++++
 7 files changed, 901 insertions(+), 894 deletions(-)
 create mode 100644 src/sdk/pynni/nni/compression/torch/activation_rank_filter_pruners.py
 delete mode 100644 src/sdk/pynni/nni/compression/torch/builtin_pruners.py
 delete mode 100644 src/sdk/pynni/nni/compression/torch/lottery_ticket.py
 create mode 100644 src/sdk/pynni/nni/compression/torch/pruners.py
 rename src/sdk/pynni/nni/compression/torch/{builtin_quantizers.py => quantizers.py} (100%)
 create mode 100644 src/sdk/pynni/nni/compression/torch/weight_rank_filter_pruners.py

diff --git a/src/sdk/pynni/nni/compression/torch/__init__.py b/src/sdk/pynni/nni/compression/torch/__init__.py
index 1aa652875a..d79a8f76c4 100644
--- a/src/sdk/pynni/nni/compression/torch/__init__.py
+++ b/src/sdk/pynni/nni/compression/torch/__init__.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT license.
 
 from .compressor import LayerInfo, Compressor, Pruner, Quantizer
-from .builtin_pruners import *
-from .builtin_quantizers import *
-from .lottery_ticket import LotteryTicketPruner
+from .pruners import *
+from .weight_rank_filter_pruners import *
+from .activation_rank_filter_pruners import *
+from .quantizers import *
diff --git a/src/sdk/pynni/nni/compression/torch/activation_rank_filter_pruners.py b/src/sdk/pynni/nni/compression/torch/activation_rank_filter_pruners.py
new file mode 100644
index 0000000000..d9c811531a
--- /dev/null
+++ b/src/sdk/pynni/nni/compression/torch/activation_rank_filter_pruners.py
@@ -0,0 +1,252 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import logging
+import torch
+from .compressor import Pruner
+
+__all__ = ['ActivationAPoZRankFilterPruner', 'ActivationMeanRankFilterPruner']
+
+logger = logging.getLogger('torch activation rank filter pruners')
+
+class ActivationRankFilterPruner(Pruner):
+    """
+    A structured pruning base class that prunes the filters with the smallest
+    importance criterion in convolution layers (using activation values)
+    to achieve a preset level of network sparsity.
+    """
+
+    def __init__(self, model, config_list, activation='relu', statistics_batch_num=1):
+        """
+        Parameters
+        ----------
+        model : torch.nn.Module
+            Model to be pruned
+        config_list : list
+            support key for each list item:
+                - sparsity: fraction in [0, 1) of convolutional filters to be pruned.
+        activation : str
+            Activation applied to each collected layer output; must be 'relu' or 'relu6'
+        statistics_batch_num : int
+            Number of forward batches whose activations are collected for each layer
+        """
+
+        super().__init__(model, config_list)
+        self.mask_calculated_ops = set()
+        self.statistics_batch_num = statistics_batch_num
+        self.collected_activation = {}
+        self.hooks = {}
+        assert activation in ['relu', 'relu6']
+        if activation == 'relu':
+            self.activation = torch.nn.functional.relu
+        elif activation == 'relu6':
+            self.activation = torch.nn.functional.relu6
+        else:
+            self.activation = None
+
+    def compress(self):
+        """
+        Instrument each detected layer and hook its forward pass to collect up to statistics_batch_num activations.
+        """
+        modules_to_compress = self.detect_modules_to_compress()
+        for layer, config in modules_to_compress:
+            self._instrument_layer(layer, config)
+            self.collected_activation[layer.name] = []
+
+            def _hook(module_, input_, output, name=layer.name):
+                if len(self.collected_activation[name]) < self.statistics_batch_num:
+                    self.collected_activation[name].append(self.activation(output.detach().cpu()))
+
+            layer.module.register_forward_hook(_hook)
+        return self.bound_model
+
+    def get_mask(self, base_mask, activations, num_prune):
+        raise NotImplementedError('{} get_mask is not implemented'.format(self.__class__.__name__))
+
+    def calc_mask(self, layer, config):
+        """
+        Calculate the mask of given layer (an all-ones mask until statistics_batch_num batches have been collected).
+        Filters with the smallest importance criterion which is calculated from the activation are masked.
+
+        Parameters
+        ----------
+        layer : LayerInfo
+            the layer to instrument the compression operation
+        config : dict
+            layer's pruning config; 'sparsity' and 'op_types' are read from it
+
+        Returns
+        -------
+        dict
+            mask dict; the default all-ones mask has keys 'weight' and 'bias' ('bias' is None if the layer has no bias)
+        """
+
+        weight = layer.module.weight.data
+        op_name = layer.name
+        op_type = layer.type
+        assert 0 <= config.get('sparsity') < 1, "sparsity must in the range [0, 1)"
+        assert op_type in ['Conv2d'], "only support Conv2d"
+        assert op_type in config.get('op_types')
+        if op_name in self.mask_calculated_ops:
+            assert op_name in self.mask_dict
+            return self.mask_dict.get(op_name)
+        mask_weight = torch.ones(weight.size()).type_as(weight).detach()
+        if hasattr(layer.module, 'bias') and layer.module.bias is not None:
+            mask_bias = torch.ones(layer.module.bias.size()).type_as(layer.module.bias).detach()
+        else:
+            mask_bias = None
+        mask = {'weight': mask_weight, 'bias': mask_bias}
+        try:
+            filters = weight.size(0)
+            num_prune = int(filters * config.get('sparsity'))
+            if filters < 2 or num_prune < 1 or len(self.collected_activation[layer.name]) < self.statistics_batch_num:
+                return mask
+            mask = self.get_mask(mask, self.collected_activation[layer.name], num_prune)
+        finally:
+            if len(self.collected_activation[layer.name]) == self.statistics_batch_num:
+                self.mask_dict.update({op_name: mask})
+                self.mask_calculated_ops.add(op_name)
+        return mask
+
+
+class ActivationAPoZRankFilterPruner(ActivationRankFilterPruner):
+    """
+    A structured pruning algorithm that prunes the filters with the
+    largest APoZ(average percentage of zeros) of output activations.
+    Hengyuan Hu, Rui Peng, Yu-Wing Tai and Chi-Keung Tang,
+    "Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures", ICLR 2016.
+    https://arxiv.org/abs/1607.03250
+    """
+
+    def __init__(self, model, config_list, activation='relu', statistics_batch_num=1):
+        """
+        Parameters
+        ----------
+        model : torch.nn.Module
+            Model to be pruned
+        config_list : list
+            support key for each list item:
+                - sparsity: fraction in [0, 1) of convolutional filters to be pruned.
+        activation : str
+            Activation applied to each collected layer output; must be 'relu' or 'relu6'
+        statistics_batch_num : int
+            Number of forward batches whose activations are collected for each layer
+        """
+        super().__init__(model, config_list, activation, statistics_batch_num)
+
+    def get_mask(self, base_mask, activations, num_prune):
+        """
+        Calculate the mask of given layer.
+        Filters with the largest APoZ(average percentage of zeros) of output activations are masked.
+
+        Parameters
+        ----------
+        base_mask : dict
+            The basic mask with the same shape of weight, all item in the basic mask is 1.
+        activations : list
+            Layer's output activations
+        num_prune : int
+            Num of filters to prune
+
+        Returns
+        -------
+        dict
+            `base_mask`, updated in place with the weight (and bias, if any) of the pruned filters set to 0
+        """
+        apoz = self._calc_apoz(activations)
+        prune_indices = torch.argsort(apoz, descending=True)[:num_prune]
+        for idx in prune_indices:
+            base_mask['weight'][idx] = 0.
+            if base_mask['bias'] is not None:
+                base_mask['bias'][idx] = 0.
+        return base_mask
+
+    def _calc_apoz(self, activations):
+        """
+        Calculate APoZ(average percentage of zeros) of activations.
+
+        Parameters
+        ----------
+        activations : list
+            Layer's output activations; concatenated on dim 0 and reduced over dims (0, 2, 3), so 4-D tensors are expected
+
+        Returns
+        -------
+        torch.Tensor
+            Filter's APoZ as a float64 ratio in [0, 1]; zero counts are summed as float to avoid integer division
+        """
+        activations = torch.cat(activations, 0)
+        _eq_zero = torch.eq(activations, torch.zeros_like(activations))
+        _apoz = torch.sum(_eq_zero, dim=(0, 2, 3), dtype=torch.float64) / torch.numel(_eq_zero[:, 0, :, :])
+        return _apoz
+
+
+class ActivationMeanRankFilterPruner(ActivationRankFilterPruner):
+    """
+    A structured pruning algorithm that prunes the filters with the
+    smallest mean value of output activations.
+    Pavlo Molchanov, Stephen Tyree, Tero Karras, Timo Aila and Jan Kautz,
+    "Pruning Convolutional Neural Networks for Resource Efficient Inference", ICLR 2017.
+    https://arxiv.org/abs/1611.06440
+    """
+
+    def __init__(self, model, config_list, activation='relu', statistics_batch_num=1):
+        """
+        Parameters
+        ----------
+        model : torch.nn.Module
+            Model to be pruned
+        config_list : list
+            support key for each list item:
+                - sparsity: fraction in [0, 1) of convolutional filters to be pruned.
+        activation : str
+            Activation applied to each collected layer output; must be 'relu' or 'relu6'
+        statistics_batch_num : int
+            Number of forward batches whose activations are collected for each layer
+        """
+        super().__init__(model, config_list, activation, statistics_batch_num)
+
+    def get_mask(self, base_mask, activations, num_prune):
+        """
+        Calculate the mask of given layer.
+        Filters with the smallest mean value of output activations are masked.
+
+        Parameters
+        ----------
+        base_mask : dict
+            The basic mask with the same shape of weight, all item in the basic mask is 1.
+        activations : list
+            Layer's output activations
+        num_prune : int
+            Num of filters to prune
+
+        Returns
+        -------
+        dict
+            `base_mask`, updated in place with the weight (and bias, if any) of the pruned filters set to 0
+        """
+        mean_activation = self._cal_mean_activation(activations)
+        prune_indices = torch.argsort(mean_activation)[:num_prune]
+        for idx in prune_indices:
+            base_mask['weight'][idx] = 0.
+            if base_mask['bias'] is not None:
+                base_mask['bias'][idx] = 0.
+        return base_mask
+
+    def _cal_mean_activation(self, activations):
+        """
+        Calculate mean value of activations.
+
+        Parameters
+        ----------
+        activations : list
+            Layer's output activations; concatenated on dim 0 and averaged over dims (0, 2, 3), so 4-D tensors are expected
+
+        Returns
+        -------
+        torch.Tensor
+            Filter's mean value of the output activations
+        """
+        activations = torch.cat(activations, 0)
+        mean_activation = torch.mean(activations, dim=(0, 2, 3))
+        return mean_activation
diff --git a/src/sdk/pynni/nni/compression/torch/builtin_pruners.py b/src/sdk/pynni/nni/compression/torch/builtin_pruners.py
deleted file mode 100644
index 8e19ea394d..0000000000
--- a/src/sdk/pynni/nni/compression/torch/builtin_pruners.py
+++ /dev/null
@@ -1,741 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import logging
-import torch
-from .compressor import Pruner
-
-__all__ = ['LevelPruner', 'AGP_Pruner', 'SlimPruner', 'L1FilterPruner', 'L2FilterPruner', 'FPGMPruner',
-           'ActivationAPoZRankFilterPruner', 'ActivationMeanRankFilterPruner']
-
-logger = logging.getLogger('torch pruner')
-
-
-class LevelPruner(Pruner):
-    """
-    Prune to an exact pruning level specification
-    """
-
-    def __init__(self, model, config_list):
-        """
-        Parameters
-        ----------
-        model : torch.nn.module
-            Model to be pruned
-        config_list : list
-            List on pruning configs
-        """
-
-        super().__init__(model, config_list)
-        self.mask_calculated_ops = set()
-
-    def calc_mask(self, layer, config):
-        """
-        Calculate the mask of given layer
-        Parameters
-        ----------
-        layer : LayerInfo
-            the layer to instrument the compression operation
-        config : dict
-            layer's pruning config
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-
-        weight = layer.module.weight.data
-        op_name = layer.name
-        if op_name not in self.mask_calculated_ops:
-            w_abs = weight.abs()
-            k = int(weight.numel() * config['sparsity'])
-            if k == 0:
-                return torch.ones(weight.shape).type_as(weight)
-            threshold = torch.topk(w_abs.view(-1), k, largest=False)[0].max()
-            mask_weight = torch.gt(w_abs, threshold).type_as(weight)
-            mask = {'weight': mask_weight}
-            self.mask_dict.update({op_name: mask})
-            self.mask_calculated_ops.add(op_name)
-        else:
-            assert op_name in self.mask_dict, "op_name not in the mask_dict"
-            mask = self.mask_dict[op_name]
-        return mask
-
-
-class AGP_Pruner(Pruner):
-    """
-    An automated gradual pruning algorithm that prunes the smallest magnitude
-    weights to achieve a preset level of network sparsity.
-    Michael Zhu and Suyog Gupta, "To prune, or not to prune: exploring the
-    efficacy of pruning for model compression", 2017 NIPS Workshop on Machine
-    Learning of Phones and other Consumer Devices,
-    https://arxiv.org/pdf/1710.01878.pdf
-    """
-
-    def __init__(self, model, config_list):
-        """
-        Parameters
-        ----------
-        model : torch.nn.module
-            Model to be pruned
-        config_list : list
-            List on pruning configs
-        """
-
-        super().__init__(model, config_list)
-        self.now_epoch = 0
-        self.if_init_list = {}
-
-    def calc_mask(self, layer, config):
-        """
-        Calculate the mask of given layer
-        Parameters
-        ----------
-        layer : LayerInfo
-            the layer to instrument the compression operation
-        config : dict
-            layer's pruning config
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-
-        weight = layer.module.weight.data
-        op_name = layer.name
-        start_epoch = config.get('start_epoch', 0)
-        freq = config.get('frequency', 1)
-        if self.now_epoch >= start_epoch and self.if_init_list.get(op_name, True) \
-                and (self.now_epoch - start_epoch) % freq == 0:
-            mask = self.mask_dict.get(op_name, {'weight': torch.ones(weight.shape).type_as(weight)})
-            target_sparsity = self.compute_target_sparsity(config)
-            k = int(weight.numel() * target_sparsity)
-            if k == 0 or target_sparsity >= 1 or target_sparsity <= 0:
-                return mask
-            # if we want to generate new mask, we should update weigth first
-            w_abs = weight.abs() * mask
-            threshold = torch.topk(w_abs.view(-1), k, largest=False)[0].max()
-            new_mask = {'weight': torch.gt(w_abs, threshold).type_as(weight)}
-            self.mask_dict.update({op_name: new_mask})
-            self.if_init_list.update({op_name: False})
-        else:
-            new_mask = self.mask_dict.get(op_name, {'weight': torch.ones(weight.shape).type_as(weight)})
-        return new_mask
-
-    def compute_target_sparsity(self, config):
-        """
-        Calculate the sparsity for pruning
-        Parameters
-        ----------
-        config : dict
-            Layer's pruning config
-        Returns
-        -------
-        float
-            Target sparsity to be pruned
-        """
-
-        end_epoch = config.get('end_epoch', 1)
-        start_epoch = config.get('start_epoch', 0)
-        freq = config.get('frequency', 1)
-        final_sparsity = config.get('final_sparsity', 0)
-        initial_sparsity = config.get('initial_sparsity', 0)
-        if end_epoch <= start_epoch or initial_sparsity >= final_sparsity:
-            logger.warning('your end epoch <= start epoch or initial_sparsity >= final_sparsity')
-            return final_sparsity
-
-        if end_epoch <= self.now_epoch:
-            return final_sparsity
-
-        span = ((end_epoch - start_epoch - 1) // freq) * freq
-        assert span > 0
-        target_sparsity = (final_sparsity +
-                           (initial_sparsity - final_sparsity) *
-                           (1.0 - ((self.now_epoch - start_epoch) / span)) ** 3)
-        return target_sparsity
-
-    def update_epoch(self, epoch):
-        """
-        Update epoch
-        Parameters
-        ----------
-        epoch : int
-            current training epoch
-        """
-
-        if epoch > 0:
-            self.now_epoch = epoch
-            for k in self.if_init_list.keys():
-                self.if_init_list[k] = True
-
-
-class SlimPruner(Pruner):
-    """
-    A structured pruning algorithm that prunes channels by pruning the weights of BN layers.
-    Zhuang Liu, Jianguo Li, Zhiqiang Shen, Gao Huang, Shoumeng Yan and Changshui Zhang
-    "Learning Efficient Convolutional Networks through Network Slimming", 2017 ICCV
-    https://arxiv.org/pdf/1708.06519.pdf
-    """
-
-    def __init__(self, model, config_list):
-        """
-        Parameters
-        ----------
-        config_list : list
-            support key for each list item:
-                - sparsity: percentage of convolutional filters to be pruned.
-        """
-
-        super().__init__(model, config_list)
-        self.mask_calculated_ops = set()
-        weight_list = []
-        if len(config_list) > 1:
-            logger.warning('Slim pruner only supports 1 configuration')
-        config = config_list[0]
-        for (layer, config) in self.detect_modules_to_compress():
-            assert layer.type == 'BatchNorm2d', 'SlimPruner only supports 2d batch normalization layer pruning'
-            weight_list.append(layer.module.weight.data.abs().clone())
-        all_bn_weights = torch.cat(weight_list)
-        k = int(all_bn_weights.shape[0] * config['sparsity'])
-        self.global_threshold = torch.topk(all_bn_weights.view(-1), k, largest=False)[0].max()
-
-    def calc_mask(self, layer, config):
-        """
-        Calculate the mask of given layer.
-        Scale factors with the smallest absolute value in the BN layer are masked.
-        Parameters
-        ----------
-        layer : LayerInfo
-            the layer to instrument the compression operation
-        config : dict
-            layer's pruning config
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-
-        weight = layer.module.weight.data
-        op_name = layer.name
-        op_type = layer.type
-        assert op_type == 'BatchNorm2d', 'SlimPruner only supports 2d batch normalization layer pruning'
-        if op_name in self.mask_calculated_ops:
-            assert op_name in self.mask_dict
-            return self.mask_dict.get(op_name)
-        base_mask = torch.ones(weight.size()).type_as(weight).detach()
-        mask = {'weight': base_mask.detach(), 'bias': base_mask.clone().detach()}
-        try:
-            filters = weight.size(0)
-            num_prune = int(filters * config.get('sparsity'))
-            if filters < 2 or num_prune < 1:
-                return mask
-            w_abs = weight.abs()
-            mask_weight = torch.gt(w_abs, self.global_threshold).type_as(weight)
-            mask_bias = mask_weight.clone()
-            mask = {'weight': mask_weight.detach(), 'bias': mask_bias.detach()}
-        finally:
-            self.mask_dict.update({layer.name: mask})
-            self.mask_calculated_ops.add(layer.name)
-
-        return mask
-
-
-class WeightRankFilterPruner(Pruner):
-    """
-    A structured pruning base class that prunes the filters with the smallest
-    importance criterion in convolution layers to achieve a preset level of network sparsity.
-    """
-
-    def __init__(self, model, config_list):
-        """
-        Parameters
-        ----------
-        model : torch.nn.module
-            Model to be pruned
-        config_list : list
-            support key for each list item:
-                - sparsity: percentage of convolutional filters to be pruned.
-        """
-
-        super().__init__(model, config_list)
-        self.mask_calculated_ops = set()  # operations whose mask has been calculated
-
-    def _get_mask(self, base_mask, weight, num_prune):
-        return {'weight': None, 'bias': None}
-
-    def calc_mask(self, layer, config):
-        """
-        Calculate the mask of given layer.
-        Filters with the smallest importance criterion of the kernel weights are masked.
-        Parameters
-        ----------
-        layer : LayerInfo
-            the layer to instrument the compression operation
-        config : dict
-            layer's pruning config
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-
-        weight = layer.module.weight.data
-        op_name = layer.name
-        op_type = layer.type
-        assert 0 <= config.get('sparsity') < 1, "sparsity must in the range [0, 1)"
-        assert op_type in ['Conv1d', 'Conv2d'], "only support Conv1d and Conv2d"
-        assert op_type in config.get('op_types')
-        if op_name in self.mask_calculated_ops:
-            assert op_name in self.mask_dict
-            return self.mask_dict.get(op_name)
-        mask_weight = torch.ones(weight.size()).type_as(weight).detach()
-        if hasattr(layer.module, 'bias') and layer.module.bias is not None:
-            mask_bias = torch.ones(layer.module.bias.size()).type_as(layer.module.bias).detach()
-        else:
-            mask_bias = None
-        mask = {'weight': mask_weight, 'bias': mask_bias}
-        try:
-            filters = weight.size(0)
-            num_prune = int(filters * config.get('sparsity'))
-            if filters < 2 or num_prune < 1:
-                return mask
-            mask = self._get_mask(mask, weight, num_prune)
-        finally:
-            self.mask_dict.update({op_name: mask})
-            self.mask_calculated_ops.add(op_name)
-        return mask
-
-
-class L1FilterPruner(WeightRankFilterPruner):
-    """
-    A structured pruning algorithm that prunes the filters of smallest magnitude
-    weights sum in the convolution layers to achieve a preset level of network sparsity.
-    Hao Li, Asim Kadav, Igor Durdanovic, Hanan Samet and Hans Peter Graf,
-    "PRUNING FILTERS FOR EFFICIENT CONVNETS", 2017 ICLR
-    https://arxiv.org/abs/1608.08710
-    """
-
-    def __init__(self, model, config_list):
-        """
-        Parameters
-        ----------
-        model : torch.nn.module
-            Model to be pruned
-        config_list : list
-            support key for each list item:
-                - sparsity: percentage of convolutional filters to be pruned.
-        """
-
-        super().__init__(model, config_list)
-
-    def _get_mask(self, base_mask, weight, num_prune):
-        """
-        Calculate the mask of given layer.
-        Filters with the smallest sum of its absolute kernel weights are masked.
-        Parameters
-        ----------
-        base_mask : dict
-            The basic mask with the same shape of weight or bias, all item in the basic mask is 1.
-        weight : torch.Tensor
-            Layer's weight
-        num_prune : int
-            Num of filters to prune
-
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-
-        filters = weight.shape[0]
-        w_abs = weight.abs()
-        w_abs_structured = w_abs.view(filters, -1).sum(dim=1)
-        threshold = torch.topk(w_abs_structured.view(-1), num_prune, largest=False)[0].max()
-        mask_weight = torch.gt(w_abs_structured, threshold)[:, None, None, None].expand_as(weight).type_as(weight)
-        mask_bias = torch.gt(w_abs_structured, threshold).type_as(weight)
-
-        return {'weight': mask_weight.detach(), 'bias': mask_bias.detach()}
-
-
-class L2FilterPruner(WeightRankFilterPruner):
-    """
-    A structured pruning algorithm that prunes the filters with the
-    smallest L2 norm of the weights.
-    """
-
-    def __init__(self, model, config_list):
-        """
-        Parameters
-        ----------
-        model : torch.nn.module
-            Model to be pruned
-        config_list : list
-            support key for each list item:
-                - sparsity: percentage of convolutional filters to be pruned.
-        """
-
-        super().__init__(model, config_list)
-
-    def _get_mask(self, base_mask, weight, num_prune):
-        """
-        Calculate the mask of given layer.
-        Filters with the smallest L2 norm of the absolute kernel weights are masked.
-        Parameters
-        ----------
-        base_mask : dict
-            The basic mask with the same shape of weight or bias, all item in the basic mask is 1.
-        weight : torch.Tensor
-            Layer's weight
-        num_prune : int
-            Num of filters to prune
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-        filters = weight.shape[0]
-        w = weight.view(filters, -1)
-        w_l2_norm = torch.sqrt((w ** 2).sum(dim=1))
-        threshold = torch.topk(w_l2_norm.view(-1), num_prune, largest=False)[0].max()
-        mask_weight = torch.gt(w_l2_norm, threshold)[:, None, None, None].expand_as(weight).type_as(weight)
-        mask_bias = torch.gt(w_l2_norm, threshold).type_as(weight)
-
-        return {'weight': mask_weight.detach(), 'bias': mask_bias.detach()}
-
-
-class FPGMPruner(WeightRankFilterPruner):
-    """
-    A filter pruner via geometric median.
-    "Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration",
-    https://arxiv.org/pdf/1811.00250.pdf
-    """
-
-    def __init__(self, model, config_list):
-        """
-        Parameters
-        ----------
-        model : pytorch model
-            the model user wants to compress
-        config_list: list
-            support key for each list item:
-                - sparsity: percentage of convolutional filters to be pruned.
-        """
-        super().__init__(model, config_list)
-
-    def _get_mask(self, base_mask, weight, num_prune):
-        """
-        Calculate the mask of given layer.
-        Filters with the smallest sum of its absolute kernel weights are masked.
-        Parameters
-        ----------
-        base_mask : dict
-            The basic mask with the same shape of weight and bias, all item in the basic mask is 1.
-        weight : torch.Tensor
-            Layer's weight
-        num_prune : int
-            Num of filters to prune
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-        min_gm_idx = self._get_min_gm_kernel_idx(weight, num_prune)
-        for idx in min_gm_idx:
-            base_mask['weight'][idx] = 0.
-            if base_mask['bias'] is not None:
-                base_mask['bias'][idx] = 0.
-        return base_mask
-
-    def _get_min_gm_kernel_idx(self, weight, n):
-        assert len(weight.size()) in [3, 4]
-
-        dist_list = []
-        for out_i in range(weight.size(0)):
-            dist_sum = self._get_distance_sum(weight, out_i)
-            dist_list.append((dist_sum, out_i))
-        min_gm_kernels = sorted(dist_list, key=lambda x: x[0])[:n]
-        return [x[1] for x in min_gm_kernels]
-
-    def _get_distance_sum(self, weight, out_idx):
-        """
-        Calculate the total distance between a specified filter (by out_idex and in_idx) and
-        all other filters.
-        Optimized verision of following naive implementation:
-        def _get_distance_sum(self, weight, in_idx, out_idx):
-            w = weight.view(-1, weight.size(-2), weight.size(-1))
-            dist_sum = 0.
-            for k in w:
-                dist_sum += torch.dist(k, weight[in_idx, out_idx], p=2)
-            return dist_sum
-        Parameters
-        ----------
-        weight: Tensor
-            convolutional filter weight
-        out_idx: int
-            output channel index of specified filter, this method calculates the total distance
-            between this specified filter and all other filters.
-        Returns
-        -------
-        float32
-            The total distance
-        """
-        logger.debug('weight size: %s', weight.size())
-        assert len(weight.size()) in [3, 4], 'unsupported weight shape'
-
-        w = weight.view(weight.size(0), -1)
-        anchor_w = w[out_idx].unsqueeze(0).expand(w.size(0), w.size(1))
-        x = w - anchor_w
-        x = (x * x).sum(-1)
-        x = torch.sqrt(x)
-        return x.sum()
-
-    def update_epoch(self, epoch):
-        self.mask_calculated_ops = set()
-
-
-class ActivationRankFilterPruner(Pruner):
-    """
-    A structured pruning base class that prunes the filters with the smallest
-    importance criterion in convolution layers to achieve a preset level of network sparsity.
-    Hengyuan Hu, Rui Peng, Yu-Wing Tai and Chi-Keung Tang,
-    "Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures", ICLR 2016.
-    https://arxiv.org/abs/1607.03250
-    Pavlo Molchanov, Stephen Tyree, Tero Karras, Timo Aila and Jan Kautz,
-    "Pruning Convolutional Neural Networks for Resource Efficient Inference", ICLR 2017.
-    https://arxiv.org/abs/1611.06440
-    """
-
-    def __init__(self, model, config_list, activation='relu', statistics_batch_num=1):
-        """
-        Parameters
-        ----------
-        model : torch.nn.module
-            Model to be pruned
-        config_list : list
-            support key for each list item:
-                - sparsity: percentage of convolutional filters to be pruned.
-        activation : str
-            Activation function
-        statistics_batch_num : int
-            Num of batches for activation statistics
-        """
-
-        super().__init__(model, config_list)
-        self.mask_calculated_ops = set()
-        self.statistics_batch_num = statistics_batch_num
-        self.collected_activation = {}
-        self.hooks = {}
-        assert activation in ['relu', 'relu6']
-        if activation == 'relu':
-            self.activation = torch.nn.functional.relu
-        elif activation == 'relu6':
-            self.activation = torch.nn.functional.relu6
-        else:
-            self.activation = None
-
-    def compress(self):
-        """
-        Compress the model, register a hook for collecting activations.
-        """
-        modules_to_compress = self.detect_modules_to_compress()
-        for layer, config in modules_to_compress:
-            self._instrument_layer(layer, config)
-            self.collected_activation[layer.name] = []
-
-            def _hook(module_, input_, output, name=layer.name):
-                if len(self.collected_activation[name]) < self.statistics_batch_num:
-                    self.collected_activation[name].append(self.activation(output.detach().cpu()))
-
-            layer.module.register_forward_hook(_hook)
-        return self.bound_model
-
-    def _get_mask(self, base_mask, activations, num_prune):
-        return {'weight': None, 'bias': None}
-
-    def calc_mask(self, layer, config):
-        """
-        Calculate the mask of given layer.
-        Filters with the smallest importance criterion which is calculated from the activation are masked.
-
-        Parameters
-        ----------
-        layer : LayerInfo
-            the layer to instrument the compression operation
-        config : dict
-            layer's pruning config
-
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-
-        weight = layer.module.weight.data
-        op_name = layer.name
-        op_type = layer.type
-        assert 0 <= config.get('sparsity') < 1, "sparsity must in the range [0, 1)"
-        assert op_type in ['Conv2d'], "only support Conv2d"
-        assert op_type in config.get('op_types')
-        if op_name in self.mask_calculated_ops:
-            assert op_name in self.mask_dict
-            return self.mask_dict.get(op_name)
-        mask_weight = torch.ones(weight.size()).type_as(weight).detach()
-        if hasattr(layer.module, 'bias') and layer.module.bias is not None:
-            mask_bias = torch.ones(layer.module.bias.size()).type_as(layer.module.bias).detach()
-        else:
-            mask_bias = None
-        mask = {'weight': mask_weight, 'bias': mask_bias}
-        try:
-            filters = weight.size(0)
-            num_prune = int(filters * config.get('sparsity'))
-            if filters < 2 or num_prune < 1 or len(self.collected_activation[layer.name]) < self.statistics_batch_num:
-                return mask
-            mask = self._get_mask(mask, self.collected_activation[layer.name], num_prune)
-        finally:
-            if len(self.collected_activation[layer.name]) == self.statistics_batch_num:
-                self.mask_dict.update({op_name: mask})
-                self.mask_calculated_ops.add(op_name)
-        return mask
-
-
-class ActivationAPoZRankFilterPruner(ActivationRankFilterPruner):
-    """
-    A structured pruning algorithm that prunes the filters with the
-    smallest APoZ(average percentage of zeros) of output activations.
-    Hengyuan Hu, Rui Peng, Yu-Wing Tai and Chi-Keung Tang,
-    "Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures", ICLR 2016.
-    https://arxiv.org/abs/1607.03250
-    """
-
-    def __init__(self, model, config_list, activation='relu', statistics_batch_num=1):
-        """
-        Parameters
-        ----------
-        model : torch.nn.module
-            Model to be pruned
-        config_list : list
-            support key for each list item:
-                - sparsity: percentage of convolutional filters to be pruned.
-        activation : str
-            Activation function
-        statistics_batch_num : int
-            Num of batches for activation statistics
-        """
-        super().__init__(model, config_list, activation, statistics_batch_num)
-
-    def _get_mask(self, base_mask, activations, num_prune):
-        """
-        Calculate the mask of given layer.
-        Filters with the smallest APoZ(average percentage of zeros) of output activations are masked.
-
-        Parameters
-        ----------
-        base_mask : dict
-            The basic mask with the same shape of weight, all item in the basic mask is 1.
-        activations : list
-            Layer's output activations
-        num_prune : int
-            Num of filters to prune
-
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-        apoz = self._calc_apoz(activations)
-        prune_indices = torch.argsort(apoz, descending=True)[:num_prune]
-        for idx in prune_indices:
-            base_mask['weight'][idx] = 0.
-            if base_mask['bias'] is not None:
-                base_mask['bias'][idx] = 0.
-        return base_mask
-
-    def _calc_apoz(self, activations):
-        """
-        Calculate APoZ(average percentage of zeros) of activations.
-
-        Parameters
-        ----------
-        activations : list
-            Layer's output activations
-
-        Returns
-        -------
-        torch.Tensor
-            Filter's APoZ(average percentage of zeros) of the activations
-        """
-        activations = torch.cat(activations, 0)
-        _eq_zero = torch.eq(activations, torch.zeros_like(activations))
-        _apoz = torch.sum(_eq_zero, dim=(0, 2, 3)) / torch.numel(_eq_zero[:, 0, :, :])
-        return _apoz
-
-
-class ActivationMeanRankFilterPruner(ActivationRankFilterPruner):
-    """
-    A structured pruning algorithm that prunes the filters with the
-    smallest mean value of output activations.
-    Pavlo Molchanov, Stephen Tyree, Tero Karras, Timo Aila and Jan Kautz,
-    "Pruning Convolutional Neural Networks for Resource Efficient Inference", ICLR 2017.
-    https://arxiv.org/abs/1611.06440
-    """
-
-    def __init__(self, model, config_list, activation='relu', statistics_batch_num=1):
-        """
-        Parameters
-        ----------
-        model : torch.nn.module
-            Model to be pruned
-        config_list : list
-            support key for each list item:
-                - sparsity: percentage of convolutional filters to be pruned.
-        activation : str
-            Activation function
-        statistics_batch_num : int
-            Num of batches for activation statistics
-        """
-        super().__init__(model, config_list, activation, statistics_batch_num)
-
-    def _get_mask(self, base_mask, activations, num_prune):
-        """
-        Calculate the mask of given layer.
-        Filters with the smallest APoZ(average percentage of zeros) of output activations are masked.
-
-        Parameters
-        ----------
-        base_mask : dict
-            The basic mask with the same shape of weight, all item in the basic mask is 1.
-        activations : list
-            Layer's output activations
-        num_prune : int
-            Num of filters to prune
-
-        Returns
-        -------
-        dict
-            dictionary for storing masks
-        """
-        mean_activation = self._cal_mean_activation(activations)
-        prune_indices = torch.argsort(mean_activation)[:num_prune]
-        for idx in prune_indices:
-            base_mask['weight'][idx] = 0.
-            if base_mask['bias'] is not None:
-                base_mask['bias'][idx] = 0.
-        return base_mask
-
-    def _cal_mean_activation(self, activations):
-        """
-        Calculate mean value of activations.
-
-        Parameters
-        ----------
-        activations : list
-            Layer's output activations
-
-        Returns
-        -------
-        torch.Tensor
-            Filter's mean value of the output activations
-        """
-        activations = torch.cat(activations, 0)
-        mean_activation = torch.mean(activations, dim=(0, 2, 3))
-        return mean_activation
diff --git a/src/sdk/pynni/nni/compression/torch/lottery_ticket.py b/src/sdk/pynni/nni/compression/torch/lottery_ticket.py
deleted file mode 100644
index 233d90ced8..0000000000
--- a/src/sdk/pynni/nni/compression/torch/lottery_ticket.py
+++ /dev/null
@@ -1,150 +0,0 @@
-import copy
-import logging
-import torch
-from .compressor import Pruner
-
-_logger = logging.getLogger(__name__)
-
-
-class LotteryTicketPruner(Pruner):
-    """
-    This is a Pytorch implementation of the paper "The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks",
-    following NNI model compression interface.
-
-    1. Randomly initialize a neural network f(x;theta_0) (where theta_0 follows D_{theta}).
-    2. Train the network for j iterations, arriving at parameters theta_j.
-    3. Prune p% of the parameters in theta_j, creating a mask m.
-    4. Reset the remaining parameters to their values in theta_0, creating the winning ticket f(x;m*theta_0).
-    5. Repeat step 2, 3, and 4.
-    """
-
-    def __init__(self, model, config_list, optimizer, lr_scheduler=None, reset_weights=True):
-        """
-        Parameters
-        ----------
-        model : pytorch model
-            The model to be pruned
-        config_list : list
-            Supported keys:
-                - prune_iterations : The number of rounds for the iterative pruning.
-                - sparsity : The final sparsity when the compression is done.
-        optimizer : pytorch optimizer
-            The optimizer for the model
-        lr_scheduler : pytorch lr scheduler
-            The lr scheduler for the model if used
-        reset_weights : bool
-            Whether reset weights and optimizer at the beginning of each round.
-        """
-        super().__init__(model, config_list)
-        self.curr_prune_iteration = None
-        self.prune_iterations = self._validate_config(config_list)
-
-        # save init weights and optimizer
-        self.reset_weights = reset_weights
-        if self.reset_weights:
-            self._model = model
-            self._optimizer = optimizer
-            self._model_state = copy.deepcopy(model.state_dict())
-            self._optimizer_state = copy.deepcopy(optimizer.state_dict())
-            self._lr_scheduler = lr_scheduler
-            if lr_scheduler is not None:
-                self._scheduler_state = copy.deepcopy(lr_scheduler.state_dict())
-
-    def _validate_config(self, config_list):
-        prune_iterations = None
-        for config in config_list:
-            assert 'prune_iterations' in config, 'prune_iterations must exist in your config'
-            assert 'sparsity' in config, 'sparsity must exist in your config'
-            if prune_iterations is not None:
-                assert prune_iterations == config[
-                    'prune_iterations'], 'The values of prune_iterations must be equal in your config'
-            prune_iterations = config['prune_iterations']
-        return prune_iterations
-
-    def _print_masks(self, print_mask=False):
-        torch.set_printoptions(threshold=1000)
-        for op_name in self.mask_dict.keys():
-            mask = self.mask_dict[op_name]
-            print('op name: ', op_name)
-            if print_mask:
-                print('mask: ', mask)
-            # calculate current sparsity
-            mask_num = mask['weight'].sum().item()
-            mask_size = mask['weight'].numel()
-            print('sparsity: ', 1 - mask_num / mask_size)
-        torch.set_printoptions(profile='default')
-
-    def _calc_sparsity(self, sparsity):
-        keep_ratio_once = (1 - sparsity) ** (1 / self.prune_iterations)
-        curr_keep_ratio = keep_ratio_once ** self.curr_prune_iteration
-        return max(1 - curr_keep_ratio, 0)
-
-    def _calc_mask(self, weight, sparsity, op_name):
-        if self.curr_prune_iteration == 0:
-            mask = torch.ones(weight.shape).type_as(weight)
-        else:
-            curr_sparsity = self._calc_sparsity(sparsity)
-            assert self.mask_dict.get(op_name) is not None
-            curr_mask = self.mask_dict.get(op_name)
-            w_abs = weight.abs() * curr_mask['weight']
-            k = int(w_abs.numel() * curr_sparsity)
-            threshold = torch.topk(w_abs.view(-1), k, largest=False).values.max()
-            mask = torch.gt(w_abs, threshold).type_as(weight)
-        return {'weight': mask}
-
-    def calc_mask(self, layer, config):
-        """
-        Generate mask for the given ``weight``.
-
-        Parameters
-        ----------
-        layer : LayerInfo
-            The layer to be pruned
-        config : dict
-            Pruning configurations for this weight
-
-        Returns
-        -------
-        tensor
-            The mask for this weight
-        """
-        assert self.mask_dict.get(layer.name) is not None, 'Please call iteration_start before training'
-        mask = self.mask_dict[layer.name]
-        return mask
-
-    def get_prune_iterations(self):
-        """
-        Return the range for iterations.
-        In the first prune iteration, masks are all one, thus, add one more iteration
-
-        Returns
-        -------
-        list
-            A list for pruning iterations
-        """
-        return range(self.prune_iterations + 1)
-
-    def prune_iteration_start(self):
-        """
-        Control the pruning procedure on updated epoch number.
-        Should be called at the beginning of the epoch.
-        """
-        if self.curr_prune_iteration is None:
-            self.curr_prune_iteration = 0
-        else:
-            self.curr_prune_iteration += 1
-        assert self.curr_prune_iteration < self.prune_iterations + 1, 'Exceed the configured prune_iterations'
-
-        modules_to_compress = self.detect_modules_to_compress()
-        for layer, config in modules_to_compress:
-            sparsity = config.get('sparsity')
-            mask = self._calc_mask(layer.module.weight.data, sparsity, layer.name)
-            self.mask_dict.update({layer.name: mask})
-        self._print_masks()
-
-        # reinit weights back to original after new masks are generated
-        if self.reset_weights:
-            self._model.load_state_dict(self._model_state)
-            self._optimizer.load_state_dict(self._optimizer_state)
-            if self._lr_scheduler is not None:
-                self._lr_scheduler.load_state_dict(self._scheduler_state)
diff --git a/src/sdk/pynni/nni/compression/torch/pruners.py b/src/sdk/pynni/nni/compression/torch/pruners.py
new file mode 100644
index 0000000000..82f37a488c
--- /dev/null
+++ b/src/sdk/pynni/nni/compression/torch/pruners.py
@@ -0,0 +1,383 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import copy
+import logging
+import torch
+from .compressor import Pruner
+
+__all__ = ['LevelPruner', 'AGP_Pruner', 'SlimPruner', 'LotteryTicketPruner']
+
+logger = logging.getLogger('torch pruner')
+
+
+class LevelPruner(Pruner):
+    """
+    Prune to an exact pruning level specification
+    """
+
+    def __init__(self, model, config_list):
+        """
+        Parameters
+        ----------
+        model : torch.nn.module
+            Model to be pruned
+        config_list : list
+            List on pruning configs
+        """
+
+        super().__init__(model, config_list)
+        self.mask_calculated_ops = set()
+
+    def calc_mask(self, layer, config):
+        """
+        Calculate the magnitude-based mask of given layer (computed once per op, then cached)
+        Parameters
+        ----------
+        layer : LayerInfo
+            the layer to instrument the compression operation
+        config : dict
+            layer's pruning config, must contain the key 'sparsity'
+        Returns
+        -------
+        dict
+            dictionary for storing masks, of the form {'weight': tensor}
+        """
+
+        weight = layer.module.weight.data
+        op_name = layer.name
+        if op_name not in self.mask_calculated_ops:
+            w_abs = weight.abs()
+            k = int(weight.numel() * config['sparsity'])
+            if k == 0:
+                return {'weight': torch.ones(weight.shape).type_as(weight)}  # nothing to prune; keep dict form
+            threshold = torch.topk(w_abs.view(-1), k, largest=False)[0].max()
+            mask_weight = torch.gt(w_abs, threshold).type_as(weight)
+            mask = {'weight': mask_weight}
+            self.mask_dict.update({op_name: mask})
+            self.mask_calculated_ops.add(op_name)
+        else:
+            assert op_name in self.mask_dict, "op_name not in the mask_dict"
+            mask = self.mask_dict[op_name]
+        return mask
+
+
+class AGP_Pruner(Pruner):
+    """
+    An automated gradual pruning algorithm that prunes the smallest magnitude
+    weights to achieve a preset level of network sparsity.
+    Michael Zhu and Suyog Gupta, "To prune, or not to prune: exploring the
+    efficacy of pruning for model compression", 2017 NIPS Workshop on Machine
+    Learning of Phones and other Consumer Devices,
+    https://arxiv.org/pdf/1710.01878.pdf
+    """
+
+    def __init__(self, model, config_list):
+        """
+        Parameters
+        ----------
+        model : torch.nn.module
+            Model to be pruned
+        config_list : list
+            List on pruning configs
+        """
+
+        super().__init__(model, config_list)
+        self.now_epoch = 0
+        self.if_init_list = {}
+
+    def calc_mask(self, layer, config):
+        """
+        Calculate the mask of given layer for the current epoch (self.now_epoch)
+        Parameters
+        ----------
+        layer : LayerInfo
+            the layer to instrument the compression operation
+        config : dict
+            layer's pruning config ('start_epoch' and 'frequency' are read here)
+        Returns
+        -------
+        dict
+            dictionary for storing masks, of the form {'weight': tensor}
+        """
+
+        weight = layer.module.weight.data
+        op_name = layer.name
+        start_epoch = config.get('start_epoch', 0)
+        freq = config.get('frequency', 1)
+        if self.now_epoch >= start_epoch and self.if_init_list.get(op_name, True) \
+                and (self.now_epoch - start_epoch) % freq == 0:
+            mask = self.mask_dict.get(op_name, {'weight': torch.ones(weight.shape).type_as(weight)})
+            target_sparsity = self.compute_target_sparsity(config)
+            k = int(weight.numel() * target_sparsity)
+            if k == 0 or target_sparsity >= 1 or target_sparsity <= 0:
+                return mask
+            # zero out already-pruned weights before ranking; mask is a dict, so index its 'weight' tensor
+            w_abs = weight.abs() * mask['weight']
+            threshold = torch.topk(w_abs.view(-1), k, largest=False)[0].max()
+            new_mask = {'weight': torch.gt(w_abs, threshold).type_as(weight)}
+            self.mask_dict.update({op_name: new_mask})
+            self.if_init_list.update({op_name: False})
+        else:
+            new_mask = self.mask_dict.get(op_name, {'weight': torch.ones(weight.shape).type_as(weight)})
+        return new_mask
+
+    def compute_target_sparsity(self, config):
+        """
+        Calculate the sparsity for pruning
+        Parameters
+        ----------
+        config : dict
+            Layer's pruning config
+        Returns
+        -------
+        float
+            Target sparsity to be pruned
+        """
+
+        end_epoch = config.get('end_epoch', 1)
+        start_epoch = config.get('start_epoch', 0)
+        freq = config.get('frequency', 1)
+        final_sparsity = config.get('final_sparsity', 0)
+        initial_sparsity = config.get('initial_sparsity', 0)
+        if end_epoch <= start_epoch or initial_sparsity >= final_sparsity:
+            logger.warning('your end epoch <= start epoch or initial_sparsity >= final_sparsity')
+            return final_sparsity
+
+        if end_epoch <= self.now_epoch:
+            return final_sparsity
+
+        span = ((end_epoch - start_epoch - 1) // freq) * freq
+        assert span > 0
+        target_sparsity = (final_sparsity +
+                           (initial_sparsity - final_sparsity) *
+                           (1.0 - ((self.now_epoch - start_epoch) / span)) ** 3)
+        return target_sparsity
+
+    def update_epoch(self, epoch):
+        """
+        Update epoch
+        Parameters
+        ----------
+        epoch : int
+            current training epoch
+        """
+
+        if epoch > 0:
+            self.now_epoch = epoch
+            for k in self.if_init_list.keys():
+                self.if_init_list[k] = True
+
+
+class SlimPruner(Pruner):
+    """
+    A structured pruning algorithm that prunes channels by pruning the weights of BN layers.
+    Zhuang Liu, Jianguo Li, Zhiqiang Shen, Gao Huang, Shoumeng Yan and Changshui Zhang
+    "Learning Efficient Convolutional Networks through Network Slimming", 2017 ICCV
+    https://arxiv.org/pdf/1708.06519.pdf
+    """
+
+    def __init__(self, model, config_list):
+        """
+        Parameters
+        ----------
+        config_list : list
+            support key for each list item (only the first item is used):
+                - sparsity: percentage of convolutional filters to be pruned.
+        """
+
+        super().__init__(model, config_list)
+        self.mask_calculated_ops = set()
+        weight_list = []
+        if len(config_list) > 1:
+            logger.warning('Slim pruner only supports 1 configuration')
+        config = config_list[0]
+        for (layer, _) in self.detect_modules_to_compress():  # '_' so the loop cannot clobber config_list[0]
+            assert layer.type == 'BatchNorm2d', 'SlimPruner only supports 2d batch normalization layer pruning'
+            weight_list.append(layer.module.weight.data.abs().clone())
+        all_bn_weights = torch.cat(weight_list)
+        k = int(all_bn_weights.shape[0] * config['sparsity'])
+        self.global_threshold = torch.topk(all_bn_weights.view(-1), k, largest=False)[0].max()
+
+    def calc_mask(self, layer, config):
+        """
+        Calculate the mask of given layer.
+        Scale factors with the smallest absolute value in the BN layer are masked.
+        Parameters
+        ----------
+        layer : LayerInfo
+            the layer to instrument the compression operation
+        config : dict
+            layer's pruning config
+        Returns
+        -------
+        dict
+            dictionary for storing masks
+        """
+
+        weight = layer.module.weight.data
+        op_name = layer.name
+        op_type = layer.type
+        assert op_type == 'BatchNorm2d', 'SlimPruner only supports 2d batch normalization layer pruning'
+        if op_name in self.mask_calculated_ops:
+            assert op_name in self.mask_dict
+            return self.mask_dict.get(op_name)
+        base_mask = torch.ones(weight.size()).type_as(weight).detach()
+        mask = {'weight': base_mask.detach(), 'bias': base_mask.clone().detach()}
+        try:
+            filters = weight.size(0)
+            num_prune = int(filters * config.get('sparsity'))
+            if filters < 2 or num_prune < 1:
+                return mask
+            w_abs = weight.abs()
+            mask_weight = torch.gt(w_abs, self.global_threshold).type_as(weight)
+            mask_bias = mask_weight.clone()
+            mask = {'weight': mask_weight.detach(), 'bias': mask_bias.detach()}
+        finally:
+            self.mask_dict.update({layer.name: mask})
+            self.mask_calculated_ops.add(layer.name)
+
+        return mask
+
+class LotteryTicketPruner(Pruner):
+    """
+    This is a Pytorch implementation of the paper "The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks",
+    following NNI model compression interface.
+
+    1. Randomly initialize a neural network f(x;theta_0) (where theta_0 follows D_{theta}).
+    2. Train the network for j iterations, arriving at parameters theta_j.
+    3. Prune p% of the parameters in theta_j, creating a mask m.
+    4. Reset the remaining parameters to their values in theta_0, creating the winning ticket f(x;m*theta_0).
+    5. Repeat step 2, 3, and 4.
+    """
+
+    def __init__(self, model, config_list, optimizer, lr_scheduler=None, reset_weights=True):
+        """
+        Parameters
+        ----------
+        model : pytorch model
+            The model to be pruned
+        config_list : list
+            Supported keys:
+                - prune_iterations : The number of rounds for the iterative pruning.
+                - sparsity : The final sparsity when the compression is done.
+        optimizer : pytorch optimizer
+            The optimizer for the model
+        lr_scheduler : pytorch lr scheduler
+            The lr scheduler for the model if used
+        reset_weights : bool
+            Whether reset weights and optimizer at the beginning of each round.
+        """
+        super().__init__(model, config_list)
+        self.curr_prune_iteration = None
+        self.prune_iterations = self._validate_config(config_list)
+
+        # save init weights and optimizer
+        self.reset_weights = reset_weights
+        if self.reset_weights:
+            self._model = model
+            self._optimizer = optimizer
+            self._model_state = copy.deepcopy(model.state_dict())
+            self._optimizer_state = copy.deepcopy(optimizer.state_dict())
+            self._lr_scheduler = lr_scheduler
+            if lr_scheduler is not None:
+                self._scheduler_state = copy.deepcopy(lr_scheduler.state_dict())
+
+    def _validate_config(self, config_list):
+        prune_iterations = None
+        for config in config_list:
+            assert 'prune_iterations' in config, 'prune_iterations must exist in your config'
+            assert 'sparsity' in config, 'sparsity must exist in your config'
+            if prune_iterations is not None:
+                assert prune_iterations == config[
+                    'prune_iterations'], 'The values of prune_iterations must be equal in your config'
+            prune_iterations = config['prune_iterations']
+        return prune_iterations
+
+    def _print_masks(self, print_mask=False):
+        torch.set_printoptions(threshold=1000)
+        for op_name in self.mask_dict.keys():
+            mask = self.mask_dict[op_name]
+            print('op name: ', op_name)
+            if print_mask:
+                print('mask: ', mask)
+            # calculate current sparsity
+            mask_num = mask['weight'].sum().item()
+            mask_size = mask['weight'].numel()
+            print('sparsity: ', 1 - mask_num / mask_size)
+        torch.set_printoptions(profile='default')
+
+    def _calc_sparsity(self, sparsity):
+        keep_ratio_once = (1 - sparsity) ** (1 / self.prune_iterations)
+        curr_keep_ratio = keep_ratio_once ** self.curr_prune_iteration
+        return max(1 - curr_keep_ratio, 0)
+
+    def _calc_mask(self, weight, sparsity, op_name):
+        if self.curr_prune_iteration == 0:
+            mask = torch.ones(weight.shape).type_as(weight)
+        else:
+            curr_sparsity = self._calc_sparsity(sparsity)
+            assert self.mask_dict.get(op_name) is not None
+            curr_mask = self.mask_dict.get(op_name)
+            w_abs = weight.abs() * curr_mask['weight']
+            k = int(w_abs.numel() * curr_sparsity)
+            threshold = torch.topk(w_abs.view(-1), k, largest=False).values.max()
+            mask = torch.gt(w_abs, threshold).type_as(weight)
+        return {'weight': mask}
+
+    def calc_mask(self, layer, config):
+        """
+        Generate mask for the given ``weight``.
+
+        Parameters
+        ----------
+        layer : LayerInfo
+            The layer to be pruned
+        config : dict
+            Pruning configurations for this weight
+
+        Returns
+        -------
+        tensor
+            The mask for this weight
+        """
+        assert self.mask_dict.get(layer.name) is not None, 'Please call iteration_start before training'
+        mask = self.mask_dict[layer.name]
+        return mask
+
+    def get_prune_iterations(self):
+        """
+        Return the range for iterations.
+        In the first prune iteration, masks are all one, thus, add one more iteration
+
+        Returns
+        -------
+        list
+            A list for pruning iterations
+        """
+        return range(self.prune_iterations + 1)
+
+    def prune_iteration_start(self):
+        """
+        Control the pruning procedure on updated epoch number.
+        Should be called at the beginning of the epoch.
+        """
+        if self.curr_prune_iteration is None:
+            self.curr_prune_iteration = 0
+        else:
+            self.curr_prune_iteration += 1
+        assert self.curr_prune_iteration < self.prune_iterations + 1, 'Exceed the configured prune_iterations'
+
+        modules_to_compress = self.detect_modules_to_compress()
+        for layer, config in modules_to_compress:
+            sparsity = config.get('sparsity')
+            mask = self._calc_mask(layer.module.weight.data, sparsity, layer.name)
+            self.mask_dict.update({layer.name: mask})
+        self._print_masks()
+
+        # reinit weights back to original after new masks are generated
+        if self.reset_weights:
+            self._model.load_state_dict(self._model_state)
+            self._optimizer.load_state_dict(self._optimizer_state)
+            if self._lr_scheduler is not None:
+                self._lr_scheduler.load_state_dict(self._scheduler_state)
diff --git a/src/sdk/pynni/nni/compression/torch/builtin_quantizers.py b/src/sdk/pynni/nni/compression/torch/quantizers.py
similarity index 100%
rename from src/sdk/pynni/nni/compression/torch/builtin_quantizers.py
rename to src/sdk/pynni/nni/compression/torch/quantizers.py
diff --git a/src/sdk/pynni/nni/compression/torch/weight_rank_filter_pruners.py b/src/sdk/pynni/nni/compression/torch/weight_rank_filter_pruners.py
new file mode 100644
index 0000000000..918ed95ec1
--- /dev/null
+++ b/src/sdk/pynni/nni/compression/torch/weight_rank_filter_pruners.py
@@ -0,0 +1,262 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import logging
+import torch
+from .compressor import Pruner
+
+__all__ = ['L1FilterPruner', 'L2FilterPruner', 'FPGMPruner']
+
+logger = logging.getLogger('torch weight rank filter pruners')
+
+class WeightRankFilterPruner(Pruner):
+    """
+    Base class for structured pruners that rank convolution filters by a weight-based
+    importance criterion and mask the lowest-ranked ones to reach the configured sparsity.
+    """
+
+    def __init__(self, model, config_list):
+        """
+        Parameters
+        ----------
+        model : torch.nn.module
+            Model to be pruned
+        config_list : list
+            supported key for each list item:
+                - sparsity: percentage of convolutional filters to be pruned.
+        """
+
+        super().__init__(model, config_list)
+        self.mask_calculated_ops = set()  # names of layers whose mask is already cached
+
+    def get_mask(self, base_mask, weight, num_prune):
+        raise NotImplementedError('{} get_mask is not implemented'.format(type(self).__name__))
+
+    def calc_mask(self, layer, config):
+        """
+        Compute the pruning mask of one layer, or return the cached one.
+        The filters ranked lowest by the subclass's get_mask criterion are masked.
+        Parameters
+        ----------
+        layer : LayerInfo
+            the layer to instrument the compression operation
+        config : dict
+            layer's pruning config
+        Returns
+        -------
+        dict
+            mask dict with keys 'weight' and 'bias' ('bias' is None for a layer without bias)
+        """
+
+        kernel = layer.module.weight.data
+        name, kind = layer.name, layer.type
+        assert 0 <= config.get('sparsity') < 1, "sparsity must in the range [0, 1)"
+        assert kind in ['Conv1d', 'Conv2d'], "only support Conv1d and Conv2d"
+        assert kind in config.get('op_types')
+        if name in self.mask_calculated_ops:
+            # already computed since mask_calculated_ops was last cleared: reuse it
+            assert name in self.mask_dict
+            return self.mask_dict.get(name)
+        # start from an all-ones mask, i.e. keep every filter
+        ones_weight = torch.ones(kernel.size()).type_as(kernel).detach()
+        bias = getattr(layer.module, 'bias', None)
+        ones_bias = None if bias is None else torch.ones(bias.size()).type_as(bias).detach()
+        mask = {'weight': ones_weight, 'bias': ones_bias}
+        try:
+            n_filters = kernel.size(0)
+            n_pruned = int(n_filters * config.get('sparsity'))
+            # nothing to prune for single-filter layers or when the sparsity rounds down to zero
+            if n_filters >= 2 and n_pruned >= 1:
+                mask = self.get_mask(mask, kernel, n_pruned)
+        finally:
+            # the current mask is recorded on every exit path, including when get_mask raises
+            self.mask_dict.update({name: mask})
+            self.mask_calculated_ops.add(name)
+        return mask
+
+
+class L1FilterPruner(WeightRankFilterPruner):
+    """
+    A structured pruning algorithm that prunes the filters of smallest magnitude
+    weights sum in the convolution layers to achieve a preset level of network sparsity.
+    Hao Li, Asim Kadav, Igor Durdanovic, Hanan Samet and Hans Peter Graf,
+    "PRUNING FILTERS FOR EFFICIENT CONVNETS", 2017 ICLR
+    https://arxiv.org/abs/1608.08710
+    """
+
+    def __init__(self, model, config_list):
+        """
+        Parameters
+        ----------
+        model : torch.nn.module
+            Model to be pruned
+        config_list : list
+            supported key for each list item:
+                - sparsity: percentage of convolutional filters to be pruned.
+        """
+
+        super().__init__(model, config_list)
+
+    def get_mask(self, base_mask, weight, num_prune):
+        """
+        Calculate the mask of given layer.
+        Filters with the smallest sum of their absolute kernel weights (L1 norm) are masked.
+        Parameters
+        ----------
+        base_mask : dict
+            All-ones mask dict; only its 'bias' entry is read, to tell whether the layer has a bias.
+        weight : torch.Tensor
+            Layer's weight, 3-D (Conv1d) or 4-D (Conv2d), with the filters along dim 0
+        num_prune : int
+            Num of filters to prune
+
+        Returns
+        -------
+        dict
+            {'weight': mask shaped like weight, 'bias': per-filter mask, or None if no bias}
+        """
+
+        filters = weight.shape[0]
+        l1_norm = weight.abs().view(filters, -1).sum(dim=1)
+        threshold = torch.topk(l1_norm, num_prune, largest=False)[0].max()
+        keep = torch.gt(l1_norm, threshold)  # strict '>': filters tied with the threshold are pruned too
+        # one trailing singleton axis per non-filter dim, so Conv1d (3-D) weights work as well as Conv2d
+        mask_weight = keep.view(filters, *([1] * (weight.dim() - 1))).expand_as(weight).type_as(weight)
+        mask_bias = keep.type_as(weight).detach() if base_mask.get('bias') is not None else None
+        return {'weight': mask_weight.detach(), 'bias': mask_bias}
+
+
+class L2FilterPruner(WeightRankFilterPruner):
+    """
+    A structured pruning algorithm that prunes the filters with the
+    smallest L2 norm of the weights.
+    """
+
+    def __init__(self, model, config_list):
+        """
+        Parameters
+        ----------
+        model : torch.nn.module
+            Model to be pruned
+        config_list : list
+            supported key for each list item:
+                - sparsity: percentage of convolutional filters to be pruned.
+        """
+
+        super().__init__(model, config_list)
+
+    def get_mask(self, base_mask, weight, num_prune):
+        """
+        Calculate the mask of given layer.
+        Filters with the smallest L2 norm of their kernel weights are masked.
+        Parameters
+        ----------
+        base_mask : dict
+            All-ones mask dict; only its 'bias' entry is read, to tell whether the layer has a bias.
+        weight : torch.Tensor
+            Layer's weight, 3-D (Conv1d) or 4-D (Conv2d), with the filters along dim 0
+        num_prune : int
+            Num of filters to prune
+        Returns
+        -------
+        dict
+            {'weight': mask shaped like weight, 'bias': per-filter mask, or None if no bias}
+        """
+        filters = weight.shape[0]
+        l2_norm = torch.sqrt((weight.view(filters, -1) ** 2).sum(dim=1))
+        threshold = torch.topk(l2_norm, num_prune, largest=False)[0].max()
+        keep = torch.gt(l2_norm, threshold)  # strict '>': filters tied with the threshold are pruned too
+        # one trailing singleton axis per non-filter dim, so Conv1d (3-D) weights work as well as Conv2d
+        mask_weight = keep.view(filters, *([1] * (weight.dim() - 1))).expand_as(weight).type_as(weight)
+        mask_bias = keep.type_as(weight).detach() if base_mask.get('bias') is not None else None
+        return {'weight': mask_weight.detach(), 'bias': mask_bias}
+
+
+class FPGMPruner(WeightRankFilterPruner):
+    """
+    Filter pruner based on the geometric median of a layer's filters.
+    "Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration",
+    https://arxiv.org/pdf/1811.00250.pdf
+    """
+
+    def __init__(self, model, config_list):
+        """
+        Parameters
+        ----------
+        model : pytorch model
+            the model user wants to compress
+        config_list: list
+            supported key for each list item:
+                - sparsity: percentage of convolutional filters to be pruned.
+        """
+        super().__init__(model, config_list)
+
+    def get_mask(self, base_mask, weight, num_prune):
+        """
+        Calculate the mask of given layer.
+        The num_prune filters with the smallest total distance to all filters are masked.
+        Parameters
+        ----------
+        base_mask : dict
+            All-ones mask for weight and bias ('bias' may be None); it is zeroed in place.
+        weight : torch.Tensor
+            Layer's weight
+        num_prune : int
+            Num of filters to prune
+        Returns
+        -------
+        dict
+            base_mask itself, with the entries of the pruned filters set to 0
+        """
+        # zero the weight slice (and the bias entry, if any) of each selected filter
+        for filter_idx in self._get_min_gm_kernel_idx(weight, num_prune):
+            base_mask['weight'][filter_idx] = 0.
+            if base_mask['bias'] is not None:
+                base_mask['bias'][filter_idx] = 0.
+        return base_mask
+
+    def _get_min_gm_kernel_idx(self, weight, n):
+        assert len(weight.size()) in [3, 4]
+
+        # pair every output filter with its summed distance to all filters of the layer
+        scored = [(self._get_distance_sum(weight, i), i) for i in range(weight.size(0))]
+        # stable ascending sort on the distance only, then keep the n smallest
+        scored.sort(key=lambda pair: pair[0])
+        closest = scored[:n]
+        return [i for _, i in closest]
+
+    def _get_distance_sum(self, weight, out_idx):
+        """
+        Calculate the total Euclidean distance between one filter (selected by out_idx)
+        and all filters of the layer; the selected filter itself contributes zero.
+        Each filter is flattened to a vector before the distances are taken.
+        Vectorized version of the following naive implementation:
+        def _get_distance_sum(self, weight, out_idx):
+            dist_sum = 0.
+            for k in weight:
+                dist_sum += torch.dist(k, weight[out_idx], p=2)
+            return dist_sum
+        Parameters
+        ----------
+        weight: Tensor
+            convolutional filter weight
+        out_idx: int
+            output channel index of specified filter, this method calculates the total distance
+            between this specified filter and all other filters.
+        Returns
+        -------
+        torch.Tensor
+            0-dim tensor holding the total distance
+        """
+        logger.debug('weight size: %s', weight.size())
+        assert len(weight.size()) in [3, 4], 'unsupported weight shape'
+
+        flat = weight.view(weight.size(0), -1)
+        # repeat the selected filter once per row so it can be subtracted from every filter
+        anchor = flat[out_idx].unsqueeze(0).expand(flat.size(0), flat.size(1))
+        diff = flat - anchor
+        # row-wise Euclidean norm of the differences, summed over all filters
+        return torch.sqrt((diff * diff).sum(-1)).sum()
+
+    def update_epoch(self, epoch):
+        self.mask_calculated_ops = set()  # forget cached layers so calc_mask recomputes their masks

From 98754c70eceba7dc0a73b07c6c96963c89b5f8f7 Mon Sep 17 00:00:00 2001
From: Lijiao <35484733+lvybriage@users.noreply.github.com>
Date: Mon, 30 Dec 2019 09:33:58 +0800
Subject: [PATCH 06/14] fix overview page table trialId style (#1875)

---
 src/webui/src/components/Modal/Compare.tsx         | 3 ++-
 src/webui/src/components/overview/SuccessTable.tsx | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/webui/src/components/Modal/Compare.tsx b/src/webui/src/components/Modal/Compare.tsx
index 2bc9a9c1e8..2221e5893e 100644
--- a/src/webui/src/components/Modal/Compare.tsx
+++ b/src/webui/src/components/Modal/Compare.tsx
@@ -91,7 +91,8 @@ class Compare extends React.Component<CompareProps, {}> {
             },
             yAxis: {
                 type: 'value',
-                name: 'Metric'
+                name: 'Metric',
+                scale: true
             },
             series: trialIntermediate
         };
diff --git a/src/webui/src/components/overview/SuccessTable.tsx b/src/webui/src/components/overview/SuccessTable.tsx
index 9019afec82..cb3ab1d33e 100644
--- a/src/webui/src/components/overview/SuccessTable.tsx
+++ b/src/webui/src/components/overview/SuccessTable.tsx
@@ -28,12 +28,11 @@ class SuccessTable extends React.Component<SuccessTableProps, {}> {
             {
                 title: 'Trial No.',
                 dataIndex: 'sequenceId',
-                width: 140,
                 className: 'tableHead'
             }, {
                 title: 'ID',
                 dataIndex: 'id',
-                width: 60,
+                width: 80,
                 className: 'tableHead leftTitle',
                 render: (text: string, record: TableRecord): React.ReactNode => {
                     return (

From 23c56b1f35a0488edbf7e2401424622163cb3f05 Mon Sep 17 00:00:00 2001
From: SparkSnail <shinyang@microsoft.com>
Date: Mon, 30 Dec 2019 13:41:11 +0800
Subject: [PATCH 07/14] quick fix nnictl view command (#1892)

---
 tools/nni_cmd/launcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py
index 3b52030f2f..54ce4aec77 100644
--- a/tools/nni_cmd/launcher.py
+++ b/tools/nni_cmd/launcher.py
@@ -517,7 +517,7 @@ def manage_stopped_experiment(args, mode):
     experiment_id = None
     #find the latest stopped experiment
     if not args.id:
-        print_error('Please set experiment id! \nYou could use \'nnictl {0} {id}\' to {0} a stopped experiment!\n' \
+        print_error('Please set experiment id! \nYou could use \'nnictl {0} id\' to {0} a stopped experiment!\n' \
         'You could use \'nnictl experiment list --all\' to show all experiments!'.format(mode))
         exit(1)
     else:

From 8d8c9de46b3118f0123db7b11a601190eada6086 Mon Sep 17 00:00:00 2001
From: Yuge Zhang <v-yugzh@microsoft.com>
Date: Mon, 30 Dec 2019 14:13:57 +0800
Subject: [PATCH 08/14] [SPOS] Clarify checkpoint directory in docs (#1891)

---
 examples/nas/spos/README.md         | 6 ++++--
 examples/nas/spos/config_search.yml | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/nas/spos/README.md b/examples/nas/spos/README.md
index ed239f30a1..0bba5968f9 100644
--- a/examples/nas/spos/README.md
+++ b/examples/nas/spos/README.md
@@ -18,7 +18,7 @@ Only GPU version is provided here.
 Need to download the flops lookup table from [here](https://1drv.ms/u/s!Am_mmG2-KsrnajesvSdfsq_cN48?e=aHVppN).
 Put `op_flops_dict.pkl` and `checkpoint-150000.pth.tar` (if you don't want to retrain the supernet) under `data` directory.
 
-Prepare ImageNet in the standard format (follow the script [here](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4)). Link it to `data/imagenet` will be more convenient.
+Prepare ImageNet in the standard format (follow the script [here](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4)). Linking it to `data/imagenet` will be more convenient.
 
 After preparation, it's expected to have the following code structure:
 
@@ -48,7 +48,7 @@ spos
 python supernet.py
 ```
 
-Will export the checkpoint to checkpoints directory, for the next step.
+This will export the checkpoint to the `checkpoints` directory for use in the next step.
 
 NOTE: The data loading used in the official repo is [slightly different from usual](https://github.com/megvii-model/SinglePathOneShot/issues/5), as they use BGR tensor and keep the values between 0 and 255 intentionally to align with their own DL framework. The option `--spos-preprocessing` will simulate the behavior used originally and enable you to use the checkpoints pretrained.
 
@@ -64,6 +64,8 @@ nnictl ss_gen -t "python tester.py"
 
 This will generate a file called `nni_auto_gen_search_space.json`, which is a serialized representation of your search space.
 
+By default, it will use `checkpoint-150000.pth.tar` downloaded previously. In case you want to use the checkpoint trained by yourself from the last step, specify `--checkpoint` in the command in `config_search.yml`.
+
 Then search with evolution tuner.
 
 ```
diff --git a/examples/nas/spos/config_search.yml b/examples/nas/spos/config_search.yml
index fe27faefc8..80770568ac 100644
--- a/examples/nas/spos/config_search.yml
+++ b/examples/nas/spos/config_search.yml
@@ -11,6 +11,6 @@ tuner:
   classFileName: tuner.py
   className: EvolutionWithFlops
 trial:
-  command: python tester.py --imagenet-dir /path/to/your/imagenet --spos-prep
+  command: python tester.py --spos-prep
   codeDir: .
   gpuNum: 1

From 2906315c6384903922ca3cbff7de3aeca62c61a3 Mon Sep 17 00:00:00 2001
From: SparkSnail <shinyang@microsoft.com>
Date: Mon, 30 Dec 2019 15:23:34 +0800
Subject: [PATCH 09/14] quick fix http error (#1896)

---
 .../training_service/pai/paiK8S/paiK8STrainingService.ts        | 2 +-
 .../training_service/pai/paiYarn/paiYarnTrainingService.ts      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
index 2888a9f425..5c0360df6f 100644
--- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
+++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
@@ -62,13 +62,13 @@ class PAIK8STrainingService extends PAITrainingService {
             case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG:
                 this.paiJobRestServer = new PAIJobRestServer(component.get(PAIK8STrainingService));
                 this.paiClusterConfig = <PAIClusterConfig>JSON.parse(value);
+                this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
                 if(this.paiClusterConfig.passWord) {
                     // Get PAI authentication token
                     await this.updatePaiToken();
                 } else if(this.paiClusterConfig.token) {
                     this.paiToken = this.paiClusterConfig.token;
                 }
-                this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
                 break;
 
             case TrialConfigMetadataKey.TRIAL_CONFIG:
diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
index 6b6f905b72..b10a7172ad 100644
--- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
+++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
@@ -99,6 +99,7 @@ class PAIYarnTrainingService extends PAITrainingService {
                     path: '/webhdfs/api/v1',
                     host: this.paiClusterConfig.host
                 });
+                this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
                 if(this.paiClusterConfig.passWord) {
                     // Get PAI authentication token
                     await this.updatePaiToken();
@@ -107,7 +108,6 @@ class PAIYarnTrainingService extends PAITrainingService {
                 } else {
                     throw new Error('pai cluster config format error, please set password or token!');
                 }
-                this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
                 break;
 
             case TrialConfigMetadataKey.TRIAL_CONFIG:

From faca02c3e1490690a52b5181d30aab69af56a370 Mon Sep 17 00:00:00 2001
From: QuanluZhang <Quanlu.Zhang@microsoft.com>
Date: Mon, 30 Dec 2019 16:11:55 +0800
Subject: [PATCH 10/14] update doc for pruning algorithms (#1897)

---
 .../Compressor/ActivationRankFilterPruner.md  | 58 ------------
 docs/en_US/Compressor/Overview.md             |  8 +-
 docs/en_US/Compressor/Pruner.md               | 91 ++++++++++++-------
 ...tRankFilterPruner.md => l1filterpruner.md} | 48 +---------
 4 files changed, 64 insertions(+), 141 deletions(-)
 delete mode 100644 docs/en_US/Compressor/ActivationRankFilterPruner.md
 rename docs/en_US/Compressor/{WeightRankFilterPruner.md => l1filterpruner.md} (51%)

diff --git a/docs/en_US/Compressor/ActivationRankFilterPruner.md b/docs/en_US/Compressor/ActivationRankFilterPruner.md
deleted file mode 100644
index 7c836cb140..0000000000
--- a/docs/en_US/Compressor/ActivationRankFilterPruner.md
+++ /dev/null
@@ -1,58 +0,0 @@
-ActivationRankFilterPruner on NNI Compressor
-===
-
-## 1. Introduction
-
-ActivationRankFilterPruner is a series of pruners which prune filters according to some importance criterion calculated from the filters' output activations.
-
-|             Pruner             |       Importance criterion        |                       Reference paper                        |
-| :----------------------------: | :-------------------------------: | :----------------------------------------------------------: |
-| ActivationAPoZRankFilterPruner | APoZ(average percentage of zeros) | [Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures](https://arxiv.org/abs/1607.03250) |
-| ActivationMeanRankFilterPruner | mean value of output activations  | [Pruning Convolutional Neural Networks for Resource Efficient Inference](https://arxiv.org/abs/1611.06440) |
-
-## 2. Pruners
-
-### ActivationAPoZRankFilterPruner
-
-Hengyuan Hu, Rui Peng, Yu-Wing Tai and Chi-Keung Tang,
-
-"[Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures](https://arxiv.org/abs/1607.03250)", ICLR 2016.
-
-ActivationAPoZRankFilterPruner prunes the filters with the smallest APoZ(average percentage of zeros) of output activations.
-
-The APoZ is defined as:
-
-![](../../img/apoz.png)
-
-### ActivationMeanRankFilterPruner
-
-Pavlo Molchanov, Stephen Tyree, Tero Karras, Timo Aila and Jan Kautz,
-
-"[Pruning Convolutional Neural Networks for Resource Efficient Inference](https://arxiv.org/abs/1611.06440)", ICLR 2017.
-
-ActivationMeanRankFilterPruner prunes the filters with the smallest mean value of output activations
-
-## 3. Usage
-
-PyTorch code
-
-```python
-from nni.compression.torch import ActivationAPoZRankFilterPruner
-config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'], 'op_names': ['conv1', 'conv2'] }]
-pruner = ActivationAPoZRankFilterPruner(model, config_list, statistics_batch_num=1)
-pruner.compress()
-```
-
-#### User configuration for ActivationAPoZRankFilterPruner
-
-- **sparsity:** This is to specify the sparsity operations to be compressed to
-- **op_types:** Only Conv2d is supported in ActivationAPoZRankFilterPruner
-
-## 4. Experiment
-
-TODO. 
-
-
-
-
-
diff --git a/docs/en_US/Compressor/Overview.md b/docs/en_US/Compressor/Overview.md
index b8e2903afb..0cde1cdf75 100644
--- a/docs/en_US/Compressor/Overview.md
+++ b/docs/en_US/Compressor/Overview.md
@@ -21,11 +21,11 @@ Pruning algorithms compress the original network by removing redundant weights o
 | [AGP Pruner](./Pruner.md#agp-pruner) | Automated gradual pruning (To prune, or not to prune: exploring the efficacy of pruning for model compression) [Reference Paper](https://arxiv.org/abs/1710.01878)|
 | [Lottery Ticket Pruner](./Pruner.md#agp-pruner) | The pruning process used by "The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks". It prunes a model iteratively. [Reference Paper](https://arxiv.org/abs/1803.03635)|
 | [FPGM Pruner](./Pruner.md#fpgm-pruner) | Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration [Reference Paper](https://arxiv.org/pdf/1811.00250.pdf)|
-| [L1Filter Pruner](./Pruner.md#l1filter-pruner) | Pruning filters with the smallest L1 norm of weights in convolution layers(PRUNING FILTERS FOR EFFICIENT CONVNETS)[Reference Paper](https://arxiv.org/abs/1608.08710) |
+| [L1Filter Pruner](./Pruner.md#l1filter-pruner) | Pruning filters with the smallest L1 norm of weights in convolution layers (Pruning Filters for Efficient Convnets) [Reference Paper](https://arxiv.org/abs/1608.08710) |
 | [L2Filter Pruner](./Pruner.md#l2filter-pruner) | Pruning filters with the smallest L2 norm of weights in convolution layers |
-| [ActivationAPoZRankFilterPruner](./Pruner.md#ActivationAPoZRankFilterPruner) | Pruning filters prunes the filters with the smallest APoZ(average percentage of zeros) of output activations(Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures)[Reference Paper](https://arxiv.org/abs/1607.03250) |
-| [ActivationMeanRankFilterPruner](./Pruner.md#ActivationMeanRankFilterPruner) | Pruning filters prunes the filters with the smallest mean value of output activations(Pruning Convolutional Neural Networks for Resource Efficient Inference)[Reference Paper](https://arxiv.org/abs/1611.06440) |
-| [Slim Pruner](./Pruner.md#slim-pruner) | Pruning channels in convolution layers by pruning scaling factors in BN layers(Learning Efficient Convolutional Networks through Network Slimming)[Reference Paper](https://arxiv.org/abs/1708.06519) |
+| [ActivationAPoZRankFilterPruner](./Pruner.md#ActivationAPoZRankFilterPruner) | Pruning filters based on the metric APoZ (average percentage of zeros) which measures the percentage of zeros in activations of (convolutional) layers. [Reference Paper](https://arxiv.org/abs/1607.03250) |
+| [ActivationMeanRankFilterPruner](./Pruner.md#ActivationMeanRankFilterPruner) | Pruning filters based on the metric that calculates the smallest mean value of output activations |
+| [Slim Pruner](./Pruner.md#slim-pruner) | Pruning channels in convolution layers by pruning scaling factors in BN layers (Learning Efficient Convolutional Networks through Network Slimming) [Reference Paper](https://arxiv.org/abs/1708.06519) |
 
 
 **Quantization**
diff --git a/docs/en_US/Compressor/Pruner.md b/docs/en_US/Compressor/Pruner.md
index a96414edae..4b5fdf16af 100644
--- a/docs/en_US/Compressor/Pruner.md
+++ b/docs/en_US/Compressor/Pruner.md
@@ -1,6 +1,19 @@
 Pruner on NNI Compressor
 ===
 
+Index of supported pruning algorithms
+* [Level Pruner](#level-pruner)
+* [AGP Pruner](#agp-pruner)
+* [Lottery Ticket Hypothesis](#lottery-ticket-hypothesis)
+* [Slim Pruner](#slim-pruner)
+* [Filter Pruners with Weight Rank](#weightrankfilterpruner)
+    * [FPGM Pruner](#fpgm-pruner)
+    * [L1Filter Pruner](#l1filter-pruner)
+    * [L2Filter Pruner](#l2filter-pruner)
+* [Filter Pruners with Activation Rank](#activationrankfilterpruner)
+    * [APoZ Rank Pruner](#activationapozrankfilterpruner)
+    * [Activation Mean Rank Pruner](#activationmeanrankfilterpruner)
+
 ## Level Pruner
 
 This is one basic one-shot pruner: you can set a target sparsity level (expressed as a fraction, 0.6 means we will prune 60%). 
@@ -131,13 +144,43 @@ The above configuration means that there are 5 times of iterative pruning. As th
 * **sparsity:** The final sparsity when the compression is done.
 
 ***
+
+## Slim Pruner
+
+This is a one-shot pruner, proposed in ['Learning Efficient Convolutional Networks through Network Slimming'](https://arxiv.org/pdf/1708.06519.pdf) by Zhuang Liu, Jianguo Li, Zhiqiang Shen, Gao Huang, Shoumeng Yan and Changshui Zhang.
+
+![](../../img/slim_pruner.png)
+
+> Slim Pruner **prunes channels in the convolution layers by masking corresponding scaling factors in the later BN layers**, L1 regularization on the scaling factors should be applied in batch normalization (BN) layers while training, scaling factors of BN layers are **globally ranked** while pruning, so the sparse model can be automatically found given sparsity.
+
+### Usage
+
+PyTorch code
+
+```python
+from nni.compression.torch import SlimPruner
+config_list = [{ 'sparsity': 0.8, 'op_types': ['BatchNorm2d'] }]
+pruner = SlimPruner(model, config_list)
+pruner.compress()
+```
+
+#### User configuration for Slim Pruner
+
+- **sparsity:** This is to specify the sparsity operations to be compressed to
+- **op_types:** Only BatchNorm2d is supported in Slim Pruner
+
+
 ## WeightRankFilterPruner
 WeightRankFilterPruner is a series of pruners which prune the filters with the smallest importance criterion calculated from the weights in convolution layers to achieve a preset level of network sparsity
 
-### 1, FPGM Pruner
+### FPGM Pruner
 
 This is an one-shot pruner, FPGM Pruner is an implementation of paper [Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration](https://arxiv.org/pdf/1811.00250.pdf)
 
+FPGMPruner prunes the filters closest to the geometric median of the layer's filters, i.e. those with the smallest total distance to all other filters.
+
+ ![](../../img/fpgm_fig1.png)
+
 >Previous works utilized “smaller-norm-less-important” criterion to prune filters with smaller norm values in a convolutional neural network. In this paper, we analyze this norm-based criterion and point out that its effectiveness depends on two requirements that are not always met: (1) the norm deviation of the filters should be large; (2) the minimum norm of the filters should be small. To solve this problem, we propose a novel filter pruning method, namely Filter Pruning via Geometric Median (FPGM), to compress the model regardless of those two requirements. Unlike previous methods, FPGM compresses CNN models by pruning filters with redundancy, rather than those with “relatively less” importance.
 
 #### Usage
@@ -181,9 +224,9 @@ You can view example for more information
 
 ***
 
-### 2, L1Filter Pruner
+### L1Filter Pruner
 
-This is an one-shot pruner, In ['PRUNING FILTERS FOR EFFICIENT CONVNETS'](https://arxiv.org/abs/1608.08710), authors Hao Li, Asim Kadav, Igor Durdanovic, Hanan Samet and Hans Peter Graf.
+This is a one-shot pruner, proposed in ['PRUNING FILTERS FOR EFFICIENT CONVNETS'](https://arxiv.org/abs/1608.08710) by Hao Li, Asim Kadav, Igor Durdanovic, Hanan Samet and Hans Peter Graf. The reproduced experiment results can be found [here](l1filterpruner.md).
 
 ![](../../img/l1filter_pruner.png)
 
@@ -217,9 +260,9 @@ pruner.compress()
 
 ***
 
-### 3, L2Filter Pruner
+### L2Filter Pruner
 
-This is a structured pruning algorithm that prunes the filters with the smallest L2 norm of the weights.
+This is a structured pruning algorithm that prunes the filters with the smallest L2 norm of the weights. It is implemented as a one-shot pruner.
 
 #### Usage
 
@@ -240,9 +283,13 @@ pruner.compress()
 ## ActivationRankFilterPruner
 ActivationRankFilterPruner is a series of pruners which prune the filters with the smallest importance criterion calculated from the output activations of convolution layers to achieve a preset level of network sparsity
 
-### 1, ActivationAPoZRankFilterPruner
+### ActivationAPoZRankFilterPruner
+
+We implemented it as a one-shot pruner. It prunes convolutional layers based on the criterion `APoZ`, which is explained in the paper [Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures](https://arxiv.org/abs/1607.03250). Iterative pruning based on `APoZ` will be supported in a future release.
 
-This is an one-shot pruner, ActivationAPoZRankFilterPruner is an implementation of paper [Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures](https://arxiv.org/abs/1607.03250)
+The APoZ is defined as:
+
+![](../../img/apoz.png)
 
 #### Usage
 
@@ -269,9 +316,9 @@ You can view example for more information
 
 ***
 
-### 2, ActivationMeanRankFilterPruner
+### ActivationMeanRankFilterPruner
 
-This is an one-shot pruner, ActivationMeanRankFilterPruner is an implementation of paper [Pruning Convolutional Neural Networks for Resource Efficient Inference](https://arxiv.org/abs/1611.06440)
+We implemented it as a one-shot pruner. It prunes convolutional layers based on the criterion `mean activation`, which is explained in section 2.2 of the paper [Pruning Convolutional Neural Networks for Resource Efficient Inference](https://arxiv.org/abs/1611.06440). Other pruning criteria mentioned in this paper will be supported in a future release.
 
 #### Usage
 
@@ -296,28 +343,4 @@ You can view example for more information
 - **sparsity:** How much percentage of convolutional filters are to be pruned.
 - **op_types:** Only Conv2d is supported in ActivationMeanRankFilterPruner
 
-***
-
-## Slim Pruner
-
-This is an one-shot pruner, In ['Learning Efficient Convolutional Networks through Network Slimming'](https://arxiv.org/pdf/1708.06519.pdf), authors Zhuang Liu, Jianguo Li, Zhiqiang Shen, Gao Huang, Shoumeng Yan and Changshui Zhang.
-
-![](../../img/slim_pruner.png)
-
-> Slim Pruner **prunes channels in the convolution layers by masking corresponding scaling factors in the later BN layers**, L1 regularization on the scaling factors should be applied in batch normalization (BN) layers while training, scaling factors of BN layers are **globally ranked** while pruning, so the sparse model can be automatically found given sparsity.
-
-### Usage
-
-PyTorch code
-
-```python
-from nni.compression.torch import SlimPruner
-config_list = [{ 'sparsity': 0.8, 'op_types': ['BatchNorm2d'] }]
-pruner = SlimPruner(model, config_list)
-pruner.compress()
-```
-
-#### User configuration for Slim Pruner
-
-- **sparsity:** This is to specify the sparsity operations to be compressed to
-- **op_types:** Only BatchNorm2d is supported in Slim Pruner
+***
\ No newline at end of file
diff --git a/docs/en_US/Compressor/WeightRankFilterPruner.md b/docs/en_US/Compressor/l1filterpruner.md
similarity index 51%
rename from docs/en_US/Compressor/WeightRankFilterPruner.md
rename to docs/en_US/Compressor/l1filterpruner.md
index ef99dcff03..dc42d6478d 100644
--- a/docs/en_US/Compressor/WeightRankFilterPruner.md
+++ b/docs/en_US/Compressor/l1filterpruner.md
@@ -1,19 +1,7 @@
-WeightRankFilterPruner on NNI Compressor
+L1FilterPruner on NNI
 ===
 
-## 1. Introduction
-
-WeightRankFilterPruner is a series of pruners which prune filters according to some importance criterion calculated from the filters' weight.
-
-|     Pruner     |    Importance criterion     |                       Reference paper                        |
-| :------------: | :-------------------------: | :----------------------------------------------------------: |
-| L1FilterPruner |     L1 norm of weights      | [PRUNING FILTERS FOR EFFICIENT CONVNETS](https://arxiv.org/abs/1608.08710) |
-| L2FilterPruner |     L2 norm of weights      |                                                              |
-|   FPGMPruner   | Geometric Median of weights | [Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration](https://arxiv.org/pdf/1811.00250.pdf) |
-
-## 2. Pruners
-
-### L1FilterPruner
+## Introduction
 
 L1FilterPruner is a general structured pruning algorithm for pruning filters in the convolutional layers.
 
@@ -33,37 +21,7 @@ In ['PRUNING FILTERS FOR EFFICIENT CONVNETS'](https://arxiv.org/abs/1608.08710),
 > 4. A new kernel matrix is created for both the ![](http://latex.codecogs.com/gif.latex?i)th and ![](http://latex.codecogs.com/gif.latex?i+1)th layers, and the remaining kernel
 >      weights are copied to the new model.
 
-### L2FilterPruner
-
-L2FilterPruner is similar to L1FilterPruner, but only replace the importance criterion from L1 norm to L2 norm
-
-### FPGMPruner
-
-Yang He, Ping Liu, Ziwei Wang, Zhilan Hu, Yi Yang
-
-"[Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration](https://arxiv.org/abs/1811.00250)", CVPR 2019.
-
-FPGMPruner prune filters with the smallest geometric median
-
- ![](../../img/fpgm_fig1.png)
-
-## 3. Usage
-
-PyTorch code
-
-```
-from nni.compression.torch import L1FilterPruner
-config_list = [{ 'sparsity': 0.8, 'op_types': ['Conv2d'], 'op_names': ['conv1', 'conv2'] }]
-pruner = L1FilterPruner(model, config_list)
-pruner.compress()
-```
-
-#### User configuration for L1Filter Pruner
-
-- **sparsity:** This is to specify the sparsity operations to be compressed to
-- **op_types:** Only Conv2d is supported in L1Filter Pruner
-
-## 4. Experiment
+## Experiment
 
 We implemented one of the experiments in ['PRUNING FILTERS FOR EFFICIENT CONVNETS'](https://arxiv.org/abs/1608.08710) with **L1FilterPruner**, we pruned **VGG-16** for CIFAR-10 to **VGG-16-pruned-A** in the paper, in which $64\%$ parameters are pruned. Our experiments results are as follows:
 

From 01385bb076efaa3ef7253c12fb01e938d5893d41 Mon Sep 17 00:00:00 2001
From: SparkSnail <shinyang@microsoft.com>
Date: Tue, 31 Dec 2019 11:11:15 +0800
Subject: [PATCH 11/14] Fix pai http format error, add protocol (#1898)

---
 .../training_service/pai/paiJobInfoCollector.ts | 16 ++++++++++------
 .../pai/paiK8S/paiK8STrainingService.ts         |  2 +-
 .../training_service/pai/paiTrainingService.ts  | 17 +++++++++++------
 .../pai/paiYarn/paiYarnTrainingService.ts       |  4 +++-
 4 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts
index 07ec1cda7f..e88a4d6f41 100644
--- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts
+++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts
@@ -25,7 +25,7 @@ export class PAIJobInfoCollector {
         this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED', 'EARLY_STOPPED'];
     }
 
-    public async retrieveTrialStatus(token? : string, paiBaseClusterConfig?: PAIClusterConfig): Promise<void> {
+    public async retrieveTrialStatus(protocol: string, token? : string, paiBaseClusterConfig?: PAIClusterConfig): Promise<void> {
         if (paiBaseClusterConfig === undefined || token === undefined) {
             return Promise.resolve();
         }
@@ -35,13 +35,13 @@ export class PAIJobInfoCollector {
             if (paiTrialJob === undefined) {
                 throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
             }
-            updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, token, paiBaseClusterConfig));
+            updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(protocol, paiTrialJob, token, paiBaseClusterConfig));
         }
 
         await Promise.all(updatePaiTrialJobs);
     }
 
-    private getSinglePAITrialJobInfo(paiTrialJob: PAITrialJobDetail, paiToken: string, paiClusterConfig: PAIClusterConfig): Promise<void> {
+    private getSinglePAITrialJobInfo(protocol: string, paiTrialJob: PAITrialJobDetail, paiToken: string, paiClusterConfig: PAIClusterConfig): Promise<void> {
         const deferred: Deferred<void> = new Deferred<void>();
         if (!this.statusesNeedToCheck.includes(paiTrialJob.status)) {
             deferred.resolve();
@@ -52,7 +52,7 @@ export class PAIJobInfoCollector {
         // Rest call to get PAI job info and update status
         // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
         const getJobInfoRequest: request.Options = {
-            uri: `${paiClusterConfig.host}/rest-server/api/v1/user/${paiClusterConfig.userName}/jobs/${paiTrialJob.paiJobName}`,
+            uri: `${protocol}://${paiClusterConfig.host}/rest-server/api/v1/user/${paiClusterConfig.userName}/jobs/${paiTrialJob.paiJobName}`,
             method: 'GET',
             json: true,
                headers: {
@@ -81,7 +81,11 @@ export class PAIJobInfoCollector {
                                 paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime;
                             }
                             if (paiTrialJob.url === undefined) {
-                                paiTrialJob.url = response.body.jobStatus.appTrackingUrl;
+                                if (response.body.jobStatus.appTrackingUrl) {
+                                    paiTrialJob.url = response.body.jobStatus.appTrackingUrl;
+                                } else {
+                                    paiTrialJob.url = paiTrialJob.logPath;
+                                }
                             }
                             break;
                         case 'SUCCEEDED':
@@ -114,7 +118,7 @@ export class PAIJobInfoCollector {
                         }
                         // Set pai trial job's url to WebHDFS output path
                         if (paiTrialJob.logPath !== undefined) {
-                            if (paiTrialJob.url) {
+                            if (paiTrialJob.url && paiTrialJob.url !== paiTrialJob.logPath) {
                                 paiTrialJob.url += `,${paiTrialJob.logPath}`;
                             } else {
                                 paiTrialJob.url = `${paiTrialJob.logPath}`;
diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
index 5c0360df6f..fc64d4dbdc 100644
--- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
+++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
@@ -258,7 +258,7 @@ class PAIK8STrainingService extends PAITrainingService {
         // Step 3. Submit PAI job via Rest call
         // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
         const submitJobRequest: request.Options = {
-            uri: `${this.paiClusterConfig.host}/rest-server/api/v2/jobs`,
+            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`,
             method: 'POST',
             body: paiJobConfig,
             headers: {
diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts
index 5709d4678d..01cd4ed9dc 100644
--- a/src/nni_manager/training_service/pai/paiTrainingService.ts
+++ b/src/nni_manager/training_service/pai/paiTrainingService.ts
@@ -52,6 +52,7 @@ abstract class PAITrainingService implements TrainingService {
     protected authFileHdfsPath: string | undefined = undefined;
     protected portList?: string | undefined;
     protected paiJobRestServer?: PAIJobRestServer;
+    protected protocol: string = 'http';
 
     constructor() {
         this.log = getLogger();
@@ -165,7 +166,7 @@ abstract class PAITrainingService implements TrainingService {
         }
 
         const stopJobRequest: request.Options = {
-            uri: `${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\
+            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\
 /jobs/${trialJobDetail.paiJobName}/executionType`, 
             method: 'PUT',
             json: true,
@@ -219,10 +220,14 @@ abstract class PAITrainingService implements TrainingService {
     protected formatPAIHost(host: string): string {
         // If users' host start with 'http://' or 'https://', use the original host,
         // or format to 'http//${host}'
-        if (host.startsWith('http://') || host.startsWith('https://')) {
-            return host;
+        if (host.startsWith('http://')) {
+            this.protocol = 'http';
+            return host.replace('http://', '');
+        } else if (host.startsWith('https://')) {
+            this.protocol = 'https';
+            return host.replace('https://', '');
         } else {
-            return `http://${host}`;
+            return host;
         }
     }
 
@@ -239,7 +244,7 @@ abstract class PAITrainingService implements TrainingService {
                     }
                 }
             }
-            await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig);
+            await this.paiJobCollector.retrieveTrialStatus(this.protocol, this.paiToken, this.paiClusterConfig);
             if (this.paiJobRestServer === undefined) {
                 throw new Error('paiBaseJobRestServer not implemented!');
             }
@@ -269,7 +274,7 @@ abstract class PAITrainingService implements TrainingService {
         }
 
         const authenticationReq: request.Options = {
-            uri: `${this.paiClusterConfig.host}/rest-server/api/v1/token`,
+            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/token`,
             method: 'POST',
             json: true,
             body: {
diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
index b10a7172ad..08038e5b59 100644
--- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
+++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts
@@ -91,6 +91,7 @@ class PAIYarnTrainingService extends PAITrainingService {
             case TrialConfigMetadataKey.PAI_YARN_CLUSTER_CONFIG:
                 this.paiJobRestServer = new PAIJobRestServer(component.get(PAIYarnTrainingService));
                 this.paiClusterConfig = <PAIClusterConfig>JSON.parse(value);
+                this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
 
                 this.hdfsClient = WebHDFS.createClient({
                     user: this.paiClusterConfig.userName,
@@ -98,6 +99,7 @@ class PAIYarnTrainingService extends PAITrainingService {
                     port: 80,
                     path: '/webhdfs/api/v1',
                     host: this.paiClusterConfig.host
+                    
                 });
                 this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
                 if(this.paiClusterConfig.passWord) {
@@ -272,7 +274,7 @@ class PAIYarnTrainingService extends PAITrainingService {
         // Step 3. Submit PAI job via Rest call
         // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
         const submitJobRequest: request.Options = {
-            uri: `${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}/jobs`,
+            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}/jobs`,
             method: 'POST',
             json: true,
             body: paiJobConfig,

From 31f545ee043becc272ea158b1d969d9539529662 Mon Sep 17 00:00:00 2001
From: Yan Ni <yann@microsoft.com>
Date: Tue, 31 Dec 2019 12:13:03 +0800
Subject: [PATCH 12/14] Update KDExample.md: fix missing links in doc (#1894)

---
 docs/en_US/TrialExample/KDExample.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en_US/TrialExample/KDExample.md b/docs/en_US/TrialExample/KDExample.md
index b0ead466d2..51a91f0ea1 100644
--- a/docs/en_US/TrialExample/KDExample.md
+++ b/docs/en_US/TrialExample/KDExample.md
@@ -30,4 +30,4 @@ for batch_idx, (data, target) in enumerate(train_loader):
 * **kd_teacher_model:** The pre-trained teacher model 
 * **kd_T:** Temperature for smoothing teacher model's output
 
-The complete code can be found here
\ No newline at end of file
+The complete code can be found [here](https://github.com/microsoft/nni/tree/v1.3/examples/model_compress/knowledge_distill/)

From c993f767a8a5ce041921c1825ac407d9be847bb9 Mon Sep 17 00:00:00 2001
From: Yuge Zhang <v-yugzh@microsoft.com>
Date: Tue, 31 Dec 2019 13:00:24 +0800
Subject: [PATCH 13/14] Add SPOS docs and improve NAS doc structure (#1907)

* darts mutator docs

* fix docs

* update

* add docs for SPOS

* index SPOS

* restore workers
---
 docs/en_US/NAS/DARTS.md                       |  44 ++++++-
 docs/en_US/NAS/ENAS.md                        |  43 ++++++-
 docs/en_US/NAS/Overview.md                    |  79 ++----------
 docs/en_US/NAS/PDARTS.md                      |  18 +++
 docs/en_US/NAS/SPOS.md                        | 119 ++++++++++++++++++
 docs/en_US/nas.rst                            |   3 +-
 examples/nas/darts/README.md                  |   1 +
 examples/nas/enas/README.md                   |   1 +
 examples/nas/naive/README.md                  |   1 +
 examples/nas/pdarts/README.md                 |   1 +
 examples/nas/spos/README.md                   |  91 +-------------
 .../pynni/nni/nas/pytorch/darts/mutator.py    |  20 +++
 .../pynni/nni/nas/pytorch/darts/trainer.py    |  36 ++++++
 src/sdk/pynni/nni/nas/pytorch/enas/mutator.py |  25 ++++
 src/sdk/pynni/nni/nas/pytorch/enas/trainer.py |  48 +++++++
 .../pynni/nni/nas/pytorch/spos/evolution.py   |   1 +
 src/sdk/pynni/nni/nas/pytorch/spos/mutator.py |   1 +
 src/sdk/pynni/nni/nas/pytorch/spos/trainer.py |  31 +++++
 src/sdk/pynni/nni/nas/pytorch/trainer.py      |   2 +-
 19 files changed, 395 insertions(+), 170 deletions(-)
 create mode 100644 docs/en_US/NAS/PDARTS.md
 create mode 100644 docs/en_US/NAS/SPOS.md
 create mode 100644 examples/nas/darts/README.md
 create mode 100644 examples/nas/enas/README.md
 create mode 100644 examples/nas/naive/README.md
 create mode 100644 examples/nas/pdarts/README.md

diff --git a/docs/en_US/NAS/DARTS.md b/docs/en_US/NAS/DARTS.md
index 2621fc45b7..d742a8ef6f 100644
--- a/docs/en_US/NAS/DARTS.md
+++ b/docs/en_US/NAS/DARTS.md
@@ -1,18 +1,50 @@
-# DARTS on NNI
+# DARTS
 
 ## Introduction
 
-The paper [DARTS: Differentiable Architecture Search](https://arxiv.org/abs/1806.09055) addresses the scalability challenge of architecture search by formulating the task in a differentiable manner. Their method is based on the continuous relaxation of the architecture representation, allowing efficient search of the architecture using gradient descent
+The paper [DARTS: Differentiable Architecture Search](https://arxiv.org/abs/1806.09055) addresses the scalability challenge of architecture search by formulating the task in a differentiable manner. Their method is based on the continuous relaxation of the architecture representation, allowing efficient search of the architecture using gradient descent.
 
-To implement, authors optimize the network weights and architecture weights alternatively in mini-batches. They further explore the possibility that uses second order optimization (unroll) instead of first order, to improve the performance.
+The authors' code optimizes the network weights and architecture weights alternately in mini-batches. They further explore the possibility of using second-order optimization (unroll) instead of first-order optimization to improve the performance.
 
-Implementation on NNI is based on the [official implementation](https://github.com/quark0/darts) and a [popular 3rd-party repo](https://github.com/khanrc/pt.darts). So far, first and second order optimization and training from scratch on CIFAR10 have been implemented.
+Implementation on NNI is based on the [official implementation](https://github.com/quark0/darts) and a [popular 3rd-party repo](https://github.com/khanrc/pt.darts). DARTS on NNI is designed to be general for arbitrary search spaces. A CNN search space tailored for CIFAR10, the same as in the original paper, is implemented as a use case of DARTS.
 
-## Reproduce Results
+## Reproduction Results
 
-To reproduce the results in the paper, we do experiments with first and second order optimization. Due to the time limit, we retrain *only the best architecture* derived from the search phase and we repeat the experiment *only once*. Our results is currently on par with the results reported in paper. We will add more results later when ready.
+The above-mentioned example is meant to reproduce the results in the paper. We ran experiments with first and second order optimization. Due to the time limit, we retrain *only the best architecture* derived from the search phase and we repeat the experiment *only once*. Our results are currently on par with the results reported in the paper. We will add more results later when ready.
 
 |                        | In paper      | Reproduction |
 | ---------------------- | ------------- | ------------ |
 | First order (CIFAR10)  | 3.00 +/- 0.14 | 2.78         |
 | Second order (CIFAR10) | 2.76 +/- 0.09 | 2.89         |
+
+## Examples
+
+### CNN Search Space
+
+[Example code](https://github.com/microsoft/nni/tree/master/examples/nas/darts)
+
+```bash
+# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
+git clone https://github.com/Microsoft/nni.git
+
+# search the best architecture
+cd examples/nas/darts
+python3 search.py
+
+# train the best architecture
+python3 retrain.py --arc-checkpoint ./checkpoints/epoch_49.json
+```
+
+## Reference
+
+### PyTorch
+
+```eval_rst
+..  autoclass:: nni.nas.pytorch.darts.DartsTrainer
+    :members:
+
+    .. automethod:: __init__
+
+..  autoclass:: nni.nas.pytorch.darts.DartsMutator
+    :members:
+```
diff --git a/docs/en_US/NAS/ENAS.md b/docs/en_US/NAS/ENAS.md
index 164bc4397d..ad389f28b9 100644
--- a/docs/en_US/NAS/ENAS.md
+++ b/docs/en_US/NAS/ENAS.md
@@ -1,7 +1,46 @@
-# ENAS on NNI
+# ENAS
 
 ## Introduction
 
 The paper [Efficient Neural Architecture Search via Parameter Sharing](https://arxiv.org/abs/1802.03268) uses parameter sharing between child models to accelerate the NAS process. In ENAS, a controller learns to discover neural network architectures by searching for an optimal subgraph within a large computational graph. The controller is trained with policy gradient to select a subgraph that maximizes the expected reward on the validation set. Meanwhile the model corresponding to the selected subgraph is trained to minimize a canonical cross entropy loss.
 
-Implementation on NNI is based on the [official implementation in Tensorflow](https://github.com/melodyguan/enas), macro and micro search space on CIFAR10 included. Since code to train from scratch on NNI is not ready yet, reproduction results are currently unavailable.
+Implementation on NNI is based on the [official implementation in Tensorflow](https://github.com/melodyguan/enas), including a general-purpose reinforcement-learning controller and a trainer that trains the target network and this controller alternately. Following the paper, we have also implemented macro and micro search spaces on CIFAR10 to demonstrate how to use these trainers. Since code to train from scratch on NNI is not ready yet, reproduction results are currently unavailable.
+
+## Examples
+
+### CIFAR10 Macro/Micro Search Space
+
+[Example code](https://github.com/microsoft/nni/tree/master/examples/nas/enas)
+
+```bash
+# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
+git clone https://github.com/Microsoft/nni.git
+
+# search the best architecture
+cd examples/nas/enas
+
+# search in macro search space
+python3 search.py --search-for macro
+
+# search in micro search space
+python3 search.py --search-for micro
+
+# view more options for search
+python3 search.py -h
+```
+
+## Reference
+
+### PyTorch
+
+```eval_rst
+..  autoclass:: nni.nas.pytorch.enas.EnasTrainer
+    :members:
+
+    .. automethod:: __init__
+
+..  autoclass:: nni.nas.pytorch.enas.EnasMutator
+    :members:
+
+    .. automethod:: __init__
+```
diff --git a/docs/en_US/NAS/Overview.md b/docs/en_US/NAS/Overview.md
index 3426673669..aaf6e599c3 100644
--- a/docs/en_US/NAS/Overview.md
+++ b/docs/en_US/NAS/Overview.md
@@ -6,7 +6,7 @@ However, it takes great efforts to implement NAS algorithms, and it is hard to r
 
 With this motivation, our ambition is to provide a unified architecture in NNI, to accelerate innovations on NAS, and apply state-of-art algorithms on real world problems faster.
 
-With [the unified interface](./NasInterface.md), there are two different modes for the architecture search. [The one](#supported-one-shot-nas-algorithms) is the so-called one-shot NAS, where a super-net is built based on search space, and using one shot training to generate good-performing child model. [The other](./NasInterface.md#classic-distributed-search) is the traditional searching approach, where each child model in search space runs as an independent trial, the performance result is sent to tuner and the tuner generates new child model.
+With [the unified interface](./NasInterface.md), there are two different modes for the architecture search. [One](#supported-one-shot-nas-algorithms) is the so-called one-shot NAS, where a super-net is built based on the search space and one-shot training is used to generate a good-performing child model. [The other](./NasInterface.md#classic-distributed-search) is the traditional searching approach, where each child model in the search space runs as an independent trial, the performance result is sent to the tuner, and the tuner generates a new child model.
 
 * [Supported One-shot NAS Algorithms](#supported-one-shot-nas-algorithms)
 * [Classic Distributed NAS with NNI experiment](./NasInterface.md#classic-distributed-search)
@@ -14,85 +14,24 @@ With [the unified interface](./NasInterface.md), there are two different modes f
 
 ## Supported One-shot NAS Algorithms
 
-NNI supports below NAS algorithms now and being adding more. User can reproduce an algorithm or use it on owned dataset. we also encourage user to implement other algorithms with [NNI API](#use-nni-api), to benefit more people.
+NNI currently supports the NAS algorithms below and is adding more. Users can reproduce an algorithm or use it on their own dataset. We also encourage users to implement other algorithms with [NNI API](#use-nni-api), to benefit more people.
 
 |Name|Brief Introduction of Algorithm|
 |---|---|
-| [ENAS](#enas) | Efficient Neural Architecture Search via Parameter Sharing [Reference Paper][1] |
-| [DARTS](#darts) | DARTS: Differentiable Architecture Search [Reference Paper][3] |
-| [P-DARTS](#p-darts) | Progressive Differentiable Architecture Search: Bridging the Depth Gap between Search and Evaluation [Reference Paper](https://arxiv.org/abs/1904.12760)|
+| [ENAS](ENAS.md) | [Efficient Neural Architecture Search via Parameter Sharing](https://arxiv.org/abs/1802.03268). In ENAS, a controller learns to discover neural network architectures by searching for an optimal subgraph within a large computational graph. It uses parameter sharing between child models to achieve fast speed and excellent performance. |
+| [DARTS](DARTS.md) | [DARTS: Differentiable Architecture Search](https://arxiv.org/abs/1806.09055) introduces a novel algorithm for differentiable network architecture search on bilevel optimization. |
+| [P-DARTS](PDARTS.md) | [Progressive Differentiable Architecture Search: Bridging the Depth Gap between Search and Evaluation](https://arxiv.org/abs/1904.12760) is based on DARTS. It introduces an efficient algorithm which allows the depth of searched architectures to grow gradually during the training procedure. |
+| [SPOS](SPOS.md) | [Single Path One-Shot Neural Architecture Search with Uniform Sampling](https://arxiv.org/abs/1904.00420) constructs a simplified supernet trained with a uniform path sampling method, and applies an evolutionary algorithm to efficiently search for the best-performing architectures. |
 
-Note, these algorithms run **standalone without nnictl**, and supports PyTorch only. Tensorflow 2.0 will be supported in future release.
+One-shot algorithms run **standalone without nnictl**. Only the PyTorch version has been implemented. Tensorflow 2.x will be supported in a future release.
 
-### Dependencies
+Here are some common dependencies to run the examples. PyTorch needs to be 1.2 or above to use ``BoolTensor``.
 
 * NNI 1.2+
 * tensorboard
 * PyTorch 1.2+
 * git
 
-### ENAS
-
-[Efficient Neural Architecture Search via Parameter Sharing][1]. In ENAS, a controller learns to discover neural network architectures by searching for an optimal subgraph within a large computational graph. It uses parameter sharing between child models to achieve fast speed and excellent performance.
-
-#### Usage
-
-ENAS in NNI is still under development and we only support search phase for macro/micro search space on CIFAR10. Training from scratch and search space on PTB has not been finished yet. [Detailed Description](ENAS.md)
-
-```bash
-# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
-git clone https://github.com/Microsoft/nni.git
-
-# search the best architecture
-cd examples/nas/enas
-
-# search in macro search space
-python3 search.py --search-for macro
-
-# search in micro search space
-python3 search.py --search-for micro
-
-# view more options for search
-python3 search.py -h
-```
-
-### DARTS
-
-The main contribution of [DARTS: Differentiable Architecture Search][3] on algorithm is to introduce a novel algorithm for differentiable network architecture search on bilevel optimization. [Detailed Description](DARTS.md)
-
-#### Usage
-
-```bash
-# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
-git clone https://github.com/Microsoft/nni.git
-
-# search the best architecture
-cd examples/nas/darts
-python3 search.py
-
-# train the best architecture
-python3 retrain.py --arc-checkpoint ./checkpoints/epoch_49.json
-```
-
-### P-DARTS
-
-[Progressive Differentiable Architecture Search: Bridging the Depth Gap between Search and Evaluation](https://arxiv.org/abs/1904.12760) bases on [DARTS](#DARTS). It's contribution on algorithm is to introduce an efficient algorithm which allows the depth of searched architectures to grow gradually during the training procedure.
-
-#### Usage
-
-```bash
-# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
-git clone https://github.com/Microsoft/nni.git
-
-# search the best architecture
-cd examples/nas/pdarts
-python3 search.py
-
-# train the best architecture, it's the same progress as darts.
-cd ../darts
-python3 retrain.py --arc-checkpoint ../pdarts/checkpoints/epoch_2.json
-```
-
 ## Use NNI API
 
 NOTE, we are trying to support various NAS algorithms with unified programming interface, and it's in very experimental stage. It means the current programing interface may be updated in future.
@@ -104,7 +43,7 @@ The programming interface of designing and searching a model is often demanded i
 1. When designing a neural network, there may be multiple operation choices on a layer, sub-model, or connection, and it's undetermined which one or combination performs  best. So, it needs an easy way to express the candidate layers or sub-models.
 2. When applying NAS on a neural network, it needs an unified way to express the search space of architectures, so that it doesn't need to update trial code for different searching algorithms.
 
-NNI proposed API is [here](https://github.com/microsoft/nni/tree/master/src/sdk/pynni/nni/nas/pytorch). And [here](https://github.com/microsoft/nni/tree/master/examples/nas/darts) is an example of NAS implementation, which bases on NNI proposed interface.
+NNI proposed API is [here](https://github.com/microsoft/nni/tree/master/src/sdk/pynni/nni/nas/pytorch). And [here](https://github.com/microsoft/nni/tree/master/examples/nas/naive) is an example of NAS implementation, which is based on the NNI proposed interface.
 
 [1]: https://arxiv.org/abs/1802.03268
 [2]: https://arxiv.org/abs/1707.07012
diff --git a/docs/en_US/NAS/PDARTS.md b/docs/en_US/NAS/PDARTS.md
new file mode 100644
index 0000000000..e7ebede1c8
--- /dev/null
+++ b/docs/en_US/NAS/PDARTS.md
@@ -0,0 +1,18 @@
+# P-DARTS
+
+## Examples
+
+[Example code](https://github.com/microsoft/nni/tree/master/examples/nas/pdarts)
+
+```bash
+# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter code folder.
+git clone https://github.com/Microsoft/nni.git
+
+# search the best architecture
+cd examples/nas/pdarts
+python3 search.py
+
+# train the best architecture, it's the same process as darts.
+cd ../darts
+python3 retrain.py --arc-checkpoint ../pdarts/checkpoints/epoch_2.json
+```
diff --git a/docs/en_US/NAS/SPOS.md b/docs/en_US/NAS/SPOS.md
new file mode 100644
index 0000000000..189310c1a1
--- /dev/null
+++ b/docs/en_US/NAS/SPOS.md
@@ -0,0 +1,119 @@
+# Single Path One-Shot (SPOS)
+
+## Introduction
+
+Proposed in [Single Path One-Shot Neural Architecture Search with Uniform Sampling](https://arxiv.org/abs/1904.00420), SPOS is a one-shot NAS method that addresses the difficulties in training One-Shot NAS models by constructing a simplified supernet trained with a uniform path sampling method, so that all underlying architectures (and their weights) get trained fully and equally. An evolutionary algorithm is then applied to efficiently search for the best-performing architectures without any fine tuning.
+
+Implementation on NNI is based on the [official repo](https://github.com/megvii-model/SinglePathOneShot). We implement a trainer that trains the supernet and an evolution tuner that leverages the power of the NNI framework to speed up the evolutionary search phase. We have also shown our current reproduction results at the end of this document.
+
+## Examples
+
+Here is a use case, which is the search space in the paper, and the way to use a flops limit to perform uniform sampling.
+
+[Example code](https://github.com/microsoft/nni/tree/master/examples/nas/spos)
+
+### Requirements
+
+NVIDIA DALI >= 0.16 is needed as we use DALI to accelerate the data loading of ImageNet. [Installation guide](https://docs.nvidia.com/deeplearning/sdk/dali-developer-guide/docs/installation.html)
+
+Download the flops lookup table from [here](https://1drv.ms/u/s!Am_mmG2-KsrnajesvSdfsq_cN48?e=aHVppN) (maintained by [Megvii](https://github.com/megvii-model)).
+Put `op_flops_dict.pkl` and `checkpoint-150000.pth.tar` (if you don't want to retrain the supernet) under `data` directory.
+
+Prepare ImageNet in the standard format (follow the script [here](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4)). Linking it to `data/imagenet` will be more convenient.
+
+After preparation, it's expected to have the following code structure:
+
+```
+spos
+├── architecture_final.json
+├── blocks.py
+├── config_search.yml
+├── data
+│   ├── imagenet
+│   │   ├── train
+│   │   └── val
+│   └── op_flops_dict.pkl
+├── dataloader.py
+├── network.py
+├── readme.md
+├── scratch.py
+├── supernet.py
+├── tester.py
+├── tuner.py
+└── utils.py
+```
+
+### Step 1. Train Supernet
+
+```
+python supernet.py
+```
+
+This will export the checkpoint to the `checkpoints` directory, for use in the next step.
+
+NOTE: The data loading used in the official repo is [slightly different from usual](https://github.com/megvii-model/SinglePathOneShot/issues/5), as they use BGR tensor and keep the values between 0 and 255 intentionally to align with their own DL framework. The option `--spos-preprocessing` will simulate the behavior used originally and enable you to use the checkpoints pretrained.
+
+### Step 2. Evolution Search
+
+Single Path One-Shot leverages evolution algorithm to search for the best architecture. The tester, which is responsible for testing the sampled architecture, recalculates all the batch norm for a subset of training images, and evaluates the architecture on the full validation set.
+
+In order to make the tuner aware of the flops limit and have the ability to calculate the flops, we created a new tuner called `EvolutionWithFlops` in `tuner.py`, inheriting the tuner in SDK.
+
+To have a search space ready for NNI framework, first run
+
+```
+nnictl ss_gen -t "python tester.py"
+```
+
+This will generate a file called `nni_auto_gen_search_space.json`, which is a serialized representation of your search space.
+
+By default, it will use `checkpoint-150000.pth.tar` downloaded previously. In case you want to use the checkpoint trained by yourself from the last step, specify `--checkpoint` in the command in `config_search.yml`.
+
+Then search with evolution tuner.
+
+```
+nnictl create --config config_search.yml
+```
+
+The final architecture exported from every epoch of evolution can be found in `checkpoints` under the working directory of your tuner, which, by default, is `$HOME/nni/experiments/your_experiment_id/log`.
+
+### Step 3. Train from Scratch
+
+```
+python scratch.py
+```
+
+By default, it will use `architecture_final.json`. This architecture is provided by the official repo (converted into NNI format). You can use any architecture (e.g., the architecture found in step 2) with `--fixed-arc` option.
+
+## Reference
+
+### PyTorch
+
+```eval_rst
+..  autoclass:: nni.nas.pytorch.spos.SPOSEvolution
+    :members:
+
+    .. automethod:: __init__
+
+..  autoclass:: nni.nas.pytorch.spos.SPOSSupernetTrainer
+    :members:
+
+    .. automethod:: __init__
+
+..  autoclass:: nni.nas.pytorch.spos.SPOSSupernetTrainingMutator
+    :members:
+
+    .. automethod:: __init__
+```
+
+## Known Limitations
+
+* Block search only. Channel search is not supported yet.
+* Only GPU version is provided here.
+
+## Current Reproduction Results
+
+Reproduction is still ongoing. Due to the gap between the official release and the original paper, we compare our current results with the official repo (our run) and the paper.
+
+* Evolution phase is almost aligned with official repo. Our evolution algorithm shows a converging trend and reaches ~65% accuracy at the end of search. Nevertheless, this result is not on par with paper. For details, please refer to [this issue](https://github.com/megvii-model/SinglePathOneShot/issues/6).
+* Retrain phase is not aligned. Our retraining code, which uses the architecture released by the authors, reaches 72.14% accuracy, still having a gap towards 73.61% by official release and 74.3% reported in original paper.
diff --git a/docs/en_US/nas.rst b/docs/en_US/nas.rst
index 2228e52d76..32c235b3bb 100644
--- a/docs/en_US/nas.rst
+++ b/docs/en_US/nas.rst
@@ -22,4 +22,5 @@ For details, please refer to the following tutorials:
     NAS Interface <NAS/NasInterface>
     ENAS <NAS/ENAS>
     DARTS <NAS/DARTS>
-    P-DARTS <NAS/Overview>
+    P-DARTS <NAS/PDARTS>
+    SPOS <NAS/SPOS>
diff --git a/examples/nas/darts/README.md b/examples/nas/darts/README.md
new file mode 100644
index 0000000000..6977be71ef
--- /dev/null
+++ b/examples/nas/darts/README.md
@@ -0,0 +1 @@
+[Documentation](https://nni.readthedocs.io/en/latest/NAS/DARTS.html)
diff --git a/examples/nas/enas/README.md b/examples/nas/enas/README.md
new file mode 100644
index 0000000000..c942ff41ad
--- /dev/null
+++ b/examples/nas/enas/README.md
@@ -0,0 +1 @@
+[Documentation](https://nni.readthedocs.io/en/latest/NAS/ENAS.html)
diff --git a/examples/nas/naive/README.md b/examples/nas/naive/README.md
new file mode 100644
index 0000000000..871d7f0fdd
--- /dev/null
+++ b/examples/nas/naive/README.md
@@ -0,0 +1 @@
+This is a naive example that demonstrates how to use NNI interface to implement a NAS search space.
\ No newline at end of file
diff --git a/examples/nas/pdarts/README.md b/examples/nas/pdarts/README.md
new file mode 100644
index 0000000000..15465360b1
--- /dev/null
+++ b/examples/nas/pdarts/README.md
@@ -0,0 +1 @@
+[Documentation](https://nni.readthedocs.io/en/latest/NAS/PDARTS.html)
diff --git a/examples/nas/spos/README.md b/examples/nas/spos/README.md
index 0bba5968f9..e9d3fafc86 100644
--- a/examples/nas/spos/README.md
+++ b/examples/nas/spos/README.md
@@ -1,90 +1 @@
-# Single Path One-Shot Neural Architecture Search with Uniform Sampling
-
-Single Path One-Shot by Megvii Research. [Paper link](https://arxiv.org/abs/1904.00420). [Official repo](https://github.com/megvii-model/SinglePathOneShot).
-
-Block search only. Channel search is not supported yet.
-
-Only GPU version is provided here.
-
-## Preparation
-
-### Requirements
-
-* PyTorch >= 1.2
-* NVIDIA DALI >= 0.16 as we use DALI to accelerate the data loading of ImageNet. [Installation guide](https://docs.nvidia.com/deeplearning/sdk/dali-developer-guide/docs/installation.html)
-
-### Data
-
-Need to download the flops lookup table from [here](https://1drv.ms/u/s!Am_mmG2-KsrnajesvSdfsq_cN48?e=aHVppN).
-Put `op_flops_dict.pkl` and `checkpoint-150000.pth.tar` (if you don't want to retrain the supernet) under `data` directory.
-
-Prepare ImageNet in the standard format (follow the script [here](https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4)). Linking it to `data/imagenet` will be more convenient.
-
-After preparation, it's expected to have the following code structure:
-
-```
-spos
-├── architecture_final.json
-├── blocks.py
-├── config_search.yml
-├── data
-│   ├── imagenet
-│   │   ├── train
-│   │   └── val
-│   └── op_flops_dict.pkl
-├── dataloader.py
-├── network.py
-├── readme.md
-├── scratch.py
-├── supernet.py
-├── tester.py
-├── tuner.py
-└── utils.py
-```
-
-## Step 1. Train Supernet
-
-```
-python supernet.py
-```
-
-Will export the checkpoint to `checkpoints` directory, for the next step.
-
-NOTE: The data loading used in the official repo is [slightly different from usual](https://github.com/megvii-model/SinglePathOneShot/issues/5), as they use BGR tensor and keep the values between 0 and 255 intentionally to align with their own DL framework. The option `--spos-preprocessing` will simulate the behavior used originally and enable you to use the checkpoints pretrained.
-
-## Step 2. Evolution Search
-
-Single Path One-Shot leverages evolution algorithm to search for the best architecture. The tester, which is responsible for testing the sampled architecture, recalculates all the batch norm for a subset of training images, and evaluates the architecture on the full validation set.
-
-To have a search space ready for NNI framework, first run
-
-```
-nnictl ss_gen -t "python tester.py"
-```
-
-This will generate a file called `nni_auto_gen_search_space.json`, which is a serialized representation of your search space.
-
-By default, it will use `checkpoint-150000.pth.tar` downloaded previously. In case you want to use the checkpoint trained by yourself from the last step, specify `--checkpoint` in the command in `config_search.yml`.
-
-Then search with evolution tuner.
-
-```
-nnictl create --config config_search.yml
-```
-
-The final architecture exported from every epoch of evolution can be found in `checkpoints` under the working directory of your tuner, which, by default, is `$HOME/nni/experiments/your_experiment_id/log`.
-
-## Step 3. Train from Scratch
-
-```
-python scratch.py
-```
-
-By default, it will use `architecture_final.json`. This architecture is provided by the official repo (converted into NNI format). You can use any architecture (e.g., the architecture found in step 2) with `--fixed-arc` option.
-
-## Current Reproduction Results
-
-Reproduction is still undergoing. Due to the gap between official release and original paper, we compare our current results with official repo (our run) and paper.
-
-* Evolution phase is almost aligned with official repo. Our evolution algorithm shows a converging trend and reaches ~65% accuracy at the end of search. Nevertheless, this result is not on par with paper. For details, please refer to [this issue](https://github.com/megvii-model/SinglePathOneShot/issues/6).
-* Retrain phase is not aligned. Our retraining code, which uses the architecture released by the authors, reaches 72.14% accuracy, still having a gap towards 73.61% by official release and 74.3% reported in original paper.
+[Documentation](https://nni.readthedocs.io/en/latest/NAS/SPOS.html)
diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/mutator.py b/src/sdk/pynni/nni/nas/pytorch/darts/mutator.py
index b257e32351..b3a21f3a31 100644
--- a/src/sdk/pynni/nni/nas/pytorch/darts/mutator.py
+++ b/src/sdk/pynni/nni/nas/pytorch/darts/mutator.py
@@ -14,6 +14,26 @@
 
 
 class DartsMutator(Mutator):
+    """
+    Connects the model in a DARTS (differentiable) way.
+
+    An extra connection is automatically inserted for each LayerChoice. When this connection is selected, there is no
+    op on this LayerChoice (namely a ``ZeroOp``), in which case, every element in the exported choice list is ``false``
+    (not chosen).
+
+    All input choice will be fully connected in the search phase. On exporting, the input choice will choose inputs based
+    on keys in ``choose_from``. If the keys were to be keys of LayerChoices, the top logit of the corresponding LayerChoice
+    will join the competition of input choice to compete against other logits. Otherwise, the logit will be assumed 0.
+
+    It's possible to cut branches by setting parameter ``choices`` in a particular position to ``-inf``. After softmax, the
+    value would be 0. Framework will ignore 0 values and not connect. Note that the gradient on the ``-inf`` location will
+    be 0. Since manipulations with ``-inf`` will be ``nan``, you need to handle the gradient update phase carefully.
+
+    Attributes
+    ----------
+    choices: ParameterDict
+        dict that maps keys of LayerChoices to weighted-connection float tensors.
+    """
     def __init__(self, model):
         super().__init__(model)
         self.choices = nn.ParameterDict()
diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py
index 2032631b2b..9ea2085852 100644
--- a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py
+++ b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py
@@ -19,6 +19,42 @@ def __init__(self, model, loss, metrics,
                  optimizer, num_epochs, dataset_train, dataset_valid,
                  mutator=None, batch_size=64, workers=4, device=None, log_frequency=None,
                  callbacks=None, arc_learning_rate=3.0E-4, unrolled=False):
+        """
+        Initialize a DartsTrainer.
+
+        Parameters
+        ----------
+        model : nn.Module
+            PyTorch model to be trained.
+        loss : callable
+            Receives logits and ground truth label, return a loss tensor.
+        metrics : callable
+            Receives logits and ground truth label, return a dict of metrics.
+        optimizer : Optimizer
+            The optimizer used for optimizing the model.
+        num_epochs : int
+            Number of epochs planned for training.
+        dataset_train : Dataset
+            Dataset for training. Will be split for training weights and architecture weights.
+        dataset_valid : Dataset
+            Dataset for testing.
+        mutator : DartsMutator
+            Use in case of customizing your own DartsMutator. By default will instantiate a DartsMutator.
+        batch_size : int
+            Batch size.
+        workers : int
+            Workers for data loading.
+        device : torch.device
+            ``torch.device("cpu")`` or ``torch.device("cuda")``.
+        log_frequency : int
+            Step count per logging.
+        callbacks : list of Callback
+            list of callbacks to trigger at events.
+        arc_learning_rate : float
+            Learning rate of architecture parameters.
+        unrolled : bool
+            ``True`` if using second order optimization, else first order optimization.
+        """
         super().__init__(model, mutator if mutator is not None else DartsMutator(model),
                          loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid,
                          batch_size, workers, device, log_frequency, callbacks)
diff --git a/src/sdk/pynni/nni/nas/pytorch/enas/mutator.py b/src/sdk/pynni/nni/nas/pytorch/enas/mutator.py
index ec079c3e5d..1f2f9bd7ad 100644
--- a/src/sdk/pynni/nni/nas/pytorch/enas/mutator.py
+++ b/src/sdk/pynni/nni/nas/pytorch/enas/mutator.py
@@ -31,6 +31,31 @@ class EnasMutator(Mutator):
 
     def __init__(self, model, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, cell_exit_extra_step=False,
                  skip_target=0.4, branch_bias=0.25):
+        """
+        Initialize an EnasMutator.
+
+        Parameters
+        ----------
+        model : nn.Module
+            PyTorch model.
+        lstm_size : int
+            Controller LSTM hidden units.
+        lstm_num_layers : int
+            Number of layers for stacked LSTM.
+        tanh_constant : float
+            Logits will be equal to ``tanh_constant * tanh(logits)``. Don't use ``tanh`` if this value is ``None``.
+        cell_exit_extra_step : bool
+            If true, RL controller will perform an extra step at the exit of each MutableScope, dump the hidden state
+            and mark it as the hidden state of this MutableScope. This is to align with the original implementation of paper.
+        skip_target : float
+            Target probability that skipconnect will appear.
+        branch_bias : float
+            Manual bias applied to make some operations more likely to be chosen.
+            Currently this is implemented with a hardcoded match rule that aligns with original repo.
+            If a mutable has a ``reduce`` in its key, all its op choices
+            that contain ``conv`` in their typename will receive a bias of ``+self.branch_bias`` initially; while others
+            receive a bias of ``-self.branch_bias``.
+        """
         super().__init__(model)
         self.lstm_size = lstm_size
         self.lstm_num_layers = lstm_num_layers
diff --git a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py
index 4c198594ab..6cd5924f39 100644
--- a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py
+++ b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py
@@ -18,6 +18,54 @@ def __init__(self, model, loss, metrics, reward_function,
                  mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, callbacks=None,
                  entropy_weight=0.0001, skip_weight=0.8, baseline_decay=0.999,
                  mutator_lr=0.00035, mutator_steps_aggregate=20, mutator_steps=50, aux_weight=0.4):
+        """
+        Initialize an EnasTrainer.
+
+        Parameters
+        ----------
+        model : nn.Module
+            PyTorch model to be trained.
+        loss : callable
+            Receives logits and ground truth label, return a loss tensor.
+        metrics : callable
+            Receives logits and ground truth label, return a dict of metrics.
+        reward_function : callable
+            Receives logits and ground truth label, return a tensor, which will be fed to the RL controller as reward.
+        optimizer : Optimizer
+            The optimizer used for optimizing the model.
+        num_epochs : int
+            Number of epochs planned for training.
+        dataset_train : Dataset
+            Dataset for training. Will be split for training weights and architecture weights.
+        dataset_valid : Dataset
+            Dataset for testing.
+        mutator : EnasMutator
+            Use when customizing your own mutator or a mutator with customized parameters.
+        batch_size : int
+            Batch size.
+        workers : int
+            Workers for data loading.
+        device : torch.device
+            ``torch.device("cpu")`` or ``torch.device("cuda")``.
+        log_frequency : int
+            Step count per logging.
+        callbacks : list of Callback
+            list of callbacks to trigger at events.
+        entropy_weight : float
+            Weight of sample entropy loss.
+        skip_weight : float
+            Weight of skip penalty loss.
+        baseline_decay : float
+            Decay factor of baseline. New baseline will be equal to ``baseline_decay * baseline_old + reward * (1 - baseline_decay)``.
+        mutator_lr : float
+            Learning rate for RL controller.
+        mutator_steps_aggregate : int
+            Number of steps that will be aggregated into one mini-batch for RL controller.
+        mutator_steps : int
+            Number of mini-batches for each epoch of RL controller learning.
+        aux_weight : float
+            Weight of auxiliary head loss. ``aux_weight * aux_loss`` will be added to total loss.
+        """
         super().__init__(model, mutator if mutator is not None else EnasMutator(model),
                          loss, metrics, optimizer, num_epochs, dataset_train, dataset_valid,
                          batch_size, workers, device, log_frequency, callbacks)
diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py
index 3541c81fd7..2eb07fac10 100644
--- a/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py
+++ b/src/sdk/pynni/nni/nas/pytorch/spos/evolution.py
@@ -211,6 +211,7 @@ def export_results(self, result):
         Parameters
         ----------
         result : dict
+            Chosen architectures to be exported.
         """
         os.makedirs("checkpoints", exist_ok=True)
         for i, cand in enumerate(result):
diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py
index 88a01eeeaf..838f2fcd05 100644
--- a/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py
+++ b/src/sdk/pynni/nni/nas/pytorch/spos/mutator.py
@@ -17,6 +17,7 @@ def __init__(self, model, flops_func=None, flops_lb=None, flops_ub=None,
         Parameters
         ----------
         model : nn.Module
+            PyTorch model.
         flops_func : callable
             Callable that takes a candidate from `sample_search` and returns its candidate. When `flops_func`
             is None, functions related to flops will be deactivated.
diff --git a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py
index ab23760bf9..3b5e69f8cd 100644
--- a/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py
+++ b/src/sdk/pynni/nni/nas/pytorch/spos/trainer.py
@@ -21,6 +21,37 @@ def __init__(self, model, loss, metrics,
                  optimizer, num_epochs, train_loader, valid_loader,
                  mutator=None, batch_size=64, workers=4, device=None, log_frequency=None,
                  callbacks=None):
+        """
+        Parameters
+        ----------
+        model : nn.Module
+            Model with mutables.
+        mutator : Mutator
+            A mutator object that has been initialized with the model.
+        loss : callable
+            Called with logits and targets. Returns a loss tensor.
+        metrics : callable
+            Returns a dict that maps metrics keys to metrics data.
+        optimizer : Optimizer
+            Optimizer that optimizes the model.
+        num_epochs : int
+            Number of epochs of training.
+        train_loader : iterable
+            Data loader of training. Raise ``StopIteration`` when one epoch is exhausted.
+        valid_loader : iterable
+            Data loader of validation. Raise ``StopIteration`` when one epoch is exhausted.
+        batch_size : int
+            Batch size.
+        workers: int
+            Number of threads for data preprocessing. Not used for this trainer. May be removed in the future.
+        device : torch.device
+            Device object. Either ``torch.device("cuda")`` or ``torch.device("cpu")``. When ``None``, trainer will
+            automatically detect a GPU and select it first.
+        log_frequency : int
+            Number of mini-batches to log metrics.
+        callbacks : list of Callback
+            Callbacks to plug into the trainer. See Callbacks.
+        """
         assert torch.cuda.is_available()
         super().__init__(model, mutator if mutator is not None else SPOSSupernetTrainingMutator(model),
                          loss, metrics, optimizer, num_epochs, None, None,
diff --git a/src/sdk/pynni/nni/nas/pytorch/trainer.py b/src/sdk/pynni/nni/nas/pytorch/trainer.py
index 32ba2e2709..218d6a2d50 100644
--- a/src/sdk/pynni/nni/nas/pytorch/trainer.py
+++ b/src/sdk/pynni/nni/nas/pytorch/trainer.py
@@ -52,7 +52,7 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs,
         workers : int
             Number of workers used in data preprocessing.
         device : torch.device
-            Device object. Either `torch.device("cuda")` or torch.device("cpu")`. When `None`, trainer will
+            Device object. Either ``torch.device("cuda")`` or ``torch.device("cpu")``. When ``None``, trainer will
             automatic detects GPU and selects GPU first.
         log_frequency : int
             Number of mini-batches to log metrics.

From a6467ad88d1090543b842804d28e7f162b1f1c02 Mon Sep 17 00:00:00 2001
From: Yan Ni <yann@microsoft.com>
Date: Tue, 31 Dec 2019 13:23:57 +0800
Subject: [PATCH 14/14] release note draft for v1.3 (#1895)

---
 docs/en_US/Release.md | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/docs/en_US/Release.md b/docs/en_US/Release.md
index 811a030b1a..02e6604767 100644
--- a/docs/en_US/Release.md
+++ b/docs/en_US/Release.md
@@ -1,5 +1,38 @@
 # ChangeLog
 
+## Release 1.3 - 12/30/2019
+
+### Major Features
+
+#### Neural Architecture Search Algorithms Support
+* [Single Path One Shot](https://github.com/microsoft/nni/tree/v1.3/examples/nas/spos/) algorithm and the example using it
+
+#### Model Compression Algorithms Support
+* [Knowledge Distillation](https://github.com/microsoft/nni/blob/v1.3/docs/en_US/TrialExample/KDExample.md) algorithm and the example using it
+* Pruners
+    * [L2Filter Pruner](https://github.com/microsoft/nni/blob/v1.3/docs/en_US/Compressor/Pruner.md#3-l2filter-pruner)
+    * [ActivationAPoZRankFilterPruner](https://github.com/microsoft/nni/blob/v1.3/docs/en_US/Compressor/Pruner.md#1-activationapozrankfilterpruner)
+    * [ActivationMeanRankFilterPruner](https://github.com/microsoft/nni/blob/v1.3/docs/en_US/Compressor/Pruner.md#2-activationmeanrankfilterpruner)
+* [BNN Quantizer](https://github.com/microsoft/nni/blob/v1.3/docs/en_US/Compressor/Quantizer.md#bnn-quantizer)
+#### Training Service
+* NFS Support for PAI
+    
+    Instead of using HDFS as default storage, since OpenPAI v0.11, OpenPAI can have NFS or AzureBlob or other storage as default storage. In this release, NNI extended the support for this recent change made by OpenPAI, and could integrate with OpenPAI v0.11 or later version with various default storage.
+
+* Kubeflow update adoption
+
+    Adopted Kubeflow 0.7's new support for tf-operator.
+
+### Engineering (code and build automation)
+* Enforced [ESLint](https://eslint.org/) on static code analysis.
+
+### Small changes & Bug Fixes
+* correctly recognize builtin tuner and customized tuner
+* logging in dispatcher base
+* fix the bug where tuner/assessor's failure sometimes kills the experiment.
+* Fix local system as remote machine [issue](https://github.com/microsoft/nni/issues/1852)
+* de-duplicate trial configuration in smac tuner [ticket](https://github.com/microsoft/nni/issues/1364)
+
 ## Release 1.2 - 12/02/2019
 
 ### Major Features