From 10ec345a6ca81f53f9c469ec464fd898c1615e1f Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 20 Nov 2019 11:13:04 +0800 Subject: [PATCH 01/30] update pdarts to use new darts --- docs/en_US/NAS/Overview.md | 124 ++++++------ examples/nas/pdarts/{main.py => search.py} | 31 ++- .../pynni/nni/nas/pytorch/darts/cnn_cell.py | 69 ------- .../nni/nas/pytorch/darts/cnn_network.py | 73 ------- .../pynni/nni/nas/pytorch/darts/cnn_ops.py | 189 ------------------ src/sdk/pynni/nni/nas/pytorch/modules.py | 9 - .../pynni/nni/nas/pytorch/pdarts/mutator.py | 50 +++-- .../pynni/nni/nas/pytorch/pdarts/trainer.py | 20 +- 8 files changed, 113 insertions(+), 452 deletions(-) rename examples/nas/pdarts/{main.py => search.py} (74%) delete mode 100644 src/sdk/pynni/nni/nas/pytorch/darts/cnn_cell.py delete mode 100644 src/sdk/pynni/nni/nas/pytorch/darts/cnn_network.py delete mode 100644 src/sdk/pynni/nni/nas/pytorch/darts/cnn_ops.py delete mode 100644 src/sdk/pynni/nni/nas/pytorch/modules.py diff --git a/docs/en_US/NAS/Overview.md b/docs/en_US/NAS/Overview.md index 92b06b413f..c5aaeeb795 100644 --- a/docs/en_US/NAS/Overview.md +++ b/docs/en_US/NAS/Overview.md @@ -1,62 +1,62 @@ -# Neural Architecture Search (NAS) on NNI - -Automatic neural architecture search is taking an increasingly important role on finding better models. Recent research works have proved the feasibility of automatic NAS, and also found some models that could beat manually designed and tuned models. Some of representative works are [NASNet][2], [ENAS][1], [DARTS][3], [Network Morphism][4], and [Evolution][5]. There are new innovations keeping emerging. - -However, it takes great efforts to implement NAS algorithms, and it is hard to reuse code base of existing algorithms in new one. To facilitate NAS innovations (e.g., design and implement new NAS models, compare different NAS models side-by-side), an easy-to-use and flexible programming interface is crucial. - -With this motivation, our ambition is to provide a unified architecture in NNI, to accelerate innovations on NAS, and apply state-of-art algorithms on real world problems faster. - -## Supported algorithms - -NNI supports below NAS algorithms now, and being adding more. User can reproduce an algorithm, or use it on owned dataset. we also encourage user to implement other algorithms with [NNI API](#use-nni-api), to benefit more people. - -Note, these algorithms run standalone without nnictl, and supports PyTorch only. - -### DARTS - -The main contribution of [DARTS: Differentiable Architecture Search][3] on algorithm is to introduce a novel algorithm for differentiable network architecture search on bilevel optimization. - -#### Usage - -```bash -### In case NNI code is not cloned. -git clone https://github.com/Microsoft/nni.git - -cd examples/nas/darts -python search.py -``` - -### P-DARTS - -[Progressive Differentiable Architecture Search: Bridging the Depth Gap between Search and Evaluation](https://arxiv.org/abs/1904.12760) bases on DARTS(#DARTS). It main contribution on algorithm is to introduce an efficient algorithm which allows the depth of searched architectures to grow gradually during the training procedure. - -#### Usage - -```bash -### In case NNI code is not cloned. -git clone https://github.com/Microsoft/nni.git - -cd examples/nas/pdarts -python main.py -``` - -## Use NNI API - -NOTE, we are trying to support various NAS algorithms with unified programming interface, and it's in very experimental stage. 
It means the current programing interface may be updated significantly.
-
-*previous [NAS annotation](../AdvancedFeature/GeneralNasInterfaces.md) interface will be deprecated soon.*
-
-### Programming interface
-
-The programming interface of designing and searching a model is often demanded in two scenarios.
-
-1. When designing a neural network, there may be multiple operation choices on a layer, sub-model, or connection, and it's undetermined which one or combination performs best. So it needs an easy way to express the candidate layers or sub-models.
-2. When applying NAS on a neural network, it needs an unified way to express the search space of architectures, so that it doesn't need to update trial code for different searching algorithms.
-
-NNI proposed API is [here](https://github.com/microsoft/nni/tree/dev-nas-refactor/src/sdk/pynni/nni/nas/pytorch). And [here](https://github.com/microsoft/nni/tree/dev-nas-refactor/examples/nas/darts) is an example of NAS implementation, which bases on NNI proposed interface.
-
-[1]: https://arxiv.org/abs/1802.03268
-[2]: https://arxiv.org/abs/1707.07012
-[3]: https://arxiv.org/abs/1806.09055
-[4]: https://arxiv.org/abs/1806.10282
-[5]: https://arxiv.org/abs/1703.01041
+# Neural Architecture Search (NAS) on NNI
+
+Automatic neural architecture search is taking an increasingly important role in finding better models. Recent research has proved the feasibility of automatic NAS and has found models that beat manually designed and tuned ones. Representative works include [NASNet][2], [ENAS][1], [DARTS][3], [Network Morphism][4], and [Evolution][5], and new innovations keep emerging.
+
+However, it takes great effort to implement NAS algorithms, and it is hard to reuse the code base of an existing algorithm in a new one. To facilitate NAS innovations (e.g., designing and implementing new NAS models, comparing different NAS models side-by-side), an easy-to-use and flexible programming interface is crucial.
+
+With this motivation, our ambition is to provide a unified architecture in NNI, to accelerate innovation on NAS and to apply state-of-the-art algorithms to real-world problems faster.
+
+## Supported algorithms
+
+NNI now supports the NAS algorithms listed below, and more are being added. Users can reproduce an algorithm or run it on their own dataset. We also encourage users to implement other algorithms with the [NNI API](#use-nni-api), to benefit more people.
+
+Note that these algorithms run standalone without nnictl, and support PyTorch only.
+
+### DARTS
+
+The main contribution of [DARTS: Differentiable Architecture Search][3] is to introduce a novel algorithm for differentiable network architecture search based on bilevel optimization.
+
+#### Usage
+
+```bash
+### In case NNI code is not cloned.
+git clone https://github.com/Microsoft/nni.git
+
+cd examples/nas/darts
+python search.py
+```
+
+### P-DARTS
+
+[Progressive Differentiable Architecture Search: Bridging the Depth Gap between Search and Evaluation](https://arxiv.org/abs/1904.12760) is based on [DARTS](#darts). Its main contribution is an efficient algorithm that allows the depth of searched architectures to grow gradually during the training procedure.
+
+#### Usage
+
+```bash
+### In case NNI code is not cloned.
+git clone https://github.com/Microsoft/nni.git
+
+cd examples/nas/pdarts
+python search.py
+```
+
+## Use NNI API
+
+NOTE: we are trying to support various NAS algorithms with a unified programming interface, and it is in a very experimental stage.
This means the current programming interface may be updated significantly.
+
+*The previous [NAS annotation](../AdvancedFeature/GeneralNasInterfaces.md) interface will be deprecated soon.*
+
+### Programming interface
+
+The programming interface of designing and searching a model is often demanded in two scenarios.
+
+1. When designing a neural network, there may be multiple operation choices on a layer, sub-model, or connection, and it's undetermined which one or combination performs best. So, it needs an easy way to express the candidate layers or sub-models.
+2. When applying NAS on a neural network, it needs a unified way to express the search space of architectures, so that it doesn't need to update trial code for different searching algorithms.
+
+The NNI proposed API is [here](https://github.com/microsoft/nni/tree/dev-nas-refactor/src/sdk/pynni/nni/nas/pytorch). And [here](https://github.com/microsoft/nni/tree/dev-nas-refactor/examples/nas/darts) is an example of a NAS implementation based on the proposed interface.
+
+[1]: https://arxiv.org/abs/1802.03268
+[2]: https://arxiv.org/abs/1707.07012
+[3]: https://arxiv.org/abs/1806.09055
+[4]: https://arxiv.org/abs/1806.10282
+[5]: https://arxiv.org/abs/1703.01041
diff --git a/examples/nas/pdarts/main.py b/examples/nas/pdarts/search.py
similarity index 74%
rename from examples/nas/pdarts/main.py
rename to examples/nas/pdarts/search.py
index 68a59c8856..c186e4cd86 100644
--- a/examples/nas/pdarts/main.py
+++ b/examples/nas/pdarts/search.py
@@ -5,7 +5,8 @@
 import torch.nn as nn
 
 import nni.nas.pytorch as nas
 from nni.nas.pytorch.pdarts import PdartsTrainer
-from nni.nas.pytorch.darts import CnnNetwork, CnnCell
+# pylint: disable=relative-beyond-top-level
+from ..darts.model import CNN
 
 
 def accuracy(output, target, topk=(1,)):
@@ -29,37 +30,35 @@ def accuracy(output, target, topk=(1,)):
 
 
 if __name__ == "__main__":
-    parser = ArgumentParser("darts")
-    parser.add_argument("--layers", default=5, type=int)
+    parser = ArgumentParser("pdarts")
     parser.add_argument('--add_layers', action='append',
                         default=[0, 6, 12], help='add layers')
     parser.add_argument("--nodes", default=4, type=int)
+    parser.add_argument("--layers", default=5, type=int)
     parser.add_argument("--batch-size", default=128, type=int)
     parser.add_argument("--log-frequency", default=1, type=int)
+    parser.add_argument("--epochs", default=50, type=int)
     args = parser.parse_args()
 
     dataset_train, dataset_valid = datasets.get_dataset("cifar10")
 
-    def model_creator(layers, n_nodes):
-        model = CnnNetwork(3, 16, 10, layers, n_nodes=n_nodes, cell_type=CnnCell)
-        loss = nn.CrossEntropyLoss()
+    def model_creator(layers):
+        model = CNN(32, 3, 16, 10, layers, n_nodes=args.nodes)
+        criterion = nn.CrossEntropyLoss()
+
+        optim = torch.optim.SGD(model.parameters(), 0.025, momentum=0.9, weight_decay=3.0E-4)
+        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001)
 
-        model_optim = torch.optim.SGD(model.parameters(), 0.025,
-                                      momentum=0.9, weight_decay=3.0E-4)
-        n_epochs = 50
-        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(model_optim, n_epochs, eta_min=0.001)
-        return model, loss, model_optim, lr_scheduler
+        return model, criterion, optim, lr_scheduler
 
     trainer = PdartsTrainer(model_creator,
+                            layers=args.layers,
                             metrics=lambda output, target: accuracy(output, target, topk=(1,)),
-                            num_epochs=50,
                             pdarts_num_layers=[0, 6, 12],
                             pdarts_num_to_drop=[3, 2, 2],
+                            num_epochs=args.epochs,
                             dataset_train=dataset_train,
                             dataset_valid=dataset_valid,
-                            layers=args.layers,
-
n_nodes=args.nodes, batch_size=args.batch_size, log_frequency=args.log_frequency) - trainer.train() - trainer.export() + trainer.train_and_validate() diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/cnn_cell.py b/src/sdk/pynni/nni/nas/pytorch/darts/cnn_cell.py deleted file mode 100644 index 69dc28e8f0..0000000000 --- a/src/sdk/pynni/nni/nas/pytorch/darts/cnn_cell.py +++ /dev/null @@ -1,69 +0,0 @@ - -import torch -import torch.nn as nn - -import nni.nas.pytorch as nas -from nni.nas.pytorch.modules import RankedModule - -from .cnn_ops import OPS, PRIMITIVES, FactorizedReduce, StdConv - - -class CnnCell(RankedModule): - """ - Cell for search. - """ - - def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction): - """ - Initialization a search cell. - - Parameters - ---------- - n_nodes: int - Number of nodes in current DAG. - channels_pp: int - Number of output channels from previous previous cell. - channels_p: int - Number of output channels from previous cell. - channels: int - Number of channels that will be used in the current DAG. - reduction_p: bool - Flag for whether the previous cell is reduction cell or not. - reduction: bool - Flag for whether the current cell is reduction cell or not. - """ - super(CnnCell, self).__init__(rank=1, reduction=reduction) - self.n_nodes = n_nodes - - # If previous cell is reduction cell, current input size does not match with - # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing. - if reduction_p: - self.preproc0 = FactorizedReduce(channels_pp, channels, affine=False) - else: - self.preproc0 = StdConv(channels_pp, channels, 1, 1, 0, affine=False) - self.preproc1 = StdConv(channels_p, channels, 1, 1, 0, affine=False) - - # generate dag - self.mutable_ops = nn.ModuleList() - for depth in range(self.n_nodes): - self.mutable_ops.append(nn.ModuleList()) - for i in range(2 + depth): # include 2 input nodes - # reduction should be used only for input node - stride = 2 if reduction and i < 2 else 1 - m_ops = [] - for primitive in PRIMITIVES: - op = OPS[primitive](channels, stride, False) - m_ops.append(op) - op = nas.mutables.LayerChoice(m_ops, key="r{}_d{}_i{}".format(reduction, depth, i)) - self.mutable_ops[depth].append(op) - - def forward(self, s0, s1): - # s0, s1 are the outputs of previous previous cell and previous cell, respectively. - tensors = [self.preproc0(s0), self.preproc1(s1)] - for ops in self.mutable_ops: - assert len(ops) == len(tensors) - cur_tensor = sum(op(tensor) for op, tensor in zip(ops, tensors)) - tensors.append(cur_tensor) - - output = torch.cat(tensors[2:], dim=1) - return output diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/cnn_network.py b/src/sdk/pynni/nni/nas/pytorch/darts/cnn_network.py deleted file mode 100644 index d126e3353e..0000000000 --- a/src/sdk/pynni/nni/nas/pytorch/darts/cnn_network.py +++ /dev/null @@ -1,73 +0,0 @@ - -import torch.nn as nn - -from .cnn_cell import CnnCell - - -class CnnNetwork(nn.Module): - """ - Search CNN model - """ - - def __init__(self, in_channels, channels, n_classes, n_layers, n_nodes=4, stem_multiplier=3, cell_type=CnnCell): - """ - Initializing a search channelsNN. - - Parameters - ---------- - in_channels: int - Number of channels in images. - channels: int - Number of channels used in the network. - n_classes: int - Number of classes. - n_layers: int - Number of cells in the whole network. - n_nodes: int - Number of nodes in a cell. - stem_multiplier: int - Multiplier of channels in STEM. 
- """ - super().__init__() - self.in_channels = in_channels - self.channels = channels - self.n_classes = n_classes - self.n_layers = n_layers - - c_cur = stem_multiplier * self.channels - self.stem = nn.Sequential( - nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False), - nn.BatchNorm2d(c_cur) - ) - - # for the first cell, stem is used for both s0 and s1 - # [!] channels_pp and channels_p is output channel size, but c_cur is input channel size. - channels_pp, channels_p, c_cur = c_cur, c_cur, channels - - self.cells = nn.ModuleList() - reduction_p, reduction = False, False - for i in range(n_layers): - reduction_p, reduction = reduction, False - # Reduce featuremap size and double channels in 1/3 and 2/3 layer. - if i in [n_layers // 3, 2 * n_layers // 3]: - c_cur *= 2 - reduction = True - - cell = cell_type(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction) - self.cells.append(cell) - c_cur_out = c_cur * n_nodes - channels_pp, channels_p = channels_p, c_cur_out - - self.gap = nn.AdaptiveAvgPool2d(1) - self.linear = nn.Linear(channels_p, n_classes) - - def forward(self, x): - s0 = s1 = self.stem(x) - - for cell in self.cells: - s0, s1 = s1, cell(s0, s1) - - out = self.gap(s1) - out = out.view(out.size(0), -1) # flatten - logits = self.linear(out) - return logits diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/cnn_ops.py b/src/sdk/pynni/nni/nas/pytorch/darts/cnn_ops.py deleted file mode 100644 index 02b4a3a94c..0000000000 --- a/src/sdk/pynni/nni/nas/pytorch/darts/cnn_ops.py +++ /dev/null @@ -1,189 +0,0 @@ -import torch -import torch.nn as nn - -PRIMITIVES = [ - 'none', - 'max_pool_3x3', - 'avg_pool_3x3', - 'skip_connect', # identity - 'sep_conv_3x3', - 'sep_conv_5x5', - 'dil_conv_3x3', - 'dil_conv_5x5', -] - -OPS = { - 'none': lambda C, stride, affine: Zero(stride), - 'avg_pool_3x3': lambda C, stride, affine: PoolBN('avg', C, 3, stride, 1, affine=affine), - 'max_pool_3x3': lambda C, stride, affine: PoolBN('max', C, 3, stride, 1, affine=affine), - 'skip_connect': lambda C, stride, affine: Identity() if stride == 1 else FactorizedReduce(C, C, affine=affine), - 'sep_conv_3x3': lambda C, stride, affine: SepConv(C, C, 3, stride, 1, affine=affine), - 'sep_conv_5x5': lambda C, stride, affine: SepConv(C, C, 5, stride, 2, affine=affine), - 'sep_conv_7x7': lambda C, stride, affine: SepConv(C, C, 7, stride, 3, affine=affine), - 'dil_conv_3x3': lambda C, stride, affine: DilConv(C, C, 3, stride, 2, 2, affine=affine), # 5x5 - 'dil_conv_5x5': lambda C, stride, affine: DilConv(C, C, 5, stride, 4, 2, affine=affine), # 9x9 - 'conv_7x1_1x7': lambda C, stride, affine: FacConv(C, C, 7, stride, 3, affine=affine) -} - - -def drop_path_(x, drop_prob, training): - if training and drop_prob > 0.: - keep_prob = 1. - drop_prob - # per data point mask; assuming x in cuda. - mask = torch.cuda.FloatTensor(x.size(0), 1, 1, 1).bernoulli_(keep_prob) - x.div_(keep_prob).mul_(mask) - - return x - - -class DropPath_(nn.Module): - def __init__(self, p=0.): - """ [!] DropPath is inplace module - Args: - p: probability of an path to be zeroed. 
- """ - super().__init__() - self.p = p - - def extra_repr(self): - return 'p={}, inplace'.format(self.p) - - def forward(self, x): - drop_path_(x, self.p, self.training) - - return x - - -class PoolBN(nn.Module): - """ - AvgPool or MaxPool - BN - """ - - def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True): - """ - Args: - pool_type: 'max' or 'avg' - """ - super().__init__() - if pool_type.lower() == 'max': - self.pool = nn.MaxPool2d(kernel_size, stride, padding) - elif pool_type.lower() == 'avg': - self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False) - else: - raise ValueError() - - self.bn = nn.BatchNorm2d(C, affine=affine) - - def forward(self, x): - out = self.pool(x) - out = self.bn(out) - return out - - -class StdConv(nn.Module): - """ Standard conv - ReLU - Conv - BN - """ - - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class FacConv(nn.Module): - """ Factorized conv - ReLU - Conv(Kx1) - Conv(1xK) - BN - """ - - def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False), - nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class DilConv(nn.Module): - """ (Dilated) depthwise separable conv - ReLU - (Dilated) depthwise separable - Pointwise - BN - If dilation == 2, 3x3 conv => 5x5 receptive field - 5x5 conv => 9x9 receptive field - """ - - def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in, bias=False), - nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class SepConv(nn.Module): - """ Depthwise separable conv - DilConv(dilation=1) * 2 - """ - - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine), - DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class Identity(nn.Module): - - def forward(self, x): - return x - - -class Zero(nn.Module): - def __init__(self, stride): - super().__init__() - self.stride = stride - - def forward(self, x): - if self.stride == 1: - return x * 0. - - # re-sizing by stride - return x[:, :, ::self.stride, ::self.stride] * 0. - - -class FactorizedReduce(nn.Module): - """ - Reduce feature map size by factorized pointwise(stride=2). 
- """ - - def __init__(self, C_in, C_out, affine=True): - super().__init__() - self.relu = nn.ReLU() - self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) - - def forward(self, x): - x = self.relu(x) - out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1) - out = self.bn(out) - return out diff --git a/src/sdk/pynni/nni/nas/pytorch/modules.py b/src/sdk/pynni/nni/nas/pytorch/modules.py deleted file mode 100644 index 6570220e13..0000000000 --- a/src/sdk/pynni/nni/nas/pytorch/modules.py +++ /dev/null @@ -1,9 +0,0 @@ - -from torch import nn as nn - - -class RankedModule(nn.Module): - def __init__(self, rank=None, reduction=False): - super(RankedModule, self).__init__() - self.rank = rank - self.reduction = reduction diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py index 6e385b1170..62b39956a3 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py @@ -14,21 +14,35 @@ class PdartsMutator(DartsMutator): def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches=None): self.pdarts_epoch_index = pdarts_epoch_index self.pdarts_num_to_drop = pdarts_num_to_drop - self.switches = switches + if switches is None: + self.switches = {} + else: + self.switches = switches super(PdartsMutator, self).__init__(model) - def before_build(self, model): + def after_parse_search_space(self, mutable: LayerChoice): self.choices = nn.ParameterDict() - if self.switches is None: - self.switches = {} - def named_mutables(self, model): - key2module = dict() - for name, module in model.named_modules(): - if isinstance(module, LayerChoice): - key2module[module.key] = module - yield name, module, True + switches = self.switches.get( + mutable.key, [True for j in range(mutable.length)]) + + for _, mutable in self.named_mutables(): + if isinstance(mutable, LayerChoice): + + switches = self.switches.get( + mutable.key, [True for j in range(mutable.length)]) + + for index in range(len(switches)-1, -1, -1): + if switches[index] == False: + del(mutable.choices[index]) + mutable.length -= 1 + + self.switches[mutable.key] = switches + self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(len(mutable) + 1)) + + def on_calc_layer_choice_mask(self, mutable: LayerChoice): + return F.softmax(self.choices[mutable.key], dim=-1) def drop_paths(self): for key in self.switches: @@ -49,22 +63,6 @@ def drop_paths(self): switches[idxs[idx]] = False return self.switches - def on_init_layer_choice(self, mutable: LayerChoice): - switches = self.switches.get( - mutable.key, [True for j in range(mutable.length)]) - - for index in range(len(switches)-1, -1, -1): - if switches[index] == False: - del(mutable.choices[index]) - mutable.length -= 1 - - self.switches[mutable.key] = switches - - self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(mutable.length)) - - def on_calc_layer_choice_mask(self, mutable: LayerChoice): - return F.softmax(self.choices[mutable.key], dim=-1) - def get_min_k(self, input_in, k): index = [] for _ in range(k): diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index 4d9c231143..527fd9ed54 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -1,3 +1,4 @@ +from nni.nas.pytorch.callbacks 
import (ArchitectureCheckpoint, LearningRateScheduler) from nni.nas.pytorch.darts import DartsTrainer from nni.nas.pytorch.trainer import Trainer @@ -6,12 +7,12 @@ class PdartsTrainer(Trainer): - def __init__(self, model_creator, metrics, num_epochs, dataset_train, dataset_valid, - layers=5, n_nodes=4, pdarts_num_layers=[0, 6, 12], pdarts_num_to_drop=[3, 2, 2], + def __init__(self, layers, model_creator, metrics, + num_epochs, dataset_train, dataset_valid, + pdarts_num_layers=[0, 6, 12], pdarts_num_to_drop=[3, 2, 2], mutator=None, batch_size=64, workers=4, device=None, log_frequency=None): self.model_creator = model_creator self.layers = layers - self.n_nodes = n_nodes self.pdarts_num_layers = pdarts_num_layers self.pdarts_num_to_drop = pdarts_num_to_drop self.pdarts_epoch = len(pdarts_num_to_drop) @@ -28,18 +29,21 @@ def __init__(self, model_creator, metrics, num_epochs, dataset_train, dataset_va def train(self): layers = self.layers - n_nodes = self.n_nodes switches = None for epoch in range(self.pdarts_epoch): layers = self.layers+self.pdarts_num_layers[epoch] - model, loss, model_optim, _ = self.model_creator( - layers, n_nodes) + model, criterion, optim, lr_scheduler = self.model_creator( + layers) mutator = PdartsMutator( model, epoch, self.pdarts_num_to_drop, switches) - self.trainer = DartsTrainer(model, loss=loss, optimizer=model_optim, - mutator=mutator, **self.darts_parameters) + self.trainer = DartsTrainer(model, + loss=criterion, + optimizer=optim, + callbacks=[LearningRateScheduler( + lr_scheduler), ArchitectureCheckpoint("./checkpoints")], + **self.darts_parameters) print("start pdrats training %s..." % epoch) self.trainer.train() From 5b68e91723efa74b511e24c6eed524651a4164da Mon Sep 17 00:00:00 2001 From: squirrelsc Date: Wed, 20 Nov 2019 12:22:37 +0800 Subject: [PATCH 02/30] duplicate darts code to support pdarts --- docs/en_US/NAS/Overview.md | 6 +- examples/nas/pdarts/darts/datasets.py | 53 +++++++++ examples/nas/pdarts/darts/model.py | 151 ++++++++++++++++++++++++++ examples/nas/pdarts/darts/ops.py | 135 +++++++++++++++++++++++ examples/nas/pdarts/darts/utils.py | 18 +++ examples/nas/pdarts/datasets.py | 25 ----- examples/nas/pdarts/search.py | 7 +- 7 files changed, 364 insertions(+), 31 deletions(-) create mode 100644 examples/nas/pdarts/darts/datasets.py create mode 100644 examples/nas/pdarts/darts/model.py create mode 100644 examples/nas/pdarts/darts/ops.py create mode 100644 examples/nas/pdarts/darts/utils.py delete mode 100644 examples/nas/pdarts/datasets.py diff --git a/docs/en_US/NAS/Overview.md b/docs/en_US/NAS/Overview.md index c5aaeeb795..6c01685359 100644 --- a/docs/en_US/NAS/Overview.md +++ b/docs/en_US/NAS/Overview.md @@ -23,7 +23,7 @@ The main contribution of [DARTS: Differentiable Architecture Search][3] on algor git clone https://github.com/Microsoft/nni.git cd examples/nas/darts -python search.py +python3 search.py ``` ### P-DARTS @@ -36,8 +36,8 @@ python search.py ### In case NNI code is not cloned. 
git clone https://github.com/Microsoft/nni.git -cd examples/nas/pdarts -python search.py +cd examples/nas +python3 pdarts/search.py ``` ## Use NNI API diff --git a/examples/nas/pdarts/darts/datasets.py b/examples/nas/pdarts/darts/datasets.py new file mode 100644 index 0000000000..c5861f16d3 --- /dev/null +++ b/examples/nas/pdarts/darts/datasets.py @@ -0,0 +1,53 @@ +import numpy as np +import torch +from torchvision import transforms +from torchvision.datasets import CIFAR10 + + +class Cutout(object): + def __init__(self, length): + self.length = length + + def __call__(self, img): + h, w = img.size(1), img.size(2) + mask = np.ones((h, w), np.float32) + y = np.random.randint(h) + x = np.random.randint(w) + + y1 = np.clip(y - self.length // 2, 0, h) + y2 = np.clip(y + self.length // 2, 0, h) + x1 = np.clip(x - self.length // 2, 0, w) + x2 = np.clip(x + self.length // 2, 0, w) + + mask[y1: y2, x1: x2] = 0. + mask = torch.from_numpy(mask) + mask = mask.expand_as(img) + img *= mask + + return img + + +def get_dataset(cls, cutout_length=0): + MEAN = [0.49139968, 0.48215827, 0.44653124] + STD = [0.24703233, 0.24348505, 0.26158768] + transf = [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip() + ] + normalize = [ + transforms.ToTensor(), + transforms.Normalize(MEAN, STD) + ] + cutout = [] + if cutout_length > 0: + cutout.append(Cutout(cutout_length)) + + train_transform = transforms.Compose(transf + normalize + cutout) + valid_transform = transforms.Compose(normalize) + + if cls == "cifar10": + dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform) + dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform) + else: + raise NotImplementedError + return dataset_train, dataset_valid diff --git a/examples/nas/pdarts/darts/model.py b/examples/nas/pdarts/darts/model.py new file mode 100644 index 0000000000..166e262784 --- /dev/null +++ b/examples/nas/pdarts/darts/model.py @@ -0,0 +1,151 @@ +import torch +import torch.nn as nn + +from nni.nas.pytorch import darts, mutables + +#pylint: disable=relative-beyond-top-level +from . 
import ops + + +class AuxiliaryHead(nn.Module): + """ Auxiliary head in 2/3 place of network to let the gradient flow well """ + + def __init__(self, input_size, C, n_classes): + """ assuming input size 7x7 or 8x8 """ + assert input_size in [7, 8] + super().__init__() + self.net = nn.Sequential( + nn.ReLU(inplace=True), + nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out + nn.Conv2d(C, 128, kernel_size=1, bias=False), + nn.BatchNorm2d(128), + nn.ReLU(inplace=True), + nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out + nn.BatchNorm2d(768), + nn.ReLU(inplace=True) + ) + self.linear = nn.Linear(768, n_classes) + + def forward(self, x): + out = self.net(x) + out = out.view(out.size(0), -1) # flatten + logits = self.linear(out) + return logits + + +class Node(darts.DartsNode): + def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect, drop_path_prob=0.): + super().__init__(node_id, limitation=2) + self.ops = nn.ModuleList() + for i in range(num_prev_nodes): + stride = 2 if i < num_downsample_connect else 1 + self.ops.append( + mutables.LayerChoice( + [ + ops.PoolBN('max', channels, 3, stride, 1, affine=False), + ops.PoolBN('avg', channels, 3, stride, 1, affine=False), + nn.Identity() if stride == 1 else ops.FactorizedReduce(channels, channels, affine=False), + ops.SepConv(channels, channels, 3, stride, 1, affine=False), + ops.SepConv(channels, channels, 5, stride, 2, affine=False), + ops.DilConv(channels, channels, 3, stride, 2, 2, affine=False), + ops.DilConv(channels, channels, 5, stride, 4, 2, affine=False), + ], + key="{}_p{}".format(node_id, i))) + self.drop_path = ops.DropPath_(drop_path_prob) + + def forward(self, prev_nodes): + assert len(self.ops) == len(prev_nodes) + out = [op(node) for op, node in zip(self.ops, prev_nodes)] + return sum(self.drop_path(o) for o in out if o is not None) + + +class Cell(nn.Module): + + def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction, drop_path_prob=0.): + super().__init__() + self.reduction = reduction + self.n_nodes = n_nodes + + # If previous cell is reduction cell, current input size does not match with + # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing. + if reduction_p: + self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False) + else: + self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False) + self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False) + + # generate dag + self.mutable_ops = nn.ModuleList() + for depth in range(self.n_nodes): + self.mutable_ops.append(Node("r{:d}_n{}".format(reduction, depth), + depth + 2, channels, 2 if reduction else 0, + drop_path_prob=drop_path_prob)) + + def forward(self, s0, s1): + # s0, s1 are the outputs of previous previous cell and previous cell, respectively. 
+ tensors = [self.preproc0(s0), self.preproc1(s1)] + for node in self.mutable_ops: + cur_tensor = node(tensors) + tensors.append(cur_tensor) + + output = torch.cat(tensors[2:], dim=1) + return output + + +class CNN(nn.Module): + + def __init__(self, input_size, in_channels, channels, n_classes, n_layers, n_nodes=4, + stem_multiplier=3, auxiliary=False, drop_path_prob=0.): + super().__init__() + self.in_channels = in_channels + self.channels = channels + self.n_classes = n_classes + self.n_layers = n_layers + self.aux_pos = 2 * n_layers // 3 if auxiliary else -1 + + c_cur = stem_multiplier * self.channels + self.stem = nn.Sequential( + nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False), + nn.BatchNorm2d(c_cur) + ) + + # for the first cell, stem is used for both s0 and s1 + # [!] channels_pp and channels_p is output channel size, but c_cur is input channel size. + channels_pp, channels_p, c_cur = c_cur, c_cur, channels + + self.cells = nn.ModuleList() + reduction_p, reduction = False, False + for i in range(n_layers): + reduction_p, reduction = reduction, False + # Reduce featuremap size and double channels in 1/3 and 2/3 layer. + if i in [n_layers // 3, 2 * n_layers // 3]: + c_cur *= 2 + reduction = True + + cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction, drop_path_prob=drop_path_prob) + self.cells.append(cell) + c_cur_out = c_cur * n_nodes + channels_pp, channels_p = channels_p, c_cur_out + + if i == self.aux_pos: + self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes) + + self.gap = nn.AdaptiveAvgPool2d(1) + self.linear = nn.Linear(channels_p, n_classes) + + def forward(self, x): + s0 = s1 = self.stem(x) + + aux_logits = None + for i, cell in enumerate(self.cells): + s0, s1 = s1, cell(s0, s1) + if i == self.aux_pos and self.training: + aux_logits = self.aux_head(s1) + + out = self.gap(s1) + out = out.view(out.size(0), -1) # flatten + logits = self.linear(out) + + if aux_logits is not None: + return logits, aux_logits + return logits diff --git a/examples/nas/pdarts/darts/ops.py b/examples/nas/pdarts/darts/ops.py new file mode 100644 index 0000000000..2fef9fec19 --- /dev/null +++ b/examples/nas/pdarts/darts/ops.py @@ -0,0 +1,135 @@ +import torch +import torch.nn as nn + + +class DropPath_(nn.Module): + def __init__(self, p=0.): + """ [!] DropPath is inplace module + Args: + p: probability of an path to be zeroed. + """ + super().__init__() + self.p = p + + def extra_repr(self): + return 'p={}, inplace'.format(self.p) + + def forward(self, x): + if self.training and self.p > 0.: + keep_prob = 1. 
- self.p + # per data point mask + mask = torch.zeros((x.size(0), 1, 1, 1), device=x.device).bernoulli_(keep_prob) + x.div_(keep_prob).mul_(mask) + + return x + + +class PoolBN(nn.Module): + """ + AvgPool or MaxPool - BN + """ + def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True): + """ + Args: + pool_type: 'max' or 'avg' + """ + super().__init__() + if pool_type.lower() == 'max': + self.pool = nn.MaxPool2d(kernel_size, stride, padding) + elif pool_type.lower() == 'avg': + self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False) + else: + raise ValueError() + + self.bn = nn.BatchNorm2d(C, affine=affine) + + def forward(self, x): + out = self.pool(x) + out = self.bn(out) + return out + + +class StdConv(nn.Module): + """ Standard conv + ReLU - Conv - BN + """ + def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): + super().__init__() + self.net = nn.Sequential( + nn.ReLU(), + nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False), + nn.BatchNorm2d(C_out, affine=affine) + ) + + def forward(self, x): + return self.net(x) + + +class FacConv(nn.Module): + """ Factorized conv + ReLU - Conv(Kx1) - Conv(1xK) - BN + """ + def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True): + super().__init__() + self.net = nn.Sequential( + nn.ReLU(), + nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False), + nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False), + nn.BatchNorm2d(C_out, affine=affine) + ) + + def forward(self, x): + return self.net(x) + + +class DilConv(nn.Module): + """ (Dilated) depthwise separable conv + ReLU - (Dilated) depthwise separable - Pointwise - BN + If dilation == 2, 3x3 conv => 5x5 receptive field + 5x5 conv => 9x9 receptive field + """ + def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True): + super().__init__() + self.net = nn.Sequential( + nn.ReLU(), + nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in, + bias=False), + nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(C_out, affine=affine) + ) + + def forward(self, x): + return self.net(x) + + +class SepConv(nn.Module): + """ Depthwise separable conv + DilConv(dilation=1) * 2 + """ + def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): + super().__init__() + self.net = nn.Sequential( + DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine), + DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine) + ) + + def forward(self, x): + return self.net(x) + + +class FactorizedReduce(nn.Module): + """ + Reduce feature map size by factorized pointwise(stride=2). 
+ """ + def __init__(self, C_in, C_out, affine=True): + super().__init__() + self.relu = nn.ReLU() + self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) + self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) + self.bn = nn.BatchNorm2d(C_out, affine=affine) + + def forward(self, x): + x = self.relu(x) + out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1) + out = self.bn(out) + return out diff --git a/examples/nas/pdarts/darts/utils.py b/examples/nas/pdarts/darts/utils.py new file mode 100644 index 0000000000..2aac457ad1 --- /dev/null +++ b/examples/nas/pdarts/darts/utils.py @@ -0,0 +1,18 @@ +def accuracy(output, target, topk=(1,)): + """ Computes the precision@k for the specified values of k """ + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + # one-hot case + if target.ndimension() > 1: + target = target.max(1)[1] + + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = dict() + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item() + return res \ No newline at end of file diff --git a/examples/nas/pdarts/datasets.py b/examples/nas/pdarts/datasets.py deleted file mode 100644 index 8fe0ab0fbf..0000000000 --- a/examples/nas/pdarts/datasets.py +++ /dev/null @@ -1,25 +0,0 @@ -from torchvision import transforms -from torchvision.datasets import CIFAR10 - - -def get_dataset(cls): - MEAN = [0.49139968, 0.48215827, 0.44653124] - STD = [0.24703233, 0.24348505, 0.26158768] - transf = [ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip() - ] - normalize = [ - transforms.ToTensor(), - transforms.Normalize(MEAN, STD) - ] - - train_transform = transforms.Compose(transf + normalize) - valid_transform = transforms.Compose(normalize) - - if cls == "cifar10": - dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform) - dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform) - else: - raise NotImplementedError - return dataset_train, dataset_valid diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index c186e4cd86..4ba100f97f 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -1,12 +1,13 @@ from argparse import ArgumentParser -import datasets import torch import torch.nn as nn + import nni.nas.pytorch as nas from nni.nas.pytorch.pdarts import PdartsTrainer -# pylint: disable=relative-beyond-top-level -from ..darts.model import CNN + +from darts.model import CNN +from darts import datasets def accuracy(output, target, topk=(1,)): From 9f1fb8a62ebc92b4441db57e2dfa1269f671ab0e Mon Sep 17 00:00:00 2001 From: squirrelsc Date: Wed, 20 Nov 2019 12:30:07 +0800 Subject: [PATCH 03/30] add abstract methods for pdarts --- src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index 527fd9ed54..a9c584bbac 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -44,7 +44,7 @@ def train(self): callbacks=[LearningRateScheduler( lr_scheduler), ArchitectureCheckpoint("./checkpoints")], **self.darts_parameters) - print("start pdrats training %s..." % epoch) + print("start pdarts training %s..." 
% epoch) self.trainer.train() @@ -53,6 +53,12 @@ def train(self): switches = mutator.drop_paths() + def train_one_epoch(self, epoch): + self.trainer.train_one_epoch(epoch) + + def validate_one_epoch(self, epoch): + self.trainer.train_one_epoch(epoch) + def export(self): if (self.trainer is not None) and hasattr(self.trainer, "export"): self.trainer.export() From 3c8e95e972f283525bdde992c83b5ebc9f472c6c Mon Sep 17 00:00:00 2001 From: squirrelsc Date: Wed, 20 Nov 2019 12:40:17 +0800 Subject: [PATCH 04/30] fix bug --- src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index a9c584bbac..ce9a2c4db3 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -7,7 +7,7 @@ class PdartsTrainer(Trainer): - def __init__(self, layers, model_creator, metrics, + def __init__(self, model_creator, layers, metrics, num_epochs, dataset_train, dataset_valid, pdarts_num_layers=[0, 6, 12], pdarts_num_to_drop=[3, 2, 2], mutator=None, batch_size=64, workers=4, device=None, log_frequency=None): From 0676ba0706a692d68cff619cfa9f10b8bc5823de Mon Sep 17 00:00:00 2001 From: squirrelsc Date: Wed, 20 Nov 2019 12:50:03 +0800 Subject: [PATCH 05/30] fix base trainer --- .../pynni/nni/nas/pytorch/pdarts/trainer.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index ce9a2c4db3..ab9d7f4519 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -1,16 +1,17 @@ from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint, LearningRateScheduler) from nni.nas.pytorch.darts import DartsTrainer -from nni.nas.pytorch.trainer import Trainer +from nni.nas.pytorch.trainer import BaseTrainer from .mutator import PdartsMutator -class PdartsTrainer(Trainer): +class PdartsTrainer(BaseTrainer): def __init__(self, model_creator, layers, metrics, num_epochs, dataset_train, dataset_valid, pdarts_num_layers=[0, 6, 12], pdarts_num_to_drop=[3, 2, 2], mutator=None, batch_size=64, workers=4, device=None, log_frequency=None): + super(PdartsTrainer, self).__init__() self.model_creator = model_creator self.layers = layers self.pdarts_num_layers = pdarts_num_layers @@ -53,12 +54,9 @@ def train(self): switches = mutator.drop_paths() - def train_one_epoch(self, epoch): - self.trainer.train_one_epoch(epoch) + def validate(self): + self.trainer.validate() - def validate_one_epoch(self, epoch): - self.trainer.train_one_epoch(epoch) - - def export(self): - if (self.trainer is not None) and hasattr(self.trainer, "export"): - self.trainer.export() + def train_and_validate(self): + self.train() + self.validate() From 63ed173cca182a8a89325fd57b76eb6267f715b3 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 20 Nov 2019 12:55:27 +0800 Subject: [PATCH 06/30] fix bug on mutator --- src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py index 62b39956a3..4a8f969875 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py @@ -21,12 +21,9 @@ def __init__(self, model, pdarts_epoch_index, 
pdarts_num_to_drop, switches=None) super(PdartsMutator, self).__init__(model) - def after_parse_search_space(self, mutable: LayerChoice): + def after_parse_search_space(self): self.choices = nn.ParameterDict() - switches = self.switches.get( - mutable.key, [True for j in range(mutable.length)]) - for _, mutable in self.named_mutables(): if isinstance(mutable, LayerChoice): From 58b913ba3b11b3f072ed2b50ac374a285b5fcba2 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 20 Nov 2019 12:58:46 +0800 Subject: [PATCH 07/30] try to improve performance --- src/sdk/pynni/nni/nas/pytorch/darts/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py index 464832eadf..9f35affeb4 100644 --- a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py @@ -38,8 +38,8 @@ def train_one_epoch(self, epoch): lr = self.optimizer.param_groups[0]["lr"] meters = AverageMeterGroup() for step, ((trn_X, trn_y), (val_X, val_y)) in enumerate(zip(self.train_loader, self.valid_loader)): - trn_X, trn_y = trn_X.to(self.device), trn_y.to(self.device) - val_X, val_y = val_X.to(self.device), val_y.to(self.device) + trn_X, trn_y = trn_X.to(self.device, non_blocking=True), trn_y.to(self.device, non_blocking=True) + val_X, val_y = val_X.to(self.device, non_blocking=True), val_y.to(self.device, non_blocking=True) # backup model for hessian backup_model = copy.deepcopy(self.model.state_dict()) From a1646a585370f76fd32e2e3b74ca2bcdba4e0826 Mon Sep 17 00:00:00 2001 From: squirrelsc Date: Wed, 20 Nov 2019 13:23:19 +0800 Subject: [PATCH 08/30] optimize code to reduce duplicated files. --- examples/nas/pdarts/darts/datasets.py | 53 ------ examples/nas/pdarts/darts/model.py | 151 ------------------ examples/nas/pdarts/darts/ops.py | 135 ---------------- examples/nas/pdarts/darts/utils.py | 18 --- examples/nas/pdarts/search.py | 11 +- .../pynni/nni/nas/pytorch/darts/trainer.py | 4 +- 6 files changed, 8 insertions(+), 364 deletions(-) delete mode 100644 examples/nas/pdarts/darts/datasets.py delete mode 100644 examples/nas/pdarts/darts/model.py delete mode 100644 examples/nas/pdarts/darts/ops.py delete mode 100644 examples/nas/pdarts/darts/utils.py diff --git a/examples/nas/pdarts/darts/datasets.py b/examples/nas/pdarts/darts/datasets.py deleted file mode 100644 index c5861f16d3..0000000000 --- a/examples/nas/pdarts/darts/datasets.py +++ /dev/null @@ -1,53 +0,0 @@ -import numpy as np -import torch -from torchvision import transforms -from torchvision.datasets import CIFAR10 - - -class Cutout(object): - def __init__(self, length): - self.length = length - - def __call__(self, img): - h, w = img.size(1), img.size(2) - mask = np.ones((h, w), np.float32) - y = np.random.randint(h) - x = np.random.randint(w) - - y1 = np.clip(y - self.length // 2, 0, h) - y2 = np.clip(y + self.length // 2, 0, h) - x1 = np.clip(x - self.length // 2, 0, w) - x2 = np.clip(x + self.length // 2, 0, w) - - mask[y1: y2, x1: x2] = 0. 
- mask = torch.from_numpy(mask) - mask = mask.expand_as(img) - img *= mask - - return img - - -def get_dataset(cls, cutout_length=0): - MEAN = [0.49139968, 0.48215827, 0.44653124] - STD = [0.24703233, 0.24348505, 0.26158768] - transf = [ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip() - ] - normalize = [ - transforms.ToTensor(), - transforms.Normalize(MEAN, STD) - ] - cutout = [] - if cutout_length > 0: - cutout.append(Cutout(cutout_length)) - - train_transform = transforms.Compose(transf + normalize + cutout) - valid_transform = transforms.Compose(normalize) - - if cls == "cifar10": - dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform) - dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform) - else: - raise NotImplementedError - return dataset_train, dataset_valid diff --git a/examples/nas/pdarts/darts/model.py b/examples/nas/pdarts/darts/model.py deleted file mode 100644 index 166e262784..0000000000 --- a/examples/nas/pdarts/darts/model.py +++ /dev/null @@ -1,151 +0,0 @@ -import torch -import torch.nn as nn - -from nni.nas.pytorch import darts, mutables - -#pylint: disable=relative-beyond-top-level -from . import ops - - -class AuxiliaryHead(nn.Module): - """ Auxiliary head in 2/3 place of network to let the gradient flow well """ - - def __init__(self, input_size, C, n_classes): - """ assuming input size 7x7 or 8x8 """ - assert input_size in [7, 8] - super().__init__() - self.net = nn.Sequential( - nn.ReLU(inplace=True), - nn.AvgPool2d(5, stride=input_size - 5, padding=0, count_include_pad=False), # 2x2 out - nn.Conv2d(C, 128, kernel_size=1, bias=False), - nn.BatchNorm2d(128), - nn.ReLU(inplace=True), - nn.Conv2d(128, 768, kernel_size=2, bias=False), # 1x1 out - nn.BatchNorm2d(768), - nn.ReLU(inplace=True) - ) - self.linear = nn.Linear(768, n_classes) - - def forward(self, x): - out = self.net(x) - out = out.view(out.size(0), -1) # flatten - logits = self.linear(out) - return logits - - -class Node(darts.DartsNode): - def __init__(self, node_id, num_prev_nodes, channels, num_downsample_connect, drop_path_prob=0.): - super().__init__(node_id, limitation=2) - self.ops = nn.ModuleList() - for i in range(num_prev_nodes): - stride = 2 if i < num_downsample_connect else 1 - self.ops.append( - mutables.LayerChoice( - [ - ops.PoolBN('max', channels, 3, stride, 1, affine=False), - ops.PoolBN('avg', channels, 3, stride, 1, affine=False), - nn.Identity() if stride == 1 else ops.FactorizedReduce(channels, channels, affine=False), - ops.SepConv(channels, channels, 3, stride, 1, affine=False), - ops.SepConv(channels, channels, 5, stride, 2, affine=False), - ops.DilConv(channels, channels, 3, stride, 2, 2, affine=False), - ops.DilConv(channels, channels, 5, stride, 4, 2, affine=False), - ], - key="{}_p{}".format(node_id, i))) - self.drop_path = ops.DropPath_(drop_path_prob) - - def forward(self, prev_nodes): - assert len(self.ops) == len(prev_nodes) - out = [op(node) for op, node in zip(self.ops, prev_nodes)] - return sum(self.drop_path(o) for o in out if o is not None) - - -class Cell(nn.Module): - - def __init__(self, n_nodes, channels_pp, channels_p, channels, reduction_p, reduction, drop_path_prob=0.): - super().__init__() - self.reduction = reduction - self.n_nodes = n_nodes - - # If previous cell is reduction cell, current input size does not match with - # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing. 
- if reduction_p: - self.preproc0 = ops.FactorizedReduce(channels_pp, channels, affine=False) - else: - self.preproc0 = ops.StdConv(channels_pp, channels, 1, 1, 0, affine=False) - self.preproc1 = ops.StdConv(channels_p, channels, 1, 1, 0, affine=False) - - # generate dag - self.mutable_ops = nn.ModuleList() - for depth in range(self.n_nodes): - self.mutable_ops.append(Node("r{:d}_n{}".format(reduction, depth), - depth + 2, channels, 2 if reduction else 0, - drop_path_prob=drop_path_prob)) - - def forward(self, s0, s1): - # s0, s1 are the outputs of previous previous cell and previous cell, respectively. - tensors = [self.preproc0(s0), self.preproc1(s1)] - for node in self.mutable_ops: - cur_tensor = node(tensors) - tensors.append(cur_tensor) - - output = torch.cat(tensors[2:], dim=1) - return output - - -class CNN(nn.Module): - - def __init__(self, input_size, in_channels, channels, n_classes, n_layers, n_nodes=4, - stem_multiplier=3, auxiliary=False, drop_path_prob=0.): - super().__init__() - self.in_channels = in_channels - self.channels = channels - self.n_classes = n_classes - self.n_layers = n_layers - self.aux_pos = 2 * n_layers // 3 if auxiliary else -1 - - c_cur = stem_multiplier * self.channels - self.stem = nn.Sequential( - nn.Conv2d(in_channels, c_cur, 3, 1, 1, bias=False), - nn.BatchNorm2d(c_cur) - ) - - # for the first cell, stem is used for both s0 and s1 - # [!] channels_pp and channels_p is output channel size, but c_cur is input channel size. - channels_pp, channels_p, c_cur = c_cur, c_cur, channels - - self.cells = nn.ModuleList() - reduction_p, reduction = False, False - for i in range(n_layers): - reduction_p, reduction = reduction, False - # Reduce featuremap size and double channels in 1/3 and 2/3 layer. - if i in [n_layers // 3, 2 * n_layers // 3]: - c_cur *= 2 - reduction = True - - cell = Cell(n_nodes, channels_pp, channels_p, c_cur, reduction_p, reduction, drop_path_prob=drop_path_prob) - self.cells.append(cell) - c_cur_out = c_cur * n_nodes - channels_pp, channels_p = channels_p, c_cur_out - - if i == self.aux_pos: - self.aux_head = AuxiliaryHead(input_size // 4, channels_p, n_classes) - - self.gap = nn.AdaptiveAvgPool2d(1) - self.linear = nn.Linear(channels_p, n_classes) - - def forward(self, x): - s0 = s1 = self.stem(x) - - aux_logits = None - for i, cell in enumerate(self.cells): - s0, s1 = s1, cell(s0, s1) - if i == self.aux_pos and self.training: - aux_logits = self.aux_head(s1) - - out = self.gap(s1) - out = out.view(out.size(0), -1) # flatten - logits = self.linear(out) - - if aux_logits is not None: - return logits, aux_logits - return logits diff --git a/examples/nas/pdarts/darts/ops.py b/examples/nas/pdarts/darts/ops.py deleted file mode 100644 index 2fef9fec19..0000000000 --- a/examples/nas/pdarts/darts/ops.py +++ /dev/null @@ -1,135 +0,0 @@ -import torch -import torch.nn as nn - - -class DropPath_(nn.Module): - def __init__(self, p=0.): - """ [!] DropPath is inplace module - Args: - p: probability of an path to be zeroed. - """ - super().__init__() - self.p = p - - def extra_repr(self): - return 'p={}, inplace'.format(self.p) - - def forward(self, x): - if self.training and self.p > 0.: - keep_prob = 1. 
- self.p - # per data point mask - mask = torch.zeros((x.size(0), 1, 1, 1), device=x.device).bernoulli_(keep_prob) - x.div_(keep_prob).mul_(mask) - - return x - - -class PoolBN(nn.Module): - """ - AvgPool or MaxPool - BN - """ - def __init__(self, pool_type, C, kernel_size, stride, padding, affine=True): - """ - Args: - pool_type: 'max' or 'avg' - """ - super().__init__() - if pool_type.lower() == 'max': - self.pool = nn.MaxPool2d(kernel_size, stride, padding) - elif pool_type.lower() == 'avg': - self.pool = nn.AvgPool2d(kernel_size, stride, padding, count_include_pad=False) - else: - raise ValueError() - - self.bn = nn.BatchNorm2d(C, affine=affine) - - def forward(self, x): - out = self.pool(x) - out = self.bn(out) - return out - - -class StdConv(nn.Module): - """ Standard conv - ReLU - Conv - BN - """ - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_out, kernel_size, stride, padding, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class FacConv(nn.Module): - """ Factorized conv - ReLU - Conv(Kx1) - Conv(1xK) - BN - """ - def __init__(self, C_in, C_out, kernel_length, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_in, (kernel_length, 1), stride, padding, bias=False), - nn.Conv2d(C_in, C_out, (1, kernel_length), stride, padding, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class DilConv(nn.Module): - """ (Dilated) depthwise separable conv - ReLU - (Dilated) depthwise separable - Pointwise - BN - If dilation == 2, 3x3 conv => 5x5 receptive field - 5x5 conv => 9x9 receptive field - """ - def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True): - super().__init__() - self.net = nn.Sequential( - nn.ReLU(), - nn.Conv2d(C_in, C_in, kernel_size, stride, padding, dilation=dilation, groups=C_in, - bias=False), - nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class SepConv(nn.Module): - """ Depthwise separable conv - DilConv(dilation=1) * 2 - """ - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): - super().__init__() - self.net = nn.Sequential( - DilConv(C_in, C_in, kernel_size, stride, padding, dilation=1, affine=affine), - DilConv(C_in, C_out, kernel_size, 1, padding, dilation=1, affine=affine) - ) - - def forward(self, x): - return self.net(x) - - -class FactorizedReduce(nn.Module): - """ - Reduce feature map size by factorized pointwise(stride=2). 
- """ - def __init__(self, C_in, C_out, affine=True): - super().__init__() - self.relu = nn.ReLU() - self.conv1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.conv2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) - - def forward(self, x): - x = self.relu(x) - out = torch.cat([self.conv1(x), self.conv2(x[:, :, 1:, 1:])], dim=1) - out = self.bn(out) - return out diff --git a/examples/nas/pdarts/darts/utils.py b/examples/nas/pdarts/darts/utils.py deleted file mode 100644 index 2aac457ad1..0000000000 --- a/examples/nas/pdarts/darts/utils.py +++ /dev/null @@ -1,18 +0,0 @@ -def accuracy(output, target, topk=(1,)): - """ Computes the precision@k for the specified values of k """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - # one-hot case - if target.ndimension() > 1: - target = target.max(1)[1] - - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = dict() - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0) - res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item() - return res \ No newline at end of file diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index 4ba100f97f..ce0d7ddca5 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -1,13 +1,14 @@ +from nni.nas.pytorch.pdarts import PdartsTrainer +import nni.nas.pytorch as nas +import sys from argparse import ArgumentParser import torch import torch.nn as nn -import nni.nas.pytorch as nas -from nni.nas.pytorch.pdarts import PdartsTrainer - -from darts.model import CNN -from darts import datasets +sys.path.append('../darts') +import datasets +from model import CNN def accuracy(output, target, topk=(1,)): diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py index 9f35affeb4..464832eadf 100644 --- a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py @@ -38,8 +38,8 @@ def train_one_epoch(self, epoch): lr = self.optimizer.param_groups[0]["lr"] meters = AverageMeterGroup() for step, ((trn_X, trn_y), (val_X, val_y)) in enumerate(zip(self.train_loader, self.valid_loader)): - trn_X, trn_y = trn_X.to(self.device, non_blocking=True), trn_y.to(self.device, non_blocking=True) - val_X, val_y = val_X.to(self.device, non_blocking=True), val_y.to(self.device, non_blocking=True) + trn_X, trn_y = trn_X.to(self.device), trn_y.to(self.device) + val_X, val_y = val_X.to(self.device), val_y.to(self.device) # backup model for hessian backup_model = copy.deepcopy(self.model.state_dict()) From ccd2369fadc7cec8b4201d8dafe32bce1315cfae Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 20 Nov 2019 14:09:42 +0800 Subject: [PATCH 09/30] update document path --- docs/en_US/NAS/Overview.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en_US/NAS/Overview.md b/docs/en_US/NAS/Overview.md index 6c01685359..8dcaf33ab8 100644 --- a/docs/en_US/NAS/Overview.md +++ b/docs/en_US/NAS/Overview.md @@ -36,8 +36,8 @@ python3 search.py ### In case NNI code is not cloned. 
git clone https://github.com/Microsoft/nni.git -cd examples/nas -python3 pdarts/search.py +cd examples/nas/pdarts +python3 search.py ``` ## Use NNI API From e402dd8c921c9be5718323e481c37344c46f9049 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 20 Nov 2019 14:11:29 +0800 Subject: [PATCH 10/30] update format --- src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index ab9d7f4519..78d626fbfb 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -34,17 +34,12 @@ def train(self): for epoch in range(self.pdarts_epoch): layers = self.layers+self.pdarts_num_layers[epoch] - model, criterion, optim, lr_scheduler = self.model_creator( - layers) - mutator = PdartsMutator( - model, epoch, self.pdarts_num_to_drop, switches) - - self.trainer = DartsTrainer(model, - loss=criterion, - optimizer=optim, - callbacks=[LearningRateScheduler( - lr_scheduler), ArchitectureCheckpoint("./checkpoints")], - **self.darts_parameters) + model, criterion, optim, lr_scheduler = self.model_creator(layers) + + mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches) + + self.trainer = DartsTrainer(model, loss=criterion, optimizer=optim, callbacks=[LearningRateScheduler( + lr_scheduler), ArchitectureCheckpoint("./checkpoints")], **self.darts_parameters) print("start pdarts training %s..." % epoch) self.trainer.train() From 49e0aa5f4375be8ccccaad5fb3ab95ace38af732 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 20 Nov 2019 14:25:48 +0800 Subject: [PATCH 11/30] update code to get validate run every time. --- src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index 78d626fbfb..c605300f67 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -43,6 +43,7 @@ def train(self): print("start pdarts training %s..." % epoch) self.trainer.train() + self.trainer.validate() # with open('log/parameters_%d.txt' % epoch, "w") as f: # f.write(str(model.parameters)) @@ -50,8 +51,8 @@ def train(self): switches = mutator.drop_paths() def validate(self): - self.trainer.validate() + # pdarts validate after train, it doesn't support separated validation progress. + pass def train_and_validate(self): self.train() - self.validate() From 12b3c61e4d8400374032760d730dfbc3d73ab0c6 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 20 Nov 2019 17:03:29 +0800 Subject: [PATCH 12/30] change urls to official ones. --- docs/en_US/NAS/Overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en_US/NAS/Overview.md b/docs/en_US/NAS/Overview.md index 8dcaf33ab8..7023d9b8e9 100644 --- a/docs/en_US/NAS/Overview.md +++ b/docs/en_US/NAS/Overview.md @@ -53,7 +53,7 @@ The programming interface of designing and searching a model is often demanded i 1. When designing a neural network, there may be multiple operation choices on a layer, sub-model, or connection, and it's undetermined which one or combination performs best. So, it needs an easy way to express the candidate layers or sub-models. 2. 
When applying NAS on a neural network, it needs an unified way to express the search space of architectures, so that it doesn't need to update trial code for different searching algorithms. -NNI proposed API is [here](https://github.com/microsoft/nni/tree/dev-nas-refactor/src/sdk/pynni/nni/nas/pytorch). And [here](https://github.com/microsoft/nni/tree/dev-nas-refactor/examples/nas/darts) is an example of NAS implementation, which bases on NNI proposed interface. +NNI proposed API is [here](https://github.com/microsoft/nni/tree/master/src/sdk/pynni/nni/nas/pytorch). And [here](https://github.com/microsoft/nni/tree/master/examples/nas/darts) is an example of NAS implementation, which bases on NNI proposed interface. [1]: https://arxiv.org/abs/1802.03268 [2]: https://arxiv.org/abs/1707.07012 From c8c466fed03d8c7596d56ac79770df2f709c481d Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 20 Nov 2019 17:37:59 +0800 Subject: [PATCH 13/30] add header and simplify code --- examples/nas/pdarts/search.py | 3 +++ src/sdk/pynni/nni/nas/pytorch/pdarts/__init__.py | 3 +++ src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py | 10 +++++----- src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 3 +++ 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index ce0d7ddca5..d76dbbf02c 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + from nni.nas.pytorch.pdarts import PdartsTrainer import nni.nas.pytorch as nas import sys diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/__init__.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/__init__.py index 27dd912ab3..d1d17764ba 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/__init__.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/__init__.py @@ -1 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + from .trainer import PdartsTrainer diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py index 4a8f969875..9c39bd32d5 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import copy import numpy as np @@ -11,13 +14,10 @@ class PdartsMutator(DartsMutator): - def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches=None): + def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches={}): self.pdarts_epoch_index = pdarts_epoch_index self.pdarts_num_to_drop = pdarts_num_to_drop - if switches is None: - self.switches = {} - else: - self.switches = switches + self.switches = switches super(PdartsMutator, self).__init__(model) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index c605300f67..28a437c388 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint, LearningRateScheduler) from nni.nas.pytorch.darts import DartsTrainer from nni.nas.pytorch.trainer import BaseTrainer From 81c5070755136579539e7341238b4129739a00cb Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 20 Nov 2019 18:05:57 +0800 Subject: [PATCH 14/30] fix bug that may get None --- src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py index 9c39bd32d5..0100419172 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py @@ -17,7 +17,10 @@ class PdartsMutator(DartsMutator): def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches={}): self.pdarts_epoch_index = pdarts_epoch_index self.pdarts_num_to_drop = pdarts_num_to_drop - self.switches = switches + if switches is None: + self.switches = {} + else: + self.switches = switches super(PdartsMutator, self).__init__(model) From b821b1274420f8a49d7e40c1668c368f936ea863 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 15:54:01 +0800 Subject: [PATCH 15/30] update code for new refactoring --- examples/nas/pdarts/search.py | 26 +++---------------- .../pynni/nni/nas/pytorch/pdarts/mutator.py | 16 +++--------- .../pynni/nni/nas/pytorch/pdarts/trainer.py | 20 +++++++------- 3 files changed, 16 insertions(+), 46 deletions(-) diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index d76dbbf02c..9aab19b5d2 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -12,27 +12,7 @@ sys.path.append('../darts') import datasets from model import CNN - - -def accuracy(output, target, topk=(1,)): - """ Computes the precision@k for the specified values of k """ - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - # one-hot case - if target.ndimension() > 1: - target = target.max(1)[1] - - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = dict() - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0) - res["acc{}".format(k)] = correct_k.mul_(1.0 / batch_size).item() - return res - +from utils import accuracy if __name__ == "__main__": parser = ArgumentParser("pdarts") @@ -40,7 +20,7 @@ def accuracy(output, target, topk=(1,)): default=[0, 6, 12], help='add layers') parser.add_argument("--nodes", default=4, type=int) parser.add_argument("--layers", default=5, type=int) - parser.add_argument("--batch-size", default=128, type=int) + parser.add_argument("--batch-size", default=64, type=int) parser.add_argument("--log-frequency", default=1, type=int) parser.add_argument("--epochs", default=50, type=int) args = parser.parse_args() @@ -66,4 +46,4 @@ def model_creator(layers): dataset_valid=dataset_valid, batch_size=args.batch_size, log_frequency=args.log_frequency) - trainer.train_and_validate() + trainer.train() diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py index ee952557ad..68c6f1dde5 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py @@ -14,7 +14,7 @@ class PdartsMutator(DartsMutator): - def __init__(self, pdarts_epoch_index, pdarts_num_to_drop, switches={}): + def __init__(self, model, 
pdarts_epoch_index, pdarts_num_to_drop, switches={}): self.pdarts_epoch_index = pdarts_epoch_index self.pdarts_num_to_drop = pdarts_num_to_drop if switches is None: @@ -22,16 +22,12 @@ def __init__(self, pdarts_epoch_index, pdarts_num_to_drop, switches={}): else: self.switches = switches - super(PdartsMutator, self).__init__() + super(PdartsMutator, self).__init__(model) - def before_build(self): - self.choices = nn.ParameterDict() - - for _, mutable in self.named_mutables(): + for mutable in self.mutables: if isinstance(mutable, LayerChoice): - switches = self.switches.get( - mutable.key, [True for j in range(mutable.length)]) + switches = self.switches.get(mutable.key, [True for j in range(mutable.length)]) for index in range(len(switches)-1, -1, -1): if switches[index] == False: @@ -39,10 +35,6 @@ def before_build(self): mutable.length -= 1 self.switches[mutable.key] = switches - self.choices[mutable.key] = nn.Parameter(1.0E-3 * torch.randn(len(mutable) + 1)) - - def on_calc_layer_choice_mask(self, mutable: LayerChoice): - return F.softmax(self.choices[mutable.key], dim=-1) def drop_paths(self): for key in self.switches: diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index 28a437c388..816239ffb5 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -39,23 +39,21 @@ def train(self): layers = self.layers+self.pdarts_num_layers[epoch] model, criterion, optim, lr_scheduler = self.model_creator(layers) - mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches) + self.mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches) - self.trainer = DartsTrainer(model, loss=criterion, optimizer=optim, callbacks=[LearningRateScheduler( + self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, optimizer=optim, callbacks=[LearningRateScheduler( lr_scheduler), ArchitectureCheckpoint("./checkpoints")], **self.darts_parameters) print("start pdarts training %s..." % epoch) self.trainer.train() - self.trainer.validate() - # with open('log/parameters_%d.txt' % epoch, "w") as f: - # f.write(str(model.parameters)) - - switches = mutator.drop_paths() + switches = self.mutator.drop_paths() def validate(self): - # pdarts validate after train, it doesn't support separated validation progress. 
- pass + self.model.validate() + + def export(self): + self.mutator.export() - def train_and_validate(self): - self.train() + def checkpoint(self): + raise NotImplementedError("Not implemented yet") From e8c064634d35eaebea4c8fc5807b3f493943762f Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 16:35:51 +0800 Subject: [PATCH 16/30] fix call backs --- examples/nas/pdarts/search.py | 11 ++++++----- .../pynni/nni/nas/pytorch/pdarts/trainer.py | 19 ++++++++++++++++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index 9aab19b5d2..1e9cb3acd6 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -9,10 +9,10 @@ import torch import torch.nn as nn -sys.path.append('../darts') -import datasets -from model import CNN -from utils import accuracy +with sys.path.append('../darts'): + from utils import accuracy + from model import CNN + import datasets if __name__ == "__main__": parser = ArgumentParser("pdarts") @@ -45,5 +45,6 @@ def model_creator(layers): dataset_train=dataset_train, dataset_valid=dataset_valid, batch_size=args.batch_size, - log_frequency=args.log_frequency) + log_frequency=args.log_frequency, + callbacks=[ArchitectureCheckpoint("./checkpoints")]) trainer.train() diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index 816239ffb5..b7411fa6d2 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -13,13 +13,14 @@ class PdartsTrainer(BaseTrainer): def __init__(self, model_creator, layers, metrics, num_epochs, dataset_train, dataset_valid, pdarts_num_layers=[0, 6, 12], pdarts_num_to_drop=[3, 2, 2], - mutator=None, batch_size=64, workers=4, device=None, log_frequency=None): + mutator=None, batch_size=64, workers=4, device=None, log_frequency=None, callbacks=None): super(PdartsTrainer, self).__init__() self.model_creator = model_creator self.layers = layers self.pdarts_num_layers = pdarts_num_layers self.pdarts_num_to_drop = pdarts_num_to_drop self.pdarts_epoch = len(pdarts_num_to_drop) + self.callbacks = callbacks self.darts_parameters = { "metrics": metrics, "num_epochs": num_epochs, @@ -30,25 +31,37 @@ def __init__(self, model_creator, layers, metrics, "device": device, "log_frequency": log_frequency } + self.callbacks = callbacks if callbacks is not None else [] + for callback in self.callbacks: + callback.build(self.model, self.mutator, self) def train(self): layers = self.layers switches = None for epoch in range(self.pdarts_epoch): + for callback in self.callbacks: + callback.on_epoch_begin(epoch) layers = self.layers+self.pdarts_num_layers[epoch] model, criterion, optim, lr_scheduler = self.model_creator(layers) self.mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches) - self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, optimizer=optim, callbacks=[LearningRateScheduler( - lr_scheduler), ArchitectureCheckpoint("./checkpoints")], **self.darts_parameters) + darts_callbacks = [] + if lr_scheduler is not None: + darts_callbacks.append(LearningRateScheduler(lr_scheduler)) + + self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, optimizer=optim, + callbacks=darts_callbacks, **self.darts_parameters) print("start pdarts training %s..." 
% epoch) self.trainer.train() switches = self.mutator.drop_paths() + for callback in self.callbacks: + callback.on_epoch_end(epoch) + def validate(self): self.model.validate() From ef0d9072c24e3f7a31ca0157a18d0e704acabaf6 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 16:38:01 +0800 Subject: [PATCH 17/30] fix a bug on missing import --- examples/nas/pdarts/search.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index 1e9cb3acd6..93a0f50117 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -1,14 +1,16 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from nni.nas.pytorch.pdarts import PdartsTrainer -import nni.nas.pytorch as nas import sys from argparse import ArgumentParser import torch import torch.nn as nn +import nni.nas.pytorch as nas +from nni.nas.pytorch.callbacks import ArchitectureCheckpoint +from nni.nas.pytorch.pdarts import PdartsTrainer + with sys.path.append('../darts'): from utils import accuracy from model import CNN From 8322a953f0eacf3c1fdaa6e9135ab0df7d7a3272 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 16:42:25 +0800 Subject: [PATCH 18/30] fix runtime bug --- examples/nas/pdarts/search.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index 93a0f50117..f4372a1738 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -11,7 +11,9 @@ from nni.nas.pytorch.callbacks import ArchitectureCheckpoint from nni.nas.pytorch.pdarts import PdartsTrainer -with sys.path.append('../darts'): +# prevent it to be reordered. +if True: + sys.path.append('../darts') from utils import accuracy from model import CNN import datasets From a58492a2e1eca3dab19b4225630f0475d0488974 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 16:57:40 +0800 Subject: [PATCH 19/30] fix callback code's location. 
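
Each P-DARTS stage creates a fresh model and a fresh PdartsMutator, so
callbacks have to be re-bound inside the stage loop instead of once in
__init__. A rough sketch of the intended control flow, with stub classes
standing in for the real NNI ones:

    class Callback:
        def build(self, model, mutator, trainer):
            # re-bind the callback to the objects of the current stage
            self.model, self.mutator, self.trainer = model, mutator, trainer

        def on_epoch_begin(self, epoch):
            print("stage %d uses %s" % (epoch, self.model))

    callbacks = [Callback()]
    for stage in range(3):
        model, mutator = "model_%d" % stage, "mutator_%d" % stage  # fresh per stage
        for callback in callbacks:
            callback.build(model, mutator, None)
            callback.on_epoch_begin(stage)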
---
 src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py
index b7411fa6d2..cf2e703bf7 100644
--- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py
+++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py
@@ -32,21 +32,20 @@ def __init__(self, model_creator, layers, metrics,
             "log_frequency": log_frequency
         }
         self.callbacks = callbacks if callbacks is not None else []
-        for callback in self.callbacks:
-            callback.build(self.model, self.mutator, self)
 
     def train(self):
         layers = self.layers
         switches = None
         for epoch in range(self.pdarts_epoch):
-            for callback in self.callbacks:
-                callback.on_epoch_begin(epoch)
 
             layers = self.layers+self.pdarts_num_layers[epoch]
             model, criterion, optim, lr_scheduler = self.model_creator(layers)
             self.mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches)
 
+            for callback in self.callbacks:
+                callback.build(self.model, self.mutator, self)
+                callback.on_epoch_begin(epoch)
+
             darts_callbacks = []
             if lr_scheduler is not None:
                 darts_callbacks.append(LearningRateScheduler(lr_scheduler))

From c474820906832226431e8dcb23c8e44621068c2e Mon Sep 17 00:00:00 2001
From: Chi Song <27178119+squirrelsc@users.noreply.github.com>
Date: Thu, 21 Nov 2019 17:02:56 +0800
Subject: [PATCH 20/30] fix previous bug thoroughly...

---
 src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py
index cf2e703bf7..0621e7f491 100644
--- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py
+++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py
@@ -43,7 +43,7 @@ def train(self):
             self.mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches)
 
             for callback in self.callbacks:
-                callback.build(self.model, self.mutator, self)
+                callback.build(model, self.mutator, self)
                 callback.on_epoch_begin(epoch)
 
             darts_callbacks = []

From 269507bb3773dfff54d8a567c9a14d9473b4a54b Mon Sep 17 00:00:00 2001
From: Chi Song <27178119+squirrelsc@users.noreply.github.com>
Date: Thu, 21 Nov 2019 17:34:47 +0800
Subject: [PATCH 21/30] update document and remove a duplicated line.

---
 docs/en_US/AdvancedFeature/MultiPhase.md        |  2 +-
 docs/en_US/NAS/Overview.md                      | 13 +++++++++++--
 docs/en_US/Tutorial/SearchSpaceSpec.md          |  8 --------
 docs/en_US/advanced.rst                         |  2 --
 src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py |  1 -
 5 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/docs/en_US/AdvancedFeature/MultiPhase.md b/docs/en_US/AdvancedFeature/MultiPhase.md
index c9727bcdcc..4cdb3a7a99 100644
--- a/docs/en_US/AdvancedFeature/MultiPhase.md
+++ b/docs/en_US/AdvancedFeature/MultiPhase.md
@@ -79,7 +79,7 @@ With this information, the tuner could know which trial is requesting a configur
 
 ### Tuners support multi-phase experiments:
 
-[TPE](../Tuner/HyperoptTuner.md), [Random](../Tuner/HyperoptTuner.md), [Anneal](../Tuner/HyperoptTuner.md), [Evolution](../Tuner/EvolutionTuner.md), [SMAC](../Tuner/SmacTuner.md), [NetworkMorphism](../Tuner/NetworkmorphismTuner.md), [MetisTuner](../Tuner/MetisTuner.md), [BOHB](../Tuner/BohbAdvisor.md), [Hyperband](../Tuner/HyperbandAdvisor.md), [ENAS tuner](https://github.com/countif/enas_nni/blob/master/nni/examples/tuners/enas/nni_controller_ptb.py).
+[TPE](../Tuner/HyperoptTuner.md), [Random](../Tuner/HyperoptTuner.md), [Anneal](../Tuner/HyperoptTuner.md), [Evolution](../Tuner/EvolutionTuner.md), [SMAC](../Tuner/SmacTuner.md), [NetworkMorphism](../Tuner/NetworkmorphismTuner.md), [MetisTuner](../Tuner/MetisTuner.md), [BOHB](../Tuner/BohbAdvisor.md), [Hyperband](../Tuner/HyperbandAdvisor.md).
 
 ### Training services support multi-phase experiment:
 [Local Machine](../TrainingService/LocalMode.md), [Remote Servers](../TrainingService/RemoteMachineMode.md), [OpenPAI](../TrainingService/PaiMode.md)
diff --git a/docs/en_US/NAS/Overview.md b/docs/en_US/NAS/Overview.md
index 7023d9b8e9..8a52d04af2 100644
--- a/docs/en_US/NAS/Overview.md
+++ b/docs/en_US/NAS/Overview.md
@@ -19,11 +19,15 @@ The main contribution of [DARTS: Differentiable Architecture Search][3] on algor
 #### Usage
 
 ```bash
-### In case NNI code is not cloned.
+# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter the code folder.
 git clone https://github.com/Microsoft/nni.git
 
+# search for the best architecture
 cd examples/nas/darts
 python3 search.py
+
+# train the best architecture
+python3 retrain.py --arc-checkpoint ./checkpoints/epoch_49.json
 ```
 
 ### P-DARTS
@@ -33,11 +37,16 @@
 #### Usage
 
 ```bash
-### In case NNI code is not cloned.
+# In case NNI code is not cloned. If the code is cloned already, ignore this line and enter the code folder.
 git clone https://github.com/Microsoft/nni.git
 
+# search for the best architecture
 cd examples/nas/pdarts
 python3 search.py
+
+# train the best architecture; the process is the same as for darts.
+cd examples/nas/darts
+python3 retrain.py --arc-checkpoint ./checkpoints/epoch_2.json
 ```
 
 ## Use NNI API
diff --git a/docs/en_US/Tutorial/SearchSpaceSpec.md b/docs/en_US/Tutorial/SearchSpaceSpec.md
index fd1781716f..eb5d39315c 100644
--- a/docs/en_US/Tutorial/SearchSpaceSpec.md
+++ b/docs/en_US/Tutorial/SearchSpaceSpec.md
@@ -73,12 +73,6 @@ All types of sampling strategies and their parameter are listed here:
   * Which means the variable value is a value like `round(exp(normal(mu, sigma)) / q) * q`
   * Suitable for a discrete variable with respect to which the objective is smooth and gets smoother with the size of the variable, which is bounded from one side.
 
-* `{"_type": "mutable_layer", "_value": {mutable_layer_infomation}}`
-  * Type for [Neural Architecture Search Space][1]. Value is also a dictionary, which contains key-value pairs representing respectively name and search space of each mutable_layer.
-  * For now, users can only use this type of search space with annotation, which means that there is no need to define a json file for search space since it will be automatically generated according to the annotation in trial code.
-  * The following HPO tuners can be adapted to tune this search space: TPE, Random, Anneal, Evolution, Grid Search,
-    Hyperband and BOHB.
-  * For detailed usage, please refer to [General NAS Interfaces][1].
## Search Space Types Supported by Each Tuner @@ -105,5 +99,3 @@ Known Limitations: * Only Random Search/TPE/Anneal/Evolution tuner supports nested search space * We do not support nested search space "Hyper Parameter" in visualization now, the enhancement is being considered in [#1110](https://github.com/microsoft/nni/issues/1110), any suggestions or discussions or contributions are warmly welcomed - -[1]: ../AdvancedFeature/GeneralNasInterfaces.md diff --git a/docs/en_US/advanced.rst b/docs/en_US/advanced.rst index d9192cc869..e38f634969 100644 --- a/docs/en_US/advanced.rst +++ b/docs/en_US/advanced.rst @@ -3,5 +3,3 @@ Advanced Features .. toctree:: MultiPhase<./AdvancedFeature/MultiPhase> - AdvancedNas<./AdvancedFeature/AdvancedNas> - NAS Programming Interface<./AdvancedFeature/GeneralNasInterfaces> \ No newline at end of file diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index 0621e7f491..702600e826 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -20,7 +20,6 @@ def __init__(self, model_creator, layers, metrics, self.pdarts_num_layers = pdarts_num_layers self.pdarts_num_to_drop = pdarts_num_to_drop self.pdarts_epoch = len(pdarts_num_to_drop) - self.callbacks = callbacks self.darts_parameters = { "metrics": metrics, "num_epochs": num_epochs, From 07b15cdcc620b99c13b57bd995aadbde6c85b8df Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 18:01:08 +0800 Subject: [PATCH 22/30] update logs --- examples/nas/darts/search.py | 7 +++++-- examples/nas/pdarts/search.py | 4 ++++ src/sdk/pynni/nni/nas/pytorch/darts/trainer.py | 10 ++++++++-- src/sdk/pynni/nni/nas/pytorch/enas/trainer.py | 7 +++++-- src/sdk/pynni/nni/nas/pytorch/mutables.py | 5 ++++- src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 6 ++++-- src/sdk/pynni/nni/nas/pytorch/trainer.py | 4 ++-- 7 files changed, 32 insertions(+), 11 deletions(-) diff --git a/examples/nas/darts/search.py b/examples/nas/darts/search.py index 02c720a60c..568a4c52fe 100644 --- a/examples/nas/darts/search.py +++ b/examples/nas/darts/search.py @@ -1,14 +1,17 @@ +import logging from argparse import ArgumentParser -import datasets import torch import torch.nn as nn +import datasets from model import CNN -from nni.nas.pytorch.callbacks import LearningRateScheduler, ArchitectureCheckpoint +from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint, + LearningRateScheduler) from nni.nas.pytorch.darts import DartsTrainer from utils import accuracy +logging.basicConfig(level=logging.INFO) if __name__ == "__main__": parser = ArgumentParser("darts") diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index f4372a1738..787d3e38ba 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+import logging import sys from argparse import ArgumentParser @@ -18,6 +19,9 @@ from model import CNN import datasets + +logging.basicConfig(level=logging.INFO) + if __name__ == "__main__": parser = ArgumentParser("pdarts") parser.add_argument('--add_layers', action='append', diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py index c6b29de04a..f69dac5ef6 100644 --- a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py @@ -1,12 +1,16 @@ import copy +import logging import torch from torch import nn as nn from nni.nas.pytorch.trainer import Trainer from nni.nas.pytorch.utils import AverageMeterGroup + from .mutator import DartsMutator +logger = logging.getLogger("darts/trainer") + class DartsTrainer(Trainer): def __init__(self, model, loss, metrics, @@ -72,7 +76,8 @@ def train_one_epoch(self, epoch): metrics["loss"] = loss.item() meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: - print("Epoch [{}/{}] Step [{}/{}] {}".format(epoch, self.num_epochs, step, len(self.train_loader), meters)) + logger.info("Epoch [{}/{}] Step [{}/{}] {}".format(epoch, + self.num_epochs, step, len(self.train_loader), meters)) def validate_one_epoch(self, epoch): self.model.eval() @@ -86,7 +91,8 @@ def validate_one_epoch(self, epoch): metrics = self.metrics(logits, y) meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: - print("Epoch [{}/{}] Step [{}/{}] {}".format(epoch, self.num_epochs, step, len(self.valid_loader), meters)) + logger.info("Epoch [{}/{}] Step [{}/{}] {}".format(epoch, + self.num_epochs, step, len(self.valid_loader), meters)) def _unrolled_backward(self, trn_X, trn_y, val_X, val_y, backup_model, lr): """ diff --git a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py index 1ed302ac7b..460e3d9767 100644 --- a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py @@ -1,3 +1,4 @@ +import logging import torch import torch.optim as optim @@ -6,6 +7,8 @@ from .mutator import EnasMutator +logger = logging.getLogger("enas/trainer") + class EnasTrainer(Trainer): def __init__(self, model, loss, metrics, reward_function, optimizer, num_epochs, dataset_train, dataset_valid, @@ -70,7 +73,7 @@ def train_one_epoch(self, epoch): meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: - print("Model Epoch [{}/{}] Step [{}/{}] {}".format(epoch, self.num_epochs, + logger.info("Model Epoch [{}/{}] Step [{}/{}] {}".format(epoch, self.num_epochs, step, len(self.train_loader), meters)) # Train sampler (mutator) @@ -109,7 +112,7 @@ def train_one_epoch(self, epoch): self.mutator_optim.zero_grad() if self.log_frequency is not None and step % self.log_frequency == 0: - print("RL Epoch [{}/{}] Step [{}/{}] {}".format(epoch, self.num_epochs, + logger.info("RL Epoch [{}/{}] Step [{}/{}] {}".format(epoch, self.num_epochs, mutator_step // self.mutator_steps_aggregate, self.mutator_steps, meters)) mutator_step += 1 diff --git a/src/sdk/pynni/nni/nas/pytorch/mutables.py b/src/sdk/pynni/nni/nas/pytorch/mutables.py index 79cde1cf3f..0ee7855ca9 100644 --- a/src/sdk/pynni/nni/nas/pytorch/mutables.py +++ b/src/sdk/pynni/nni/nas/pytorch/mutables.py @@ -1,7 +1,10 @@ +import logging + import torch.nn as nn from nni.nas.pytorch.utils import global_mutable_counting +logger = logging.getLogger("darts/trainer") class Mutable(nn.Module): 
""" @@ -20,7 +23,7 @@ def __init__(self, key=None): if key is not None: if not isinstance(key, str): key = str(key) - print("Warning: key \"{}\" is not string, converted to string.".format(key)) + logger.warn("Warning: key \"{}\" is not string, converted to string.".format(key)) self._key = key else: self._key = self.__class__.__name__ + str(global_mutable_counting()) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index 702600e826..c403262040 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -1,12 +1,14 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. - +import logging from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint, LearningRateScheduler) from nni.nas.pytorch.darts import DartsTrainer from nni.nas.pytorch.trainer import BaseTrainer from .mutator import PdartsMutator +logger = logging.getLogger("pdarts/trainer") + class PdartsTrainer(BaseTrainer): @@ -51,7 +53,7 @@ def train(self): self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, optimizer=optim, callbacks=darts_callbacks, **self.darts_parameters) - print("start pdarts training %s..." % epoch) + logger.info("start pdarts training %s..." % epoch) self.trainer.train() diff --git a/src/sdk/pynni/nni/nas/pytorch/trainer.py b/src/sdk/pynni/nni/nas/pytorch/trainer.py index a4954a0747..9e4b20c1cb 100644 --- a/src/sdk/pynni/nni/nas/pytorch/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/trainer.py @@ -59,12 +59,12 @@ def train(self, validate=True): callback.on_epoch_begin(epoch) # training - print("Epoch {} Training".format(epoch)) + _logger.info("Epoch {} Training".format(epoch)) self.train_one_epoch(epoch) if validate: # validation - print("Epoch {} Validating".format(epoch)) + _logger.info("Epoch {} Validating".format(epoch)) self.validate_one_epoch(epoch) for callback in self.callbacks: From 882e040405fb1574a2a7c96d4d71551ece42e1f9 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 18:11:09 +0800 Subject: [PATCH 23/30] remove useless file --- src/sdk/pynni/nni/nas/utils.py | 49 ---------------------------------- 1 file changed, 49 deletions(-) delete mode 100644 src/sdk/pynni/nni/nas/utils.py diff --git a/src/sdk/pynni/nni/nas/utils.py b/src/sdk/pynni/nni/nas/utils.py deleted file mode 100644 index 5000946e7e..0000000000 --- a/src/sdk/pynni/nni/nas/utils.py +++ /dev/null @@ -1,49 +0,0 @@ -from collections import OrderedDict - -_counter = 0 - - -def global_mutable_counting(): - global _counter - _counter += 1 - return _counter - - -class AverageMeterGroup(object): - - def __init__(self): - self.meters = OrderedDict() - - def update(self, data): - for k, v in data.items(): - if k not in self.meters: - self.meters[k] = AverageMeter(k, ":4f") - self.meters[k].update(v) - - def __str__(self): - return " ".join(str(v) for _, v in self.meters.items()) - - -class AverageMeter(object): - """Computes and stores the average and current value""" - - def __init__(self, name, fmt=':f'): - self.name = name - self.fmt = fmt - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) From 
2f09e99f48fb2fabde925d72fd295e3d940c823a Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 18:19:07 +0800 Subject: [PATCH 24/30] set log level to info --- examples/nas/darts/retrain.py | 3 ++- examples/nas/darts/search.py | 2 -- examples/nas/pdarts/search.py | 2 -- src/sdk/pynni/nni/nas/pytorch/darts/trainer.py | 1 + src/sdk/pynni/nni/nas/pytorch/enas/trainer.py | 1 + src/sdk/pynni/nni/nas/pytorch/mutables.py | 1 + src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 1 + src/sdk/pynni/nni/nas/pytorch/trainer.py | 1 + 8 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/nas/darts/retrain.py b/examples/nas/darts/retrain.py index 5c8fabf8d0..fdef6e5620 100644 --- a/examples/nas/darts/retrain.py +++ b/examples/nas/darts/retrain.py @@ -10,8 +10,9 @@ from nni.nas.pytorch.fixed import apply_fixed_architecture from nni.nas.pytorch.utils import AverageMeter -logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/examples/nas/darts/search.py b/examples/nas/darts/search.py index 568a4c52fe..6eaaeb5e59 100644 --- a/examples/nas/darts/search.py +++ b/examples/nas/darts/search.py @@ -11,8 +11,6 @@ from nni.nas.pytorch.darts import DartsTrainer from utils import accuracy -logging.basicConfig(level=logging.INFO) - if __name__ == "__main__": parser = ArgumentParser("darts") parser.add_argument("--layers", default=8, type=int) diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index 787d3e38ba..0c50b52836 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -20,8 +20,6 @@ import datasets -logging.basicConfig(level=logging.INFO) - if __name__ == "__main__": parser = ArgumentParser("pdarts") parser.add_argument('--add_layers', action='append', diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py index f69dac5ef6..7069601b3b 100644 --- a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py @@ -10,6 +10,7 @@ from .mutator import DartsMutator logger = logging.getLogger("darts/trainer") +logger.setLevel(logging.INFO) class DartsTrainer(Trainer): diff --git a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py index 460e3d9767..43d5bef536 100644 --- a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py @@ -8,6 +8,7 @@ logger = logging.getLogger("enas/trainer") +logger.setLevel(logging.INFO) class EnasTrainer(Trainer): def __init__(self, model, loss, metrics, reward_function, diff --git a/src/sdk/pynni/nni/nas/pytorch/mutables.py b/src/sdk/pynni/nni/nas/pytorch/mutables.py index 0ee7855ca9..9e870ff76f 100644 --- a/src/sdk/pynni/nni/nas/pytorch/mutables.py +++ b/src/sdk/pynni/nni/nas/pytorch/mutables.py @@ -5,6 +5,7 @@ from nni.nas.pytorch.utils import global_mutable_counting logger = logging.getLogger("darts/trainer") +logger.setLevel(logging.INFO) class Mutable(nn.Module): """ diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index c403262040..f0d9a64548 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -8,6 +8,7 @@ from .mutator import PdartsMutator logger = logging.getLogger("pdarts/trainer") +logger.setLevel(logging.INFO) class 
PdartsTrainer(BaseTrainer): diff --git a/src/sdk/pynni/nni/nas/pytorch/trainer.py b/src/sdk/pynni/nni/nas/pytorch/trainer.py index 9e4b20c1cb..6908985de2 100644 --- a/src/sdk/pynni/nni/nas/pytorch/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/trainer.py @@ -7,6 +7,7 @@ from .base_trainer import BaseTrainer _logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) class TorchTensorEncoder(json.JSONEncoder): From 1837fae4e876486b9019fae7dc4d6f4a1cc6956b Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 18:41:27 +0800 Subject: [PATCH 25/30] fix format and test --- examples/nas/darts/retrain.py | 11 ++++++++++- examples/nas/pdarts/search.py | 14 ++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/examples/nas/darts/retrain.py b/examples/nas/darts/retrain.py index fdef6e5620..e3167376f9 100644 --- a/examples/nas/darts/retrain.py +++ b/examples/nas/darts/retrain.py @@ -1,4 +1,5 @@ import logging +import time from argparse import ArgumentParser import torch @@ -10,8 +11,16 @@ from nni.nas.pytorch.fixed import apply_fixed_architecture from nni.nas.pytorch.utils import AverageMeter -logger = logging.getLogger(__name__) +logger = logging.getLogger() + +fmt = '[%(asctime)s] %(levelname)s (%(name)s/%(threadName)s) %(message)s' +logging.Formatter.converter = time.localtime +formatter = logging.Formatter(fmt, '%m/%d/%Y, %I:%M:%S %p') + +std_out_info = logging.StreamHandler() +std_out_info.setFormatter(formatter) logger.setLevel(logging.INFO) +logger.addHandler(std_out_info) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index 0c50b52836..4c3f5cf8d4 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -3,6 +3,7 @@ import logging import sys +import time from argparse import ArgumentParser import torch @@ -19,6 +20,16 @@ from model import CNN import datasets +logger = logging.getLogger() + +fmt = '[%(asctime)s] %(levelname)s (%(name)s/%(threadName)s) %(message)s' +logging.Formatter.converter = time.localtime +formatter = logging.Formatter(fmt, '%m/%d/%Y, %I:%M:%S %p') + +std_out_info = logging.StreamHandler() +std_out_info.setFormatter(formatter) +logger.setLevel(logging.INFO) +logger.addHandler(std_out_info) if __name__ == "__main__": parser = ArgumentParser("pdarts") @@ -31,6 +42,7 @@ parser.add_argument("--epochs", default=50, type=int) args = parser.parse_args() + logger.info("loading data") dataset_train, dataset_valid = datasets.get_dataset("cifar10") def model_creator(layers): @@ -42,6 +54,7 @@ def model_creator(layers): return model, criterion, optim, lr_scheduler + logger.info("initializing trainer") trainer = PdartsTrainer(model_creator, layers=args.layers, metrics=lambda output, target: accuracy(output, target, topk=(1,)), @@ -53,4 +66,5 @@ def model_creator(layers): batch_size=args.batch_size, log_frequency=args.log_frequency, callbacks=[ArchitectureCheckpoint("./checkpoints")]) + logger.info("training") trainer.train() From 0eddd375eccfd16478f1db3093079d9bc54bd65b Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Thu, 21 Nov 2019 18:51:34 +0800 Subject: [PATCH 26/30] add more logger for examples --- examples/nas/darts/search.py | 12 ++++++++++++ examples/nas/enas/search.py | 13 +++++++++++++ 2 files changed, 25 insertions(+) diff --git a/examples/nas/darts/search.py b/examples/nas/darts/search.py index 6eaaeb5e59..d9bdf0c7b5 
100644 --- a/examples/nas/darts/search.py +++ b/examples/nas/darts/search.py @@ -1,4 +1,5 @@ import logging +import time from argparse import ArgumentParser import torch @@ -11,6 +12,17 @@ from nni.nas.pytorch.darts import DartsTrainer from utils import accuracy +logger = logging.getLogger() + +fmt = '[%(asctime)s] %(levelname)s (%(name)s/%(threadName)s) %(message)s' +logging.Formatter.converter = time.localtime +formatter = logging.Formatter(fmt, '%m/%d/%Y, %I:%M:%S %p') + +std_out_info = logging.StreamHandler() +std_out_info.setFormatter(formatter) +logger.setLevel(logging.INFO) +logger.addHandler(std_out_info) + if __name__ == "__main__": parser = ArgumentParser("darts") parser.add_argument("--layers", default=8, type=int) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index 35bc930333..6fade75164 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -1,3 +1,5 @@ +import logging +import time from argparse import ArgumentParser import torch @@ -10,6 +12,17 @@ from nni.nas.pytorch.callbacks import LearningRateScheduler, ArchitectureCheckpoint from utils import accuracy, reward_accuracy +logger = logging.getLogger() + +fmt = '[%(asctime)s] %(levelname)s (%(name)s/%(threadName)s) %(message)s' +logging.Formatter.converter = time.localtime +formatter = logging.Formatter(fmt, '%m/%d/%Y, %I:%M:%S %p') + +std_out_info = logging.StreamHandler() +std_out_info.setFormatter(formatter) +logger.setLevel(logging.INFO) +logger.addHandler(std_out_info) + if __name__ == "__main__": parser = ArgumentParser("enas") parser.add_argument("--batch-size", default=128, type=int) From 2ed3161eb9cff14a5b4ae9a4cc2c05243d7b3b4a Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Fri, 22 Nov 2019 09:33:28 +0800 Subject: [PATCH 27/30] fix log information --- src/sdk/pynni/nni/nas/pytorch/darts/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py index 7069601b3b..e714105367 100644 --- a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py @@ -78,7 +78,7 @@ def train_one_epoch(self, epoch): meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: logger.info("Epoch [{}/{}] Step [{}/{}] {}".format(epoch, - self.num_epochs, step, len(self.train_loader), meters)) + self.num_epochs, step+1, len(self.train_loader), meters)) def validate_one_epoch(self, epoch): self.model.eval() @@ -93,7 +93,7 @@ def validate_one_epoch(self, epoch): meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: logger.info("Epoch [{}/{}] Step [{}/{}] {}".format(epoch, - self.num_epochs, step, len(self.valid_loader), meters)) + self.num_epochs, step+1, len(self.test_loader), meters)) def _unrolled_backward(self, trn_X, trn_y, val_X, val_y, backup_model, lr): """ From d030fe04dbe2b29502e68ed0b8580b29d21ad0c6 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Fri, 22 Nov 2019 11:17:20 +0800 Subject: [PATCH 28/30] fix pylint errors --- examples/nas/pdarts/search.py | 1 - src/sdk/pynni/nni/nas/pytorch/darts/trainer.py | 8 ++++---- src/sdk/pynni/nni/nas/pytorch/enas/trainer.py | 10 +++++----- src/sdk/pynni/nni/nas/pytorch/mutables.py | 3 ++- src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py | 2 -- src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 4 ++-- 
src/sdk/pynni/nni/nas/pytorch/trainer.py | 4 ++-- 7 files changed, 15 insertions(+), 17 deletions(-) diff --git a/examples/nas/pdarts/search.py b/examples/nas/pdarts/search.py index 4c3f5cf8d4..5d38fda0db 100644 --- a/examples/nas/pdarts/search.py +++ b/examples/nas/pdarts/search.py @@ -9,7 +9,6 @@ import torch import torch.nn as nn -import nni.nas.pytorch as nas from nni.nas.pytorch.callbacks import ArchitectureCheckpoint from nni.nas.pytorch.pdarts import PdartsTrainer diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py index e714105367..a395463760 100644 --- a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py @@ -77,8 +77,8 @@ def train_one_epoch(self, epoch): metrics["loss"] = loss.item() meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [{}/{}] Step [{}/{}] {}".format(epoch, - self.num_epochs, step+1, len(self.train_loader), meters)) + logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch+1, + self.num_epochs, step+1, len(self.train_loader), meters) def validate_one_epoch(self, epoch): self.model.eval() @@ -92,8 +92,8 @@ def validate_one_epoch(self, epoch): metrics = self.metrics(logits, y) meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Epoch [{}/{}] Step [{}/{}] {}".format(epoch, - self.num_epochs, step+1, len(self.test_loader), meters)) + logger.info("Epoch [%s/%s] Step [%s/%s] %s", epoch+1, + self.num_epochs, step+1, len(self.test_loader), meters) def _unrolled_backward(self, trn_X, trn_y, val_X, val_y, backup_model, lr): """ diff --git a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py index 43d5bef536..d365a3b5d6 100644 --- a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py @@ -10,6 +10,7 @@ logger = logging.getLogger("enas/trainer") logger.setLevel(logging.INFO) + class EnasTrainer(Trainer): def __init__(self, model, loss, metrics, reward_function, optimizer, num_epochs, dataset_train, dataset_valid, @@ -74,8 +75,8 @@ def train_one_epoch(self, epoch): meters.update(metrics) if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("Model Epoch [{}/{}] Step [{}/{}] {}".format(epoch, self.num_epochs, - step, len(self.train_loader), meters)) + logger.info("Model Epoch [%s/%s] Step [%s/%s] %s", epoch, + self.num_epochs, step, len(self.train_loader), meters) # Train sampler (mutator) self.model.eval() @@ -113,9 +114,8 @@ def train_one_epoch(self, epoch): self.mutator_optim.zero_grad() if self.log_frequency is not None and step % self.log_frequency == 0: - logger.info("RL Epoch [{}/{}] Step [{}/{}] {}".format(epoch, self.num_epochs, - mutator_step // self.mutator_steps_aggregate, - self.mutator_steps, meters)) + logger.info("RL Epoch [%s/%s] Step [%s/%s] %s", epoch, self.num_epochs, + mutator_step // self.mutator_steps_aggregate, self.mutator_steps, meters) mutator_step += 1 if mutator_step >= total_mutator_steps: break diff --git a/src/sdk/pynni/nni/nas/pytorch/mutables.py b/src/sdk/pynni/nni/nas/pytorch/mutables.py index 9e870ff76f..d0954f6ed5 100644 --- a/src/sdk/pynni/nni/nas/pytorch/mutables.py +++ b/src/sdk/pynni/nni/nas/pytorch/mutables.py @@ -7,6 +7,7 @@ logger = logging.getLogger("darts/trainer") logger.setLevel(logging.INFO) + class Mutable(nn.Module): """ Mutable is designed to function as a normal layer, with all necessary 
operators' weights. @@ -24,7 +25,7 @@ def __init__(self, key=None): if key is not None: if not isinstance(key, str): key = str(key) - logger.warn("Warning: key \"{}\" is not string, converted to string.".format(key)) + logger.warning("Warning: key \"%s\" is not string, converted to string.", key) self._key = key else: self._key = self.__class__.__name__ + str(global_mutable_counting()) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py index 68c6f1dde5..5862e9714b 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py @@ -4,8 +4,6 @@ import copy import numpy as np -import torch -from torch import nn as nn from torch.nn import functional as F from nni.nas.pytorch.darts import DartsMutator diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index f0d9a64548..377bd9ec3a 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import logging -from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint, LearningRateScheduler) +from nni.nas.pytorch.callbacks import LearningRateScheduler from nni.nas.pytorch.darts import DartsTrainer from nni.nas.pytorch.trainer import BaseTrainer @@ -54,7 +54,7 @@ def train(self): self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, optimizer=optim, callbacks=darts_callbacks, **self.darts_parameters) - logger.info("start pdarts training %s..." % epoch) + logger.info("start pdarts training %s...", epoch) self.trainer.train() diff --git a/src/sdk/pynni/nni/nas/pytorch/trainer.py b/src/sdk/pynni/nni/nas/pytorch/trainer.py index 6908985de2..9195631a60 100644 --- a/src/sdk/pynni/nni/nas/pytorch/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/trainer.py @@ -60,12 +60,12 @@ def train(self, validate=True): callback.on_epoch_begin(epoch) # training - _logger.info("Epoch {} Training".format(epoch)) + _logger.info("Epoch %d Training", epoch) self.train_one_epoch(epoch) if validate: # validation - _logger.info("Epoch {} Validating".format(epoch)) + _logger.info("Epoch %d Validating", epoch) self.validate_one_epoch(epoch) for callback in self.callbacks: From 380ed8595db9b92a883ace32e07aab4c55a72101 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Fri, 22 Nov 2019 11:21:55 +0800 Subject: [PATCH 29/30] update logger names --- src/sdk/pynni/nni/nas/pytorch/darts/trainer.py | 2 +- src/sdk/pynni/nni/nas/pytorch/enas/trainer.py | 2 +- src/sdk/pynni/nni/nas/pytorch/mutables.py | 2 +- src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py index a395463760..6392962111 100644 --- a/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/darts/trainer.py @@ -9,7 +9,7 @@ from .mutator import DartsMutator -logger = logging.getLogger("darts/trainer") +logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py index d365a3b5d6..49052d6b08 100644 --- a/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/enas/trainer.py @@ -7,7 +7,7 @@ from .mutator import EnasMutator -logger = 
logging.getLogger("enas/trainer") +logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/src/sdk/pynni/nni/nas/pytorch/mutables.py b/src/sdk/pynni/nni/nas/pytorch/mutables.py index d0954f6ed5..4dbf514af8 100644 --- a/src/sdk/pynni/nni/nas/pytorch/mutables.py +++ b/src/sdk/pynni/nni/nas/pytorch/mutables.py @@ -4,7 +4,7 @@ from nni.nas.pytorch.utils import global_mutable_counting -logger = logging.getLogger("darts/trainer") +logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py index 377bd9ec3a..af31da08fc 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/trainer.py @@ -7,7 +7,7 @@ from .mutator import PdartsMutator -logger = logging.getLogger("pdarts/trainer") +logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) From 85ccac0d3046f08c31f2d29bd011288beb4456c0 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Fri, 22 Nov 2019 11:39:39 +0800 Subject: [PATCH 30/30] add dependencies --- docs/en_US/NAS/Overview.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/en_US/NAS/Overview.md b/docs/en_US/NAS/Overview.md index 8a52d04af2..4e48483df3 100644 --- a/docs/en_US/NAS/Overview.md +++ b/docs/en_US/NAS/Overview.md @@ -12,6 +12,12 @@ NNI supports below NAS algorithms now and being adding more. User can reproduce Note, these algorithms run standalone without nnictl, and supports PyTorch only. +### Dependencies + +* Install latest NNI +* PyTorch 1.2+ +* git + ### DARTS The main contribution of [DARTS: Differentiable Architecture Search][3] on algorithm is to introduce a novel algorithm for differentiable network architecture search on bilevel optimization.
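
The core mechanism these patches wire together is the progressive shrinking of each `LayerChoice`'s candidate set through boolean "switches" (see `PdartsMutator` above): after every P-DARTS stage, the lowest-weighted operations are switched off, and the next, deeper stage searches only among the survivors. Below is a minimal standalone sketch of that idea, illustrative only and with made-up random weights; it is not the actual NNI implementation.

```python
import numpy as np

def drop_weakest(switches, op_weights, num_to_drop):
    """switches: one bool per candidate op. op_weights: scores for the ops
    that are still enabled, so len(op_weights) == sum(switches)."""
    weights = iter(op_weights)
    # pair each still-enabled op index with its current architecture weight
    scored = [(i, next(weights)) for i, on in enumerate(switches) if on]
    for i, _ in sorted(scored, key=lambda item: item[1])[:num_to_drop]:
        switches[i] = False  # this op is excluded from the next, deeper stage
    return switches

switches = [True] * 8                                    # 8 candidate ops
switches = drop_weakest(switches, np.random.rand(8), 3)  # stage 0: drop 3
switches = drop_weakest(switches, np.random.rand(5), 2)  # stage 1: drop 2
print(sum(switches))                                     # 3 candidates remain
```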