diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 372272a53c12c..dc7c6d2e59726 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -257,6 +257,11 @@ seq_concat
 ..  autoclass:: paddle.v2.layer.seq_concat
     :noindex:
 
+sub_nested_seq
+--------------
+..  autoclass:: paddle.v2.layer.sub_nested_seq
+    :noindex:
+
 Reshaping Layers
 ================
 
diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
new file mode 100644
index 0000000000000..76f587fff760d
--- /dev/null
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@@ -0,0 +1,176 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+class SubNestedSequenceLayer : public Layer {
+public:
+  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  /*
+   * This functions generates the indices of rows in a batch according to the
+   * indices of selected sub-sequence in each sequence.
+   *
+   * Examples:
+   * selectedIndices:
+   *   [
+   *     [0, 1, -1],
+   *     [0, 1, 2],
+   *     [0, -1, -1],
+   *     [0, 2, 3],
+   *   ]
+   * inputSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   *
+   * ths output is saved to private member rowIndice_;
+   * [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
+   *  16,17,18,19,20,21,22,23,24,25,26,27]
+   */
+
+  void calSelectedCols(const MatrixPtr selectedIndices,
+                       const std::vector<std::vector<int>>& inputSeqInfo);
+
+  // if the second input of this layer is on GPU memory, copy it to CPU memory.
+  MatrixPtr selIdsCpu_;
+
+  // reorganized sequenceStartPositions and subSequenceStartPositions
+  // into a 2d vector to facilitate the sequence selection process.
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+
+  // the final selected row indices in a batch,
+  // rowIdx_ and selectedRows_ actually share a same memory.
+  IVectorPtr rowIndice_;
+  std::vector<int> selectedRows_;
+};
+
+REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
+
+bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(2U, inputLayers_.size());
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SubNestedSequenceLayer::calSelectedCols(
+    const MatrixPtr selectedIndices,
+    const std::vector<std::vector<int>>& inputSeqInfo) {
+  selectedRows_.clear();
+
+  std::vector<int> outSeqStartInfo(1, 0);
+  std::vector<int> outSubSeqStartInfo(1, 0);
+
+  size_t seqNum = selectedIndices->getHeight();
+  size_t beamSize = selectedIndices->getWidth();
+  for (size_t i = 0; i < seqNum; ++i) {
+    for (size_t j = 0; j < beamSize; ++j) {
+      if (selectedIndices->getElement(i, j) == -1.) break;
+      int selSubSeqIdx = selectedIndices->getElement(i, j);
+      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
+
+      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
+                         inputSeqInfoVec_[i][selSubSeqIdx];
+      for (size_t k = 0; k < subSeqLen; ++k)
+        selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k);
+      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen);
+    }
+    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
+
+  ICpuGpuVector::resizeOrCreate(
+      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
+  output_.subSequenceStartPositions->copyFrom(
+      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
+}
+
+void SubNestedSequenceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
+                              << "must be a nested sequence.";
+  const MatrixPtr selectedIndices = getInputValue(1);
+  CHECK_EQ(inputSeq.getNumSequences(), selectedIndices->getHeight());
+
+  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
+    /*
+     * Currently, the second input for this layer is generated by
+     * kmax_sequence_score_layer whose output is always stored on CPU,
+     * or a data_layer which canbe on GPU.
+     *
+     * If the second input is on GPU, copy it to CPU memory, because this
+     * input always uses very few memory, and operations related to it are
+     * all logic control, not computations.
+     */
+    Matrix::resizeOrCreate(selIdsCpu_,
+                           selectedIndices->getHeight(),
+                           selectedIndices->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    selIdsCpu_->copyFrom(*selectedIndices);
+  } else {
+    selIdsCpu_ = selectedIndices;
+  }
+
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  calSelectedCols(selIdsCpu_, inputSeqInfoVec_);
+
+  resetOutput(selectedRows_.size(), getSize());
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inputSeqGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+
+  if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index fe11278f41c01..0f312b6ca50bc 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1899,6 +1899,84 @@ TEST(Layer, CropLayer) {
   }
 }
 
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+TEST(Layer, SubNestedSequenceLayer) {
+  // layer size is not crutial for this layer,
+  // so use a small layer size in unittest
+  const int layerSize = 4;
+
+  const int maxSeqNum = 50;
+  const int maxSeqLen = 50;
+  const int maxBeamSize = 32;
+
+  srand((size_t)(time(NULL)));
+  int beamSize = 1 + (rand() % maxBeamSize);
+
+  TestConfig config;
+  config.layerConfig.set_type("sub_nested_seq");
+  config.layerConfig.set_name("sub_nested_seq_layer");
+  config.layerConfig.set_size(layerSize);
+
+  int seqNum = 1 + (rand() % maxSeqNum);
+
+  // sequence information for the first input, it is a nested sequence
+  vector<int> seqStartPos(seqNum + 1, 0);
+  vector<int> subSeqStartPos(1, 0);
+
+  // selected indices
+  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
+  selectedIndices->one();
+  selectedIndices->mulScalar(-1.);
+  real* indicesData = selectedIndices->getData();
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqNum; ++j) {
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % maxSeqLen)));
+    }
+    vector<real> selSeqs =
+        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
+    memcpy(indicesData + (i * beamSize),
+           selSeqs.data(),
+           selSeqs.size() * sizeof(real));
+    seqStartPos[i + 1] = subSeqStartPos.back();
+  }
+
+  MatrixPtr seqInputPtr =
+      Matrix::create(seqStartPos.back(), layerSize, false, false);
+  seqInputPtr->randomizeUniform();
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                              "nested_seq_input",
+                              seqInputPtr,
+                              seqStartPos,
+                              subSeqStartPos});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sub_nested_seq",
+                  /* batchSize */ seqNum,
+                  /* trans */ false,
+                  /* useGpu*/ useGpu,
+                  /* useWeight */ false);
+  }
+}
+
 TEST(Layer, ClipLayer) {
   const size_t batchSize = 128;
   const size_t size = 512;
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index ef72b973c1a46..0547ac93cd183 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -666,4 +666,24 @@ void Argument::subArgFrom(const Argument& input,
   }
 }
 
+void Argument::reorganizeSeqInfo(
+    const ICpuGpuVectorPtr seqStartPos,
+    const ICpuGpuVectorPtr subSeqStartPos,
+    std::vector<std::vector<int>>& reorganizedSeqInfo) {
+  int* seqStarts = seqStartPos->getMutableData(false);
+  int* subSeqStarts = subSeqStartPos->getMutableData(false);
+
+  int seqNum = seqStartPos->getSize() - 1;
+  reorganizedSeqInfo.resize(seqNum, std::vector<int>());
+  int seqIdx = 0;
+  for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
+    reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+    if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
+      seqIdx++;
+      if (seqIdx == seqNum) return;
+      reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+    }
+  }
+}
+
 }  // namespace paddle
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 0ccdef802e71b..d8d7a4398f99a 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -317,6 +317,30 @@ struct Argument {
    */
   void printValueString(std::ostream& stream,
                         const std::string& prefix = "") const;
+
+  /**
+   * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and
+   * subSequenceStartPositions into a 2 dimensional arrary: reorganizedSeqInfo.
+   *
+   * @param seqStartPos: sequenceStartPositions of an Argument.
+   * @param subSeqStartPos: subSequenceStartPositions of an Argument.
+   * @param the reorganized sequence start position information.
+   *
+   * Examples:
+   * seqStartPos: [0, 4, 15, 20, 28]
+   * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28]
+   * reorganizedSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   */
+  static void reorganizeSeqInfo(
+      const ICpuGpuVectorPtr seqStartPos,
+      const ICpuGpuVectorPtr subSeqStartPos,
+      std::vector<std::vector<int>>& reorganizedSeqInfo);
 };
 
 }  // namespace paddle
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 9ea69fc5e5763..c8fc49e20da2e 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2657,6 +2657,31 @@ def __init__(self, name, inputs, bias=False, **xargs):
         self.create_bias_parameter(bias, size)
 
 
+@config_layer('sub_nested_seq')
+class SubNestedSequenceLayer(LayerBase):
+    def __init__(self, name, inputs, selected_indices, bias=False, **xargs):
+        if isinstance(inputs, list):
+            assert len(inputs) == 1, ('the first input of sub_nested_seq '
+                                      'layer is a single nested sequence.')
+            inputs = inputs[0]
+        if isinstance(selected_indices, list):
+            assert len(selected_indices) == 1, (
+                'the second input of '
+                'sub_nested_seq layer is a single layer which is a '
+                'set of selected indices.')
+            selected_indices = selected_indices[0]
+
+        super(SubNestedSequenceLayer, self).__init__(
+            name,
+            'sub_nested_seq',
+            0,
+            inputs=[inputs, selected_indices],
+            **xargs)
+        input_layer0 = self.get_input_layer(0)
+        size = input_layer0.size
+        self.set_layer_size(size)
+
+
 @config_layer('out_prod')
 class OuterProdLayer(LayerBase):
     def __init__(self, name, inputs, device=None):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index ea5fdcc50f6ab..2c7cebc359173 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -129,6 +129,7 @@
     'prelu_layer',
     'gated_unit_layer',
     'crop_layer',
+    'sub_nested_seq_layer',
     'clip_layer',
     'slice_projection',
 ]
@@ -224,6 +225,7 @@ class LayerType(object):
 
     PRELU = 'prelu'
     CROP_LAYER = 'crop'
+    SUB_NESTED_SEQ = 'sub_nested_seq'
     CLIP_LAYER = 'clip'
 
     @staticmethod
@@ -6088,6 +6090,53 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
         size=l.config.size)
 
 
+@wrap_name_default()
+@layer_support()
+def sub_nested_seq_layer(input, selected_indices, name=None):
+    """
+    The sub_nested_seq_layer accepts two inputs: the first one is a nested
+    sequence; the second one is a set of selceted indices in the nested sequence.
+
+    Then sub_nest_seq_layer trims the first nested sequence input according
+    to the selected indices to form a new output. This layer is useful in
+    beam training.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        sub_nest_seq = sub_nested_seq_layer(input=[data, selected_indices])
+
+
+    :param input: A nested sequence.
+    :type input: LayerOutput
+    :param selected_indices: a set of sequence indices in the nested sequence.
+    :type input: LayerOutput
+    :param name: name of this layer.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of '
+        'sub_nested_seq_layer must be a Paddle layer.')
+    assert isinstance(selected_indices, LayerOutput), (
+        'The second input of '
+        'sub_nested_seq_layer must be a Paddle layer.')
+
+    l = Layer(
+        inputs=input.name,
+        selected_indices=selected_indices.name,
+        name=name,
+        type=LayerType.SUB_NESTED_SEQ)
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.SUB_NESTED_SEQ,
+        parents=input,
+        size=l.config.size)
+
+
 @wrap_name_default("clip")
 def clip_layer(input, min, max, name=None):
     """
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 0ffa58bc1e208..a627369275112 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -7,6 +7,7 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
-test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer)
+test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
+test_seq_select_layers)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr
new file mode 100644
index 0000000000000..4b906b113e3c0
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_select_layers.protostr
@@ -0,0 +1,37 @@
+type: "nn"
+layers {
+  name: "input_seq"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "input"
+  type: "data"
+  size: 5
+  active_type: ""
+}
+layers {
+  name: "__sub_nested_seq_layer_0__"
+  type: "sub_nested_seq"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input_seq"
+  }
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input_seq"
+output_layer_names: "__sub_nested_seq_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input_seq"
+  layer_names: "input"
+  layer_names: "__sub_nested_seq_layer_0__"
+  input_layer_names: "input_seq"
+  output_layer_names: "__sub_nested_seq_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py
new file mode 100644
index 0000000000000..6d1c3175ba980
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_select_layers.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+#coding=utf-8
+from paddle.trainer_config_helpers import *
+
+beam_size = 5
+
+data = data_layer(name='input_seq', size=300)
+selected_ids = data_layer(name='input', size=beam_size)
+sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
+
+outputs(sub_nest_seq)