diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index ea3564f16a..9076f6e1f1 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -23,6 +23,7 @@ Added
 - Improve efficiency of parallelization with max_memory_mb a new parameter of backend_opts (#61)
 - Add optimized mcx, mcy, mcz, mcu1, mcu2, mcu3, gates to QubitVector (#124)
 - Add optimized controlled-swap gate to QubitVector
+- Add gate-fusion optimization for QasmContoroller, which is enabled by setting fusion_enable=true (#136)
 
 Changed
 -------
diff --git a/src/base/controller.hpp b/src/base/controller.hpp
index c4b56fd903..3dbac7f835 100755
--- a/src/base/controller.hpp
+++ b/src/base/controller.hpp
@@ -186,7 +186,10 @@ class Controller {
   //-------------------------------------------------------------------------
 
   // Generate an equivalent circuit with input_circ as output_circ.
-  Circuit optimize_circuit(const Circuit &input_circ) const;
+  template <class state_t>
+  Circuit optimize_circuit(const Circuit &input_circ,
+                           state_t& state,
+                           OutputData &data) const;
 
   //-----------------------------------------------------------------------
   // Config
@@ -271,6 +274,9 @@ void Controller::set_config(const json_t &config) {
     max_memory_mb_ = system_memory_mb / 2;
   }
 
+  for (std::shared_ptr<CircuitOptimization> opt: optimizations_)
+    opt->set_config(config_);
+
   std::string path;
   JSON::get_value(path, "library_dir", config);
   // Fix for MacOS and OpenMP library double initialization crash.
@@ -429,12 +435,19 @@ bool Controller::validate_memory_requirements(state_t &state,
 //-------------------------------------------------------------------------
 // Circuit optimization
 //-------------------------------------------------------------------------
-
-Circuit Controller::optimize_circuit(const Circuit &input_circ) const {
+template <class state_t>
+Circuit Controller::optimize_circuit(const Circuit &input_circ,
+                                     state_t& state,
+                                     OutputData &data) const {
 
   Circuit working_circ = input_circ;
+  Operations::OpSet allowed_opset;
+  allowed_opset.optypes = state.allowed_ops();
+  allowed_opset.gates = state.allowed_gates();
+  allowed_opset.snapshots = state.allowed_snapshots();
+
   for (std::shared_ptr<CircuitOptimization> opt: optimizations_)
-    opt->optimize_circuit(working_circ);
+    opt->optimize_circuit(working_circ, allowed_opset, data);
 
   return working_circ;
 }
diff --git a/src/framework/circuitopt.hpp b/src/framework/circuitopt.hpp
index 0eb8fd8d38..1b97b05322 100644
--- a/src/framework/circuitopt.hpp
+++ b/src/framework/circuitopt.hpp
@@ -24,8 +24,11 @@ namespace AER {
 
 class CircuitOptimization {
 public:
-  virtual void optimize_circuit(Circuit& circ) const = 0;
-  void set_config(const json_t &config);
+  virtual void optimize_circuit(Circuit& circ,
+                                const Operations::OpSet &opset,
+                                OutputData &data) const = 0;
+
+  virtual void set_config(const json_t &config);
 
 protected:
   json_t config_;
diff --git a/src/framework/operations.hpp b/src/framework/operations.hpp
index 00d8620536..6c97d0b5bd 100755
--- a/src/framework/operations.hpp
+++ b/src/framework/operations.hpp
@@ -29,7 +29,7 @@ enum class RegComparison {Equal, NotEqual, Less, LessEqual, Greater, GreaterEqua
 // Enum class for operation types
 enum class OpType {
   gate, measure, reset, bfunc, barrier, snapshot,
-  matrix, kraus, roerror, noise_switch, initialize
+  matrix, matrix_sequence, kraus, roerror, noise_switch, initialize
 };
 
 std::ostream& operator<<(std::ostream& stream, const OpType& type) {
@@ -55,6 +55,9 @@ std::ostream& operator<<(std::ostream& stream, const OpType& type) {
   case OpType::matrix:
     stream << "matrix";
     break;
+  case OpType::matrix_sequence:
+    stream << "matrix_sequence";
+    break;
   case OpType::kraus:
     stream << "kraus";
     break;
@@ -83,6 +86,7 @@ struct Op {
   OpType type;                    // operation type identifier
   std::string name;               // operation name
   reg_t qubits;                   //  qubits operation acts on
+  std::vector<reg_t> regs;        //  list of qubits for matrixes
   std::vector<complex_t> params;  // real or complex params for gates
   std::vector<std::string> string_params; // used or snapshot label, and boolean functions
 
@@ -369,6 +373,17 @@ inline Op make_mat(const reg_t &qubits, const cmatrix_t &mat, std::string label
   return op;
 }
 
+inline Op make_matrix_sequence(const std::vector<reg_t> &regs, const std::vector<cmatrix_t> &mats, std::string label = "") {
+  Op op;
+  op.type = OpType::matrix_sequence;
+  op.name = "matrix_sequence";
+  op.regs = regs;
+  op.mats = mats;
+  if (label != "")
+    op.string_params = {label};
+  return op;
+}
+
 template <typename T> // real or complex numeric type
 inline Op make_u1(uint_t qubit, T lam) {
   Op op;
@@ -431,7 +446,9 @@ inline Op make_roerror(const reg_t &memory, const std::vector<rvector_t> &probs)
 
 // Main JSON deserialization functions
 Op json_to_op(const json_t &js); // Patial TODO
+json_t op_to_json(const Op &op); // Patial TODO
 inline void from_json(const json_t &js, Op &op) {op = json_to_op(js);}
+inline void to_json(json_t &js, const Op &op) { js = op_to_json(op);}
 
 // Standard operations
 Op json_to_op_gate(const json_t &js);
@@ -496,6 +513,26 @@ Op json_to_op(const json_t &js) {
   return json_to_op_gate(js);
 }
 
+json_t op_to_json(const Op &op) {
+  json_t ret;
+  ret["name"] = op.name;
+  if (!op.qubits.empty())
+    ret["qubits"] = op.qubits;
+  if (!op.regs.empty())
+    ret["regs"] = op.regs;
+  if (!op.params.empty())
+    ret["params"] = op.params;
+  if (op.conditional)
+    ret["conditional"] = op.conditional_reg;
+  if (!op.memory.empty())
+    ret["memory"] = op.memory;
+  if (!op.registers.empty())
+    ret["register"] = op.registers;
+  if (!op.mats.empty())
+    ret["mats"] = op.mats;
+  return ret;
+}
+
 
 //------------------------------------------------------------------------------
 // Implementation: Gates, measure, reset deserialization
diff --git a/src/simulators/qasm/basic_optimization.hpp b/src/simulators/qasm/basic_optimization.hpp
index 8934b97de1..c9a70e46d7 100644
--- a/src/simulators/qasm/basic_optimization.hpp
+++ b/src/simulators/qasm/basic_optimization.hpp
@@ -12,16 +12,26 @@
 
 namespace AER {
 
+using uint_t = uint_t;
+using op_t = Operations::Op;
+using optype_t = Operations::OpType;
+using oplist_t = std::vector<op_t>;
+using reg_t = std::vector<uint_t>;
+
 class ReduceNop : public CircuitOptimization {
 public:
-  void optimize_circuit(Circuit& circ) const;
+  void optimize_circuit(Circuit& circ,
+                        const Operations::OpSet &opset,
+                        OutputData &data) const override;
 };
 
-void ReduceNop::optimize_circuit(Circuit& circ) const {
+void ReduceNop::optimize_circuit(Circuit& circ,
+                                 const Operations::OpSet &allowed_opset,
+                                 OutputData &data) const {
 
-  std::vector<Operations::Op>::iterator it = circ.ops.begin();
+  oplist_t::iterator it = circ.ops.begin();
   while (it != circ.ops.end()) {
-    if (it->type == Operations::OpType::barrier)
+    if (it->type == optype_t::barrier)
       it = circ.ops.erase(it);
     else
       ++it;
@@ -30,19 +40,429 @@ void ReduceNop::optimize_circuit(Circuit& circ) const {
 
 class Debug : public CircuitOptimization {
 public:
-  void optimize_circuit(Circuit& circ) const;
+  void optimize_circuit(Circuit& circ,
+                        const Operations::OpSet &opset,
+                        OutputData &data) const override;
 };
 
-void Debug::optimize_circuit(Circuit& circ) const {
+void Debug::optimize_circuit(Circuit& circ,
+                             const Operations::OpSet &allowed_opset,
+                             OutputData &data) const {
 
-  std::vector<Operations::Op>::iterator it = circ.ops.begin();
+  oplist_t::iterator it = circ.ops.begin();
   while (it != circ.ops.end()) {
     std::clog << it->name << std::endl;
     ++it;
   }
 }
 
+class Fusion : public CircuitOptimization {
+public:
+  Fusion(uint_t max_qubit = 5, uint_t threshold = 16, double cost_factor = 2.5);
+
+  void set_config(const json_t &config) override;
+
+  void optimize_circuit(Circuit& circ,
+                        const Operations::OpSet &opset,
+                        OutputData &data) const override;
+
+  bool can_ignore(const op_t& op) const;
+
+  bool can_apply_fusion(const op_t& op) const;
+
+  oplist_t aggregate(const oplist_t& buffer) const;
+
+  void swap_cols_and_rows(const uint_t idx1,
+                          const uint_t idx2,
+                          cmatrix_t &mat,
+                          uint_t dim) const;
+
+  cmatrix_t sort_matrix(const reg_t &src,
+                        const reg_t &sorted,
+                        const cmatrix_t &mat) const;
+
+  double estimate_cost(const oplist_t& ops,
+                       const uint_t from,
+                       const uint_t until) const;
+
+  void add_fusion_qubits(reg_t& fusion_qubits, const op_t& op) const;
+
+  cmatrix_t matrix(const op_t& op) const;
+
+#ifdef DEBUG
+  void dump(const Circuit& circuit) const;
+#endif
+
+  const static std::vector<std::string> supported_gates;
+
+private:
+  uint_t max_qubit_;
+  uint_t threshold_;
+  const double cost_factor_;
+  bool verbose_;
+  bool active_;
+};
+
+const std::vector<std::string> Fusion::supported_gates({
+  "id",   // Pauli-Identity gate
+  "x",    // Pauli-X gate
+  "y",    // Pauli-Y gate
+  "z",    // Pauli-Z gate
+  "s",    // Phase gate (aka sqrt(Z) gate)
+  "sdg",  // Conjugate-transpose of Phase gate
+  "h",    // Hadamard gate (X + Z / sqrt(2))
+  "t",    // T-gate (sqrt(S))
+  "tdg",  // Conjguate-transpose of T gate
+  // Waltz Gates
+  "u0",   // idle gate in multiples of X90
+  "u1",   // zero-X90 pulse waltz gate
+  "u2",   // single-X90 pulse waltz gate
+  "u3",   // two X90 pulse waltz gate
+  "U",    // two X90 pulse waltz gate
+  // Two-qubit gates
+  "CX",   // Controlled-X gate (CNOT)
+  "cx",   // Controlled-X gate (CNOT)
+  "cz",   // Controlled-Z gate
+  "swap" // SWAP gate
+  // Three-qubit gates
+  //"ccx"   // Controlled-CX gate (Toffoli): TODO
+});
+
+Fusion::Fusion(uint_t max_qubit, uint_t threshold, double cost_factor):
+    max_qubit_(max_qubit), threshold_(threshold), cost_factor_(cost_factor),
+    verbose_(false), active_(false) {
+}
+
+void Fusion::set_config(const json_t &config) {
+
+  CircuitOptimization::set_config(config);
+
+  if (JSON::check_key("fusion_verbose", config_))
+    JSON::get_value(verbose_, "fusion_verbose", config_);
+
+  if (JSON::check_key("fusion_enable", config_))
+    JSON::get_value(active_, "fusion_enable", config_);
+
+  if (JSON::check_key("fusion_max_qubit", config_))
+    JSON::get_value(max_qubit_, "fusion_max_qubit", config_);
+
+  if (JSON::check_key("fusion_threshold", config_))
+    JSON::get_value(threshold_, "fusion_threshold", config_);
+
+}
+
+
+#ifdef DEBUG
+void Fusion::dump(const Circuit& circuit) const {
+  int idx = 0;
+  for (const Operations::Op& op : circuit.ops) {
+    std::cout << "  " << idx++ << ":\t" << op.name << " " << op.qubits << std::endl;
+    for (const cmatrix_t&  mat: op.mats) {
+      const uint_t row = mat.GetRows();
+      const uint_t column = mat.GetColumns();
+      for (uint_t i = 0; i < row; ++i) {
+        for (uint_t j = 0; j < column; ++j) {
+          if (j == 0) std::cout << "      ";
+          else std::cout << ", ";
+          std::cout << mat(i, j);
+        }
+        std::cout << std::endl;
+      }
+    }
+  }
+}
+#endif
+
+void Fusion::optimize_circuit(Circuit& circ,
+                              const Operations::OpSet &allowed_opset,
+                              OutputData &data) const {
+
+  if (circ.num_qubits < threshold_
+      || !active_
+      || allowed_opset.optypes.find(Operations::OpType::matrix_sequence) == allowed_opset.optypes.end())
+    return;
+
+  bool ret = false;
+
+  oplist_t optimized_ops;
+
+  optimized_ops.clear();
+  oplist_t buffer;
+
+  int idx = 0;
+  for (const op_t op: circ.ops) {
+    if (can_ignore(op))
+      continue;
+    if (!can_apply_fusion(op)) {
+      if (!buffer.empty()) {
+        ret = true;
+        oplist_t optimized_buffer = aggregate(buffer);
+        optimized_ops.insert(optimized_ops.end(), optimized_buffer.begin(), optimized_buffer.end());
+        buffer.clear();
+      }
+      optimized_ops.push_back(op);
+    } else {
+      buffer.push_back(op);
+    }
+  }
+
+  if (!buffer.empty()) {
+    oplist_t optimized_buffer = aggregate(buffer);
+    optimized_ops.insert(optimized_ops.end(), optimized_buffer.begin(), optimized_buffer.end());
+    ret = true;
+  }
+
+  circ.ops = optimized_ops;
+
+  if (verbose_) {
+    data.add_additional_data("metadata",
+                             json_t::object({{"fusion_verbose", optimized_ops}}));
+  }
+
+#ifdef DEBUG
+  dump(optimized_ops);
+#endif
+}
+
+bool Fusion::can_ignore(const op_t& op) const {
+  switch (op.type) {
+  case optype_t::barrier:
+    return true;
+  case optype_t::gate:
+    return op.name == "id" || op.name == "u0";
+  default:
+    return false;
+  }
+}
+
+bool Fusion::can_apply_fusion(const op_t& op) const {
+  if (op.conditional)
+    return false;
+  switch (op.type) {
+  case optype_t::barrier:
+    return false;
+  case optype_t::matrix:
+    return op.mats.size() == 1 && op.mats[0].size() <= 4;
+  case optype_t::gate:
+    return (std::find(supported_gates.begin(), supported_gates.end(), op.name) != supported_gates.end());
+  case optype_t::reset:
+  case optype_t::measure:
+  case optype_t::bfunc:
+  case optype_t::roerror:
+  case optype_t::snapshot:
+  case optype_t::kraus:
+  default:
+    return false;
+  }
+}
+
+oplist_t Fusion::aggregate(const oplist_t& original) const {
+
+  // costs[i]: estimated cost to execute from 0-th to i-th in original.ops
+  std::vector<double> costs;
+  // fusion_to[i]: best path to i-th in original.ops
+  std::vector<int> fusion_to;
+
+  int applied_total = 0;
+  // calculate the minimal path to each operation in the circuit
+  for (int i = 0; i < original.size(); ++i) {
+    bool applied = false;
+
+    // first, fusion from i-th to i-th
+    fusion_to.push_back(i);
+
+    // calculate the initial cost from i-th to i-th
+    if (i == 0) {
+      // if this is the first op, no fusion
+      costs.push_back(cost_factor_);
+      continue;
+    }
+    // otherwise, i-th cost is calculated from (i-1)-th cost
+    costs.push_back(costs[i - 1] + cost_factor_);
+
+    for (uint_t num_fusion = 2; num_fusion <= max_qubit_; ++num_fusion) {
+      // calculate cost if {num_fusion}-qubit fusion is applied
+      reg_t fusion_qubits;
+      add_fusion_qubits(fusion_qubits, original[i]);
+
+      for (int j = i - 1; j >= 0; --j) {
+        add_fusion_qubits(fusion_qubits, original[j]);
+
+        if (fusion_qubits.size() > num_fusion) // exceed the limit of fusion
+          break;
+
+        // calculate a new cost of (i-th) by adding
+        double estimated_cost = estimate_cost(original, (uint_t) j, i) // fusion gate from j-th to i-th, and
+            + (j == 0 ? 0.0 : costs[j - 1]); // cost of (j-1)-th
+
+        // update cost
+        if (estimated_cost < costs[i]) {
+          costs[i] = estimated_cost;
+          fusion_to[i] = j;
+          applied = true;
+        }
+      }
+    }
+    if (applied)
+      ++applied_total;
+  }
+
+  if (applied_total / static_cast<double> (original.size()) < 0.25)
+    return original;
+
+  // generate a new circuit with the minimal path to the last operation in the circuit
+  oplist_t optimized;
+
+  for (int i = original.size() - 1; i >= 0;) {
+    int to = fusion_to[i];
+
+    if (to == i) {
+      optimized.push_back(original[i]);
+    } else {
+      std::vector<reg_t> regs;
+      std::vector<cmatrix_t> mats;
+      for (int j = to; j <= i; ++j) {
+        regs.push_back(original[j].qubits);
+        mats.push_back(matrix(original[j]));
+      }
+      optimized.push_back(Operations::make_matrix_sequence(regs, mats));
+    }
+    i = to - 1;
+  }
+
+  std::reverse(optimized.begin(), optimized.end());
+
+  return optimized;
+}
+
+//------------------------------------------------------------------------------
+// Gate-swap optimized helper functions
+//------------------------------------------------------------------------------
+void Fusion::swap_cols_and_rows(const uint_t idx1, const uint_t idx2,
+                                cmatrix_t &mat, uint_t dim) const {
+
+  uint_t mask1 = (1UL << idx1);
+  uint_t mask2 = (1UL << idx2);
+
+  for (uint_t first = 0; first < dim; ++first) {
+    if ((first & mask1) && !(first & mask2)) {
+      uint_t second = (first ^ mask1) | mask2;
+
+      for (uint_t i = 0; i < dim; ++i) {
+        complex_t cache = mat(first, i);
+        mat(first, i) = mat(second, i);
+        mat(second, i) = cache;
+      }
+      for (uint_t i = 0; i < dim; ++i) {
+        complex_t cache = mat(i, first);
+        mat(i, first) = mat(i, second);
+        mat(i, second) = cache;
+      }
+    }
+  }
+}
+
+cmatrix_t Fusion::sort_matrix(const reg_t &src,
+                              const reg_t &sorted,
+                              const cmatrix_t &mat) const {
+
+  const uint_t dim = mat.GetRows();
+  auto ret = mat;
+  auto current = src;
 
+  while (current != sorted) {
+    uint_t from;
+    uint_t to;
+    for (from = 0; from < current.size(); ++from)
+      if (current[from] != sorted[from])
+        break;
+    if (from == current.size())
+      break;
+    for (to = from + 1; to < current.size(); ++to)
+      if (current[from] == sorted[to])
+        break;
+    if (to == current.size()) {
+      std::stringstream ss;
+      ss << "Fusion::sort_matrix we should not reach here";
+      throw std::runtime_error(ss.str());
+    }
+    swap_cols_and_rows(from, to, ret, dim);
+
+    uint_t cache = current[from];
+    current[from] = current[to];
+    current[to] = cache;
+  }
+
+  return ret;
+}
+
+double Fusion::estimate_cost(const std::vector<op_t>& ops,
+                             const uint_t from,
+                             const uint_t until) const {
+  reg_t fusion_qubits;
+  for (uint_t i = from; i <= until; ++i)
+    add_fusion_qubits(fusion_qubits, ops[i]);
+  return pow(cost_factor_, (double) fusion_qubits.size());
+}
+
+void Fusion::add_fusion_qubits(reg_t& fusion_qubits, const op_t& op) const {
+  for (uint_t i = 0; i < op.qubits.size(); ++i)
+    if (find(fusion_qubits.begin(), fusion_qubits.end(), op.qubits[i]) == fusion_qubits.end())
+      fusion_qubits.push_back(op.qubits[i]);
+}
+
+cmatrix_t Fusion::matrix(const op_t& op) const {
+  if (op.type == optype_t::gate) {
+    if (op.name == "id") {   // Pauli-Identity gate
+      return Utils::Matrix::I;
+    } else if (op.name == "x") {    // Pauli-X gate
+      return Utils::Matrix::X;
+    } else if (op.name == "y") {    // Pauli-Y gate
+      return Utils::Matrix::Y;
+    } else if (op.name == "z") {    // Pauli-Z gate
+      return Utils::Matrix::Z;
+    } else if (op.name == "s") {    // Phase gate (aka sqrt(Z) gate)
+      return Utils::Matrix::S;
+    } else if (op.name == "sdg") {  // Conjugate-transpose of Phase gate
+      return Utils::Matrix::Sdg;
+    } else if (op.name == "h") {    // Hadamard gate (X + Z / sqrt(2))
+      return Utils::Matrix::H;
+    } else if (op.name == "t") {    // T-gate (sqrt(S))
+      return Utils::Matrix::T;
+    } else if (op.name == "tdg") {  // Conjguate-transpose of T gate
+      return Utils::Matrix::Tdg;
+    } else if (op.name == "u0") {   // idle gate in multiples of X90
+      return Utils::Matrix::I;
+    } else if (op.name == "u1") {   // zero-X90 pulse waltz gate
+      return Utils::make_matrix<complex_t>( {
+        { {1, 0}, {0, 0} },
+        { {0, 0}, std::exp( complex_t(0, 1.) * std::real(op.params[0])) }}
+      );
+    } else if (op.name == "u2") {   // single-X90 pulse waltz gate
+      return Utils::Matrix::U3( M_PI / 2., std::real(op.params[0]), std::real(op.params[1]));
+    } else if (op.name == "u3" || op.name == "U") {   // two X90 pulse waltz gate
+      return Utils::Matrix::U3( std::real(op.params[0]), std::real(op.params[1]), std::real(op.params[2]));
+    // Two-qubit gates
+    } else if (op.name == "CX" || op.name == "cx") {   // Controlled-X gate (CNOT)
+      return Utils::Matrix::CX;
+    } else if (op.name == "cz") {   // Controlled-Z gate
+      return Utils::Matrix::CZ;
+    } else if (op.name == "swap") { // SWAP gate
+      return Utils::Matrix::SWAP;
+    // Three-qubit gates
+//    } else if (op.name == "ccx") {   // Controlled-CX gate (Toffoli)
+//      return Utils::Matrix::CCX;
+    } else {
+      std::stringstream msg;
+      msg << "invalid operation:" << op.name << "\'.matrix()";
+      throw std::runtime_error(msg.str());
+    }
+  } else if (op.type == optype_t::matrix) {
+    return op.mats[0];
+  } else {
+    throw std::runtime_error("Fusion: unexpected operation type");
+  }
+}
 //-------------------------------------------------------------------------
 } // end namespace AER
 //-------------------------------------------------------------------------
diff --git a/src/simulators/qasm/qasm_controller.hpp b/src/simulators/qasm/qasm_controller.hpp
index 237c7cd944..6c1bd835b1 100755
--- a/src/simulators/qasm/qasm_controller.hpp
+++ b/src/simulators/qasm/qasm_controller.hpp
@@ -235,6 +235,7 @@ class QasmController : public Base::Controller {
 //-------------------------------------------------------------------------
 QasmController::QasmController() {
   add_circuit_optimization(ReduceNop());
+  add_circuit_optimization(Fusion());
 }
 
 //-------------------------------------------------------------------------
@@ -476,7 +477,7 @@ void QasmController::run_circuit_with_noise(const Circuit &circ,
   // Sample a new noise circuit and optimize for each shot
   while(shots-- > 0) {
     Circuit noise_circ = noise_model_.sample_noise(circ, rng);
-    noise_circ = optimize_circuit(noise_circ);
+    noise_circ = optimize_circuit(noise_circ, state, data);
     run_single_shot(noise_circ, state, initial_state, data, rng);
   }                                   
 }
@@ -491,7 +492,7 @@ void QasmController::run_circuit_without_noise(const Circuit &circ,
                                                RngEngine &rng) const {
   // Optimize circuit for state type
   Circuit opt_circ;
-  opt_circ = optimize_circuit(circ);
+  opt_circ = optimize_circuit(circ, state, data);
 
   // Check if measure sampler and optimization are valid
   auto check = check_measure_sampling_opt(opt_circ);
diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp
index 89293c833d..4cdd69eedc 100755
--- a/src/simulators/statevector/qubitvector.hpp
+++ b/src/simulators/statevector/qubitvector.hpp
@@ -238,6 +238,11 @@ class QubitVector {
   // The matrix is input as vector of the column-major vectorized N-qubit matrix.
   void apply_matrix(const reg_t &qubits, const cvector_t &mat);
 
+  // Apply a N-qubit matrix constructed from composition of 1 and 2 qubit matrices.
+  // The sets of qubits and matrices are passed as vectors, where each individual matrix
+  // is input as a column-major vectorized matrix.
+  void apply_matrix_sequence(const std::vector<reg_t> &regs, const std::vector<cvector_t> &mats);
+
   // Apply a 1-qubit diagonal matrix to the state vector.
   // The matrix is input as vector of the matrix diagonal.
   void apply_diagonal_matrix(const uint_t qubit, const cvector_t &mat);
@@ -372,12 +377,6 @@ class QubitVector {
   // Optimization configuration settings
   //-----------------------------------------------------------------------
 
-  // Enable sorted qubit matrix gate optimization (Default disabled)
-  void enable_gate_opt() {gate_opt_ = true;}
-
-  // Disable sorted qubit matrix gate optimization
-  void disable_gate_opt() {gate_opt_ = false;}
-
   // Set the sample_measure index size
   void set_sample_measure_index_size(int n) {sample_measure_index_size_ = n;}
 
@@ -400,7 +399,6 @@ class QubitVector {
   uint_t omp_threads_ = 1;     // Disable multithreading by default
   uint_t omp_threshold_ = 13;  // Qubit threshold for multithreading when enabled
   int sample_measure_index_size_ = 10; // Sample measure indexing qubit size
-  bool gate_opt_ = false;      // enable large-qubit optimized gates
   double json_chop_threshold_ = 0;  // Threshold for choping small values
                                     // in JSON serialization
 
@@ -520,17 +518,6 @@ class QubitVector {
                                    const list_t &qubits,
                                    const param_t &params) const;
 
-  //-----------------------------------------------------------------------
-  // Optimized matrix multiplication
-  //-----------------------------------------------------------------------
-  
-  // Optimized implementations
-  void apply_matrix2(const reg_t &qubits, const cvector_t &mat);
-  void apply_matrix3(const reg_t &qubits, const cvector_t &mat);
-  void apply_matrix4(const reg_t &qubits, const cvector_t &mat);
-  void apply_matrix5(const reg_t &qubits, const cvector_t &mat);
-  void apply_matrix6(const reg_t &qubits, const cvector_t &mat);
-
   // Permute an N-qubit vectorized matrix to match a reordering of qubits
   cvector_t sort_matrix(const reg_t &src,
                         const reg_t &sorted,
@@ -539,6 +526,12 @@ class QubitVector {
   // Swap cols and rows of vectorized matrix
   void swap_cols_and_rows(const uint_t idx1, const uint_t idx2,
                           cvector_t &mat, uint_t dim) const;
+
+  // Extend a matrix for qubits (src_qubits) to a matrix for more qubits (dst_sorted_qubits)
+  // qubits in dst_sorted_qubits must contains all the qubits in src_qubits
+  cvector_t expand_matrix(const reg_t& src_qubits,
+                          const reg_t& dst_sorted_qubits,
+                          const cvector_t& vmat) const;
 };
 
 /*******************************************************************************
@@ -1086,32 +1079,6 @@ void QubitVector<data_t>::apply_matrix(const reg_t &qubits,
   check_vector(mat, 2 * N);
   #endif
 
-  // Matrix-swap based optimized 2-6 qubit implementation
-  if (gate_opt_ && N <= 6) {
-    switch (N) {
-      case 1:
-        apply_matrix(qubits[0], mat);
-        return;
-      case 2:
-        apply_matrix2(qubits, mat);
-        return;
-      case 3:
-        apply_matrix3(qubits, mat);
-        return;
-      case 4:
-        apply_matrix4(qubits, mat);
-        return;
-      case 5:
-        apply_matrix5(qubits, mat);
-        return;
-      case 6:
-        apply_matrix6(qubits, mat);
-        return;
-    default:
-      break;
-    } // end switch
-  }
-  
   // Static array optimized lambda functions
   switch (N) {
     case 1:
@@ -1188,6 +1155,69 @@ void QubitVector<data_t>::apply_matrix(const reg_t &qubits,
   } // end switch
 }
 
+template <typename data_t>
+void QubitVector<data_t>::apply_matrix_sequence(const std::vector<reg_t> &regs,
+                                         const std::vector<cvector_t> &mats) {
+  if (mats.size() == 0)
+    return;
+
+
+#ifdef DEBUG
+  if (regs.size() != mats.size());
+    throw std::runtime_error("QubitVector<data_t>::apply_matrix_sequence allows same size of qubitss and mats.");
+#endif
+
+  bool at_most_two = true;
+  // check 1 or 2 qubits
+  for (const reg_t& reg: regs) {
+    if (reg.size() > 2) {
+      at_most_two = false;
+      break;
+    }
+  }
+
+  if (!at_most_two) {
+    for (size_t i = 0; i < regs.size(); ++i)
+      apply_matrix(regs[i], mats[i]);
+    return;
+  }
+
+
+  reg_t sorted_qubits;
+  for (const reg_t& reg: regs)
+    for (const uint_t qubit: reg)
+      if (std::find(sorted_qubits.begin(), sorted_qubits.end(), qubit) == sorted_qubits.end())
+        sorted_qubits.push_back(qubit);
+
+  std::sort(sorted_qubits.begin(), sorted_qubits.end());
+
+  std::vector<cvector_t> sorted_mats;
+
+  for (size_t i = 0; i < regs.size(); ++i) {
+    const reg_t& reg = regs[i];
+    const cvector_t& mat = mats[i];
+    sorted_mats.push_back(expand_matrix(reg, sorted_qubits, mat));
+  }
+
+  auto U = sorted_mats[0];
+  const auto dim = BITS[sorted_qubits.size()];
+
+  for (size_t m = 1; m < sorted_mats.size(); m++) {
+
+    cvector_t u_tmp(U.size(), (0., 0.));
+    const cvector_t& u = sorted_mats[m];
+
+    for (size_t i = 0; i < dim; ++i)
+      for (size_t j = 0; j < dim; ++j)
+        for (size_t k = 0; k < dim; ++k)
+          u_tmp[i + j * dim] += u[i + k * dim] * U[k + j * dim];
+
+    U = u_tmp;
+  }
+
+  apply_matrix(sorted_qubits, U);
+}
+
 template <typename data_t>
 void QubitVector<data_t>::apply_diagonal_matrix(const reg_t &qubits,
                                                 const cvector_t &diag) {
@@ -1719,380 +1749,6 @@ void QubitVector<data_t>::apply_diagonal_matrix(const uint_t qubit,
   }
 }
 
-//------------------------------------------------------------------------------
-// 2-6 qubit optimized matrices
-//------------------------------------------------------------------------------
-
-template <typename data_t>
-void QubitVector<data_t>::apply_matrix2(const reg_t& qubits, const cvector_t &vmat) {
-  // Check qubits is size.
-  if (qubits.size() != 2) {
-    throw std::runtime_error("QubitVector::apply_matrix2 called for wrong number of qubits");
-  }
-  // Optimized implementation
-  auto sorted_qs = qubits;
-  std::sort(sorted_qs.begin(), sorted_qs.end());
-  auto sorted_vmat = sort_matrix(qubits, sorted_qs, vmat);
-
-  int_t END = data_size_;
-  int_t step1 = BITS[sorted_qs[0]];
-  int_t step2 = BITS[sorted_qs[1]];
-#pragma omp parallel if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) num_threads(omp_threads_)
-  {
-#ifdef _WIN32
-#pragma omp for
-#else
-#pragma omp for collapse(3)
-#endif
-    for (int_t k1 = 0; k1 < END; k1 += (step2 * 2UL)) {
-      for (int_t k2 = 0; k2 < step2; k2 += (step1 * 2UL)) {
-        for (int_t k3 = 0; k3 < step1; k3++) {
-          int_t t0 = k1 | k2 | k3;
-          int_t t1 = t0 | step1;
-          int_t t2 = t0 | step2;
-          int_t t3 = t2 | step1;
-
-          const complex_t psi0 = data_[t0];
-          const complex_t psi1 = data_[t1];
-          const complex_t psi2 = data_[t2];
-          const complex_t psi3 = data_[t3];
-          // data_[t_i] = sum_j mat[i + 4 * j] * psi[j]
-          data_[t0] = psi0 * sorted_vmat[0] + psi1 * sorted_vmat[4] + psi2 * sorted_vmat[8] + psi3 * sorted_vmat[12];
-          data_[t1] = psi0 * sorted_vmat[1] + psi1 * sorted_vmat[5] + psi2 * sorted_vmat[9] + psi3 * sorted_vmat[13];
-          data_[t2] = psi0 * sorted_vmat[2] + psi1 * sorted_vmat[6] + psi2 * sorted_vmat[10] + psi3 * sorted_vmat[14];
-          data_[t3] = psi0 * sorted_vmat[3] + psi1 * sorted_vmat[7] + psi2 * sorted_vmat[11] + psi3 * sorted_vmat[15];
-        }
-      }
-    }
-  }
-}
-
-template <typename data_t>
-void QubitVector<data_t>::apply_matrix3(const reg_t& qubits, const cvector_t &vmat) {
-  // Check qubits is size.
-  if (qubits.size() != 3) {
-    throw std::runtime_error("QubitVector::apply_matrix3 called for wrong number of qubits");
-  }
-  // Optimized implementation
-  auto sorted_qs = qubits;
-  std::sort(sorted_qs.begin(), sorted_qs.end());
-  auto sorted_vmat = sort_matrix(qubits, sorted_qs, vmat);
-  const uint_t dim = BITS[3];
-
-  int_t END = data_size_;
-  int_t step1 = BITS[sorted_qs[0]];
-  int_t step2 = BITS[sorted_qs[1]];
-  int_t step3 = BITS[sorted_qs[2]];
-
-  int_t masks[] = {//
-      0, //
-      step1, //
-      step2, //
-      step2 | step1, //
-      step3, //
-      step3 | step1, //
-      step3 | step2, //
-      step3 | step2 | step1 //
-  };
-
-#pragma omp parallel if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) num_threads(omp_threads_)
-  {
-#ifdef _WIN32
-#pragma omp for
-#else
-#pragma omp for collapse(4)
-#endif
-    for (int_t k1 = 0; k1 < END; k1 += (step3 * 2UL)) {
-      for (int_t k2 = 0; k2 < step3; k2 += (step2 * 2UL)) {
-        for (int_t k3 = 0; k3 < step2; k3 += (step1 * 2UL)) {
-          for (int_t k4 = 0; k4 < step1; k4++) {
-            int_t base = k1 | k2 | k3 | k4;
-            complex_t psi[8];
-            for (int_t i = 0; i < 8; ++i) {
-              psi[i] = data_[base | masks[i]];
-              data_[base | masks[i]] = 0.;
-            }
-            for (size_t i = 0; i < 8; ++i)
-              for (size_t j = 0; j < 8; ++j)
-                data_[base | masks[i]] += psi[j] * sorted_vmat[j * dim + i];
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename data_t>
-void QubitVector<data_t>::apply_matrix4(const reg_t& qubits, const cvector_t &vmat) {
-  // Check qubits is size.
-  if (qubits.size() != 4) {
-    throw std::runtime_error("QubitVector::apply_matrix4 called for wrong number of qubits");
-  }
-  // Optimized implementation
-  auto sorted_qs = qubits;
-  std::sort(sorted_qs.begin(), sorted_qs.end());
-  auto sorted_vmat = sort_matrix(qubits, sorted_qs, vmat);
-  const uint_t dim = BITS[4];
-
-  int_t END = data_size_;
-  int_t step1 = BITS[sorted_qs[0]];
-  int_t step2 = BITS[sorted_qs[1]];
-  int_t step3 = BITS[sorted_qs[2]];
-  int_t step4 = BITS[sorted_qs[3]];
-
-  int_t masks[] = {//
-      0, //
-      step1, //
-      step2, //
-      step2 | step1, //
-      step3, //
-      step3 | step1, //
-      step3 | step2, //
-      step3 | step2 | step1, //
-      step4, //
-      step4 | step1, //
-      step4 | step2, //
-      step4 | step2 | step1, //
-      step4 | step3, //
-      step4 | step3 | step1, //
-      step4 | step3 | step2, //
-      step4 | step3 | step2 | step1 //
-  };
-
-#pragma omp parallel if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) num_threads(omp_threads_)
-  {
-#ifdef _WIN32
-#pragma omp for
-#else
-#pragma omp for collapse(5)
-#endif
-    for (int_t k1 = 0; k1 < END; k1 += (step4 * 2UL)) {
-      for (int_t k2 = 0; k2 < step4; k2 += (step3 * 2UL)) {
-        for (int_t k3 = 0; k3 < step3; k3 += (step2 * 2UL)) {
-          for (int_t k4 = 0; k4 < step2; k4 += (step1 * 2UL)) {
-            for (int_t k5 = 0; k5 < step1; k5++) {
-              int_t base = k1 | k2 | k3 | k4 | k5;
-              complex_t psi[16];
-              for (int_t i = 0; i < 16; ++i) {
-                psi[i] = data_[base | masks[i]];
-                data_[base | masks[i]] = 0.;
-              }
-              for (size_t i = 0; i < 16; ++i)
-                for (size_t j = 0; j < 16; ++j)
-                  data_[base | masks[i]] += psi[j] * sorted_vmat[j * dim + i];
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename data_t>
-void QubitVector<data_t>::apply_matrix5(const reg_t &qubits, const cvector_t &vmat) {
-  // Check qubits is size.
-  if (qubits.size() != 5) {
-    throw std::runtime_error("QubitVector::apply_matrix5 called for wrong number of qubits");
-  }
-  // Optimized implementation
-  auto sorted_qs = qubits;
-  std::sort(sorted_qs.begin(), sorted_qs.end());
-  auto sorted_vmat = sort_matrix(qubits, sorted_qs, vmat);
-  const uint_t dim = BITS[5];
-
-  int_t END = data_size_;
-  int_t step1 = BITS[sorted_qs[0]];
-  int_t step2 = BITS[sorted_qs[1]];
-  int_t step3 = BITS[sorted_qs[2]];
-  int_t step4 = BITS[sorted_qs[3]];
-  int_t step5 = BITS[sorted_qs[4]];
-
-  int_t masks[] = {//
-      0, //
-      step1, //
-      step2, //
-      step2 | step1, //
-      step3, //
-      step3 | step1, //
-      step3 | step2, //
-      step3 | step2 | step1, //
-      step4, //
-      step4 | step1, //
-      step4 | step2, //
-      step4 | step2 | step1, //
-      step4 | step3, //
-      step4 | step3 | step1, //
-      step4 | step3 | step2, //
-      step4 | step3 | step2 | step1, //
-      step5, //
-      step5 | step1, //
-      step5 | step2, //
-      step5 | step2 | step1, //
-      step5 | step3, //
-      step5 | step3 | step1, //
-      step5 | step3 | step2, //
-      step5 | step3 | step2 | step1, //
-      step5 | step4, //
-      step5 | step4 | step1, //
-      step5 | step4 | step2, //
-      step5 | step4 | step2 | step1, //
-      step5 | step4 | step3, //
-      step5 | step4 | step3 | step1, //
-      step5 | step4 | step3 | step2, //
-      step5 | step4 | step3 | step2 | step1 //
-  };
-
-#pragma omp parallel if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) num_threads(omp_threads_)
-  {
-#ifdef _WIN32
-#pragma omp for
-#else
-#pragma omp for collapse(6)
-#endif
-    for (int_t k1 = 0; k1 < END; k1 += (step5 * 2UL)) {
-      for (int_t k2 = 0; k2 < step5; k2 += (step4 * 2UL)) {
-        for (int_t k3 = 0; k3 < step4; k3 += (step3 * 2UL)) {
-          for (int_t k4 = 0; k4 < step3; k4 += (step2 * 2UL)) {
-            for (int_t k5 = 0; k5 < step2; k5 += (step1 * 2UL)) {
-              for (int_t k6 = 0; k6 < step1; k6++) {
-                int_t base = k1 | k2 | k3 | k4 | k5 | k6;
-                complex_t psi[32];
-                for (int_t i = 0; i < 32; ++i) {
-                  psi[i] = data_[base | masks[i]];
-                  data_[base | masks[i]] = 0.;
-                }
-                for (size_t i = 0; i < 32; ++i)
-                  for (size_t j = 0; j < 32; ++j)
-                    data_[base | masks[i]] += psi[j] * sorted_vmat[j * dim + i];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-template <typename data_t>
-void QubitVector<data_t>::apply_matrix6(const reg_t &qubits, const cvector_t &vmat) {
-  // Check qubits is size.
-  if (qubits.size() != 6) {
-    throw std::runtime_error("QubitVector::apply_matrix6 called for wrong number of qubits");
-  }
-  // Optimized implementation
-  auto sorted_qs = qubits;
-  std::sort(sorted_qs.begin(), sorted_qs.end());
-  auto sorted_vmat = sort_matrix(qubits, sorted_qs, vmat);
-  const uint_t dim = BITS[6];
-
-  int_t END = data_size_;
-  int_t step1 = BITS[sorted_qs[0]];
-  int_t step2 = BITS[sorted_qs[1]];
-  int_t step3 = BITS[sorted_qs[2]];
-  int_t step4 = BITS[sorted_qs[3]];
-  int_t step5 = BITS[sorted_qs[4]];
-  int_t step6 = BITS[sorted_qs[5]];
-
-  int_t masks[] = {//
-      0, //
-      step1, //
-      step2, //
-      step2 | step1, //
-      step3, //
-      step3 | step1, //
-      step3 | step2, //
-      step3 | step2 | step1, //
-      step4, //
-      step4 | step1, //
-      step4 | step2, //
-      step4 | step2 | step1, //
-      step4 | step3, //
-      step4 | step3 | step1, //
-      step4 | step3 | step2, //
-      step4 | step3 | step2 | step1, //
-      step5, //
-      step5 | step1, //
-      step5 | step2, //
-      step5 | step2 | step1, //
-      step5 | step3, //
-      step5 | step3 | step1, //
-      step5 | step3 | step2, //
-      step5 | step3 | step2 | step1, //
-      step5 | step4, //
-      step5 | step4 | step1, //
-      step5 | step4 | step2, //
-      step5 | step4 | step2 | step1, //
-      step5 | step4 | step3, //
-      step5 | step4 | step3 | step1, //
-      step5 | step4 | step3 | step2, //
-      step5 | step4 | step3 | step2 | step1, //
-      step6, //
-      step6 | step1, //
-      step6 | step2, //
-      step6 | step2 | step1, //
-      step6 | step3, //
-      step6 | step3 | step1, //
-      step6 | step3 | step2, //
-      step6 | step3 | step2 | step1, //
-      step6 | step4, //
-      step6 | step4 | step1, //
-      step6 | step4 | step2, //
-      step6 | step4 | step2 | step1, //
-      step6 | step4 | step3, //
-      step6 | step4 | step3 | step1, //
-      step6 | step4 | step3 | step2, //
-      step6 | step4 | step3 | step2 | step1, //
-      step6 | step5, //
-      step6 | step5 | step1, //
-      step6 | step5 | step2, //
-      step6 | step5 | step2 | step1, //
-      step6 | step5 | step3, //
-      step6 | step5 | step3 | step1, //
-      step6 | step5 | step3 | step2, //
-      step6 | step5 | step3 | step2 | step1, //
-      step6 | step5 | step4, //
-      step6 | step5 | step4 | step1, //
-      step6 | step5 | step4 | step2, //
-      step6 | step5 | step4 | step2 | step1, //
-      step6 | step5 | step4 | step3, //
-      step6 | step5 | step4 | step3 | step1, //
-      step6 | step5 | step4 | step3 | step2, //
-      step6 | step5 | step4 | step3 | step2 | step1 //
-  };
-
-#pragma omp parallel if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) num_threads(omp_threads_)
-  {
-#ifdef _WIN32
-#pragma omp for
-#else
-#pragma omp for collapse(7)
-#endif
-    for (int_t k1 = 0; k1 < END; k1 += (step6 * 2UL)) {
-      for (int_t k2 = 0; k2 < step6; k2 += (step5 * 2UL)) {
-        for (int_t k3 = 0; k3 < step5; k3 += (step4 * 2UL)) {
-          for (int_t k4 = 0; k4 < step4; k4 += (step3 * 2UL)) {
-            for (int_t k5 = 0; k5 < step3; k5 += (step2 * 2UL)) {
-              for (int_t k6 = 0; k6 < step2; k6 += (step1 * 2UL)) {
-                for (int_t k7 = 0; k7 < step1; k7++) {
-                  int_t base = k1 | k2 | k3 | k4 | k5 | k6 | k7;
-                  complex_t psi[64];
-                  for (int_t i = 0; i < 64; ++i) {
-                    psi[i] = data_[base | masks[i]];
-                    data_[base | masks[i]] = 0.;
-                  }
-                  for (size_t i = 0; i < 64; ++i)
-                    for (size_t j = 0; j < 64; ++j)
-                      data_[base | masks[i]] += psi[j] * sorted_vmat[j * dim + i];
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
 //------------------------------------------------------------------------------
 // Gate-swap optimized helper functions
 //------------------------------------------------------------------------------
@@ -2154,6 +1810,90 @@ cvector_t QubitVector<data_t>::sort_matrix(const reg_t& src,
   return ret;
 }
 
+template <typename data_t>
+cvector_t QubitVector<data_t>::expand_matrix(const reg_t& src_qubits, const reg_t& dst_sorted_qubits, const cvector_t& vmat) const {
+
+  const auto dst_dim = BITS[dst_sorted_qubits.size()];
+  const auto dst_vmat_size = dst_dim * dst_dim;
+  const auto src_dim = BITS[src_qubits.size()];
+
+  // generate a matrix for op
+  cvector_t u(dst_vmat_size, (.0, .0));
+  std::vector<bool> filled(dst_dim, false);
+
+  if (src_qubits.size() == 1) { //1-qubit operation
+    // 1. identify delta
+    const auto index = std::distance(dst_sorted_qubits.begin(),
+                               std::find(dst_sorted_qubits.begin(), dst_sorted_qubits.end(), src_qubits[0]));
+
+    const auto delta = BITS[index];
+
+    // 2. find vmat(0, 0) position in U
+    for (uint_t i = 0; i < dst_dim; ++i) {
+
+      if (filled[i])
+        continue;
+
+      //  3. allocate op.u to u based on u(i, i) and delta
+      u[i           + (i + 0)     * dst_dim] = vmat[0 + 0  * src_dim];
+      u[i           + (i + delta) * dst_dim] = vmat[0 + 1  * src_dim];
+      u[(i + delta) + (i + 0)     * dst_dim] = vmat[1 + 0  * src_dim];
+      u[(i + delta) + (i + delta) * dst_dim] = vmat[1 + 1  * src_dim];
+      filled[i] = filled[i + delta] = true;
+    }
+  } else if (src_qubits.size() == 2) { //2-qubit operation
+
+    reg_t sorted_src_qubits = src_qubits;
+    std::sort(sorted_src_qubits.begin(), sorted_src_qubits.end());
+    const cvector_t sorted_vmat = sort_matrix(src_qubits, sorted_src_qubits, vmat);
+
+    // 1. identify low and high delta
+    auto low = std::distance(dst_sorted_qubits.begin(),
+                               std::find(dst_sorted_qubits.begin(), dst_sorted_qubits.end(), src_qubits[0]));
+
+    auto high = std::distance(dst_sorted_qubits.begin(),
+                                std::find(dst_sorted_qubits.begin(), dst_sorted_qubits.end(), src_qubits[1]));
+
+    auto low_delta = BITS[low];
+    auto high_delta = BITS[high];
+
+    // 2. find op.u(0, 0) position in U
+    for (uint_t i = 0; i < dst_dim; ++i) {
+      if (filled[i])
+        continue;
+
+      //  3. allocate vmat to u based on u(i, i) and delta
+      u[i                             + (i + 0)                      * dst_dim] = sorted_vmat[0 + 0 * src_dim];
+      u[i                             + (i + low_delta)              * dst_dim] = sorted_vmat[0 + 1 * src_dim];
+      u[i                             + (i + high_delta)             * dst_dim] = sorted_vmat[0 + 2 * src_dim];
+      u[i                             + (i + low_delta + high_delta) * dst_dim] = sorted_vmat[0 + 3 * src_dim];
+      u[(i + low_delta)               + (i + 0)                      * dst_dim] = sorted_vmat[1 + 0 * src_dim];
+      u[(i + low_delta)               + (i + low_delta)              * dst_dim] = sorted_vmat[1 + 1 * src_dim];
+      u[(i + low_delta)               + (i + high_delta)             * dst_dim] = sorted_vmat[1 + 2 * src_dim];
+      u[(i + low_delta)               + (i + low_delta + high_delta) * dst_dim] = sorted_vmat[1 + 3 * src_dim];
+      u[(i + high_delta)              + (i + 0)                      * dst_dim] = sorted_vmat[2 + 0 * src_dim];
+      u[(i + high_delta)              + (i + low_delta)              * dst_dim] = sorted_vmat[2 + 1 * src_dim];
+      u[(i + high_delta)              + (i + high_delta)             * dst_dim] = sorted_vmat[2 + 2 * src_dim];
+      u[(i + high_delta)              + (i + low_delta + high_delta) * dst_dim] = sorted_vmat[2 + 3 * src_dim];
+      u[(i + low_delta + high_delta)  + (i + 0)                      * dst_dim] = sorted_vmat[3 + 0 * src_dim];
+      u[(i + low_delta + high_delta)  + (i + low_delta)              * dst_dim] = sorted_vmat[3 + 1 * src_dim];
+      u[(i + low_delta + high_delta)  + (i + high_delta)             * dst_dim] = sorted_vmat[3 + 2 * src_dim];
+      u[(i + low_delta + high_delta)  + (i + low_delta + high_delta) * dst_dim] = sorted_vmat[3 + 3 * src_dim];
+
+      filled[i] = true;
+      filled[i + low_delta] = true;
+      filled[i + high_delta] = true;
+      filled[i + low_delta + high_delta] = true;
+    }
+    //TODO: } else if (src_qubits.size() == 3) {
+  } else {
+    std::stringstream ss;
+    ss << "Fusion::illegal qubit number:" << src_qubits.size();
+    throw std::runtime_error(ss.str());
+  }
+
+  return u;
+}
 /*******************************************************************************
  *
  * NORMS
diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp
index b3425e2346..9139be9143 100755
--- a/src/simulators/statevector/statevector_state.hpp
+++ b/src/simulators/statevector/statevector_state.hpp
@@ -67,6 +67,7 @@ class State : public Base::State<statevec_t> {
       Operations::OpType::bfunc,
       Operations::OpType::roerror,
       Operations::OpType::matrix,
+      Operations::OpType::matrix_sequence,
       Operations::OpType::kraus
     });
   }
@@ -165,6 +166,9 @@ class State : public Base::State<statevec_t> {
   // Apply a vectorized matrix to given qubits (identity on all other qubits)
   void apply_matrix(const reg_t &qubits, const cvector_t & vmat);
 
+  // Apply multiple gate operations
+  void apply_matrix_sequence(const std::vector<reg_t> &regs, const std::vector<cmatrix_t>& mats);
+
   // Apply a Kraus error operation
   void apply_kraus(const reg_t &qubits,
                    const std::vector<cmatrix_t> &krausops,
@@ -389,12 +393,6 @@ void State<statevec_t>::set_config(const json_t &config) {
   if (JSON::get_value(index_size, "statevector_sample_measure_opt", config)) {
     BaseState::qreg_.set_sample_measure_index_size(index_size);
   };
-
-  // Enable sorted gate optimzations
-  bool gate_opt = false;
-  JSON::get_value(gate_opt, "statevector_gate_opt", config);
-  if (gate_opt)
-    BaseState::qreg_.enable_gate_opt();
 }
 
 
@@ -436,6 +434,9 @@ void State<statevec_t>::apply_ops(const std::vector<Operations::Op> &ops,
       case Operations::OpType::matrix:
         apply_matrix(op.qubits, op.mats[0]);
         break;
+      case Operations::OpType::matrix_sequence:
+        apply_matrix_sequence(op.regs, op.mats);
+        break;
       case Operations::OpType::kraus:
         apply_kraus(op.qubits, op.mats, rng);
         break;
@@ -708,6 +709,20 @@ void State<statevec_t>::apply_matrix(const reg_t &qubits, const cvector_t &vmat)
   }
 }
 
+template <class statevec_t>
+void State<statevec_t>::apply_matrix_sequence(const std::vector<reg_t> &regs, const std::vector<cmatrix_t>& mats) {
+
+  if (regs.empty())
+    return;
+
+  std::vector<cvector_t> vmats;
+  for (const cmatrix_t& mat: mats)
+    vmats.push_back(Utils::vectorize_matrix(mat));
+
+  BaseState::qreg_.apply_matrix_sequence(regs, vmats);
+}
+
+
 template <class statevec_t>
 void State<statevec_t>::apply_gate_mcu3(const reg_t& qubits,
                                         double theta,
diff --git a/test/terra/backends/qasm_simulator/qasm_fusion.py b/test/terra/backends/qasm_simulator/qasm_fusion.py
new file mode 100644
index 0000000000..a6651854a5
--- /dev/null
+++ b/test/terra/backends/qasm_simulator/qasm_fusion.py
@@ -0,0 +1,229 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018, IBM.
+#
+# This source code is licensed under the Apache License, Version 2.0 found in
+# the LICENSE.txt file in the root directory of this source tree.
+
+"""
+QasmSimulator Integration Tests
+"""
+
+from test.benchmark.tools import quantum_volume_circuit
+from qiskit import compile, QuantumRegister, ClassicalRegister, QuantumCircuit
+from qiskit.providers.aer import QasmSimulator
+from qiskit.providers.aer.noise import NoiseModel
+from qiskit.providers.aer.noise.errors import ReadoutError, depolarizing_error
+
+from test.terra.reference import ref_1q_clifford
+from test.terra.reference import ref_2q_clifford
+
+
+class QasmFusionTests:
+    """QasmSimulator fusion tests."""
+
+    SIMULATOR = QasmSimulator()
+    BACKEND_OPTS = {'fusion_verbose': True}
+    
+    def create_statevector_circuit(self):
+        qr = QuantumRegister(10)
+        cr = ClassicalRegister(10)
+        circuit = QuantumCircuit(qr, cr)
+        circuit.u3(0.1,0.1,0.1,qr[0])
+        circuit.barrier(qr)
+        circuit.x(qr[0])
+        circuit.barrier(qr)
+        circuit.x(qr[1])
+        circuit.barrier(qr)
+        circuit.x(qr[0])
+        circuit.barrier(qr)
+        circuit.u3(0.1,0.1,0.1,qr[0])
+        circuit.barrier(qr)
+        circuit.measure(qr, cr)
+        return circuit
+    
+    def noise_model(self):
+        readout_error = [0.01, 0.1]
+        depolarizing = {'u3': (1, 0.001), 'cx': (2, 0.02)}
+        noise = NoiseModel()
+        readout = [[1.0 - readout_error[0], readout_error[0]], [readout_error[1], 1.0 - readout_error[1]]]
+        noise.add_all_qubit_readout_error(ReadoutError(readout))
+        for gate, (num_qubits, gate_error) in depolarizing.items():
+            noise.add_all_qubit_quantum_error(depolarizing_error(gate_error, num_qubits), gate)
+            return noise    
+    
+    def check_mat_exist(self, result):
+        fusion_gates = result.as_dict()['results'][0]['metadata']['fusion_verbose']
+        for gate in fusion_gates:
+            print (gate)
+    
+    def test_clifford_no_fusion(self):
+        """Test Fusion with clifford simulator"""
+        shots = 100
+        circuits = ref_2q_clifford.cx_gate_circuits_deterministic(final_measure=True)
+        qobj = compile(circuits, self.SIMULATOR, shots=shots)
+        result = self.SIMULATOR.run(qobj, backend_options={'fusion_verbose': True}).result()
+        self.is_completed(result)
+        
+        self.assertTrue('results' in result.as_dict(), 
+                        msg="results must exist in result")
+        self.assertTrue('metadata' in result.as_dict()['results'][0], 
+                        msg="metadata must exist in results[0]")
+        self.assertTrue('fusion_verbose' not in result.as_dict()['results'][0]['metadata'], 
+                        msg="fusion must not work for clifford")
+
+    def test_noise_fusion(self):
+        """Test Fusion with noise model option"""
+        circuit = self.create_statevector_circuit()
+        
+        shots = 100
+        noise_model = self.noise_model()
+        qobj = compile([circuit], self.SIMULATOR, shots=shots, seed=1, basis_gates=noise_model.basis_gates)
+
+        result = self.SIMULATOR.run(qobj, noise_model=noise_model, backend_options={'fusion_enable': True, 'fusion_verbose': True, 'fusion_threshold': 1}).result()
+        self.is_completed(result)
+        
+        self.assertTrue('results' in result.as_dict(), 
+                        msg="results must exist in result")
+        self.assertTrue('metadata' in result.as_dict()['results'][0], 
+                        msg="metadata must exist in results[0]")
+        self.assertTrue('fusion_verbose' in result.as_dict()['results'][0]['metadata'], 
+                        msg="verbose must work with noise")
+        
+    def test_fusion_verbose(self):
+        """Test Fusion with verbose option"""
+        circuit = self.create_statevector_circuit()
+        
+        shots = 100
+        qobj = compile([circuit], self.SIMULATOR, shots=shots, seed=1)
+        
+        result_verbose = self.SIMULATOR.run(qobj, backend_options={'fusion_enable': True, 'fusion_verbose': True, 'fusion_threshold': 1}).result()
+        self.is_completed(result_verbose)
+        self.assertTrue('results' in result_verbose.as_dict(), 
+                        msg="results must exist in result")
+        self.assertTrue('metadata' in result_verbose.as_dict()['results'][0], 
+                        msg="metadata must exist in results[0]")
+        self.assertTrue('fusion_verbose' in result_verbose.as_dict()['results'][0]['metadata'], 
+                        msg="fusion must work for satevector")
+
+        result_nonverbose = self.SIMULATOR.run(qobj, backend_options={'fusion_enable': True, 'fusion_verbose': False, 'fusion_threshold': 1}).result()
+        self.is_completed(result_nonverbose)
+        self.assertTrue('results' in result_nonverbose.as_dict(), 
+                        msg="results must exist in result")
+        self.assertTrue('metadata' in result_nonverbose.as_dict()['results'][0], 
+                        msg="metadata must exist in results[0]")
+        self.assertTrue('fusion_verbose' not in result_nonverbose.as_dict()['results'][0]['metadata'], 
+                        msg="verbose must not work if fusion_verbose is False")
+
+        result_default = self.SIMULATOR.run(qobj, backend_options={'fusion_enable': True}).result()
+        self.is_completed(result_default)
+        self.assertTrue('results' in result_default.as_dict(), 
+                        msg="results must exist in result")
+        self.assertTrue('metadata' in result_default.as_dict()['results'][0], 
+                        msg="metadata must exist in results[0]")
+        self.assertTrue('fusion_verbose' not in result_default.as_dict()['results'][0]['metadata'], 
+                        msg="verbose must not work if fusion_verbose is False")
+        
+    def test_control_fusion(self):
+        """Test Fusion enable/disable option"""
+        shots = 100
+        circuit = self.create_statevector_circuit()
+        qobj = compile([circuit], self.SIMULATOR, shots=shots, seed=0)
+        
+        result_verbose = self.SIMULATOR.run(qobj, backend_options={'fusion_enable': True, 'fusion_verbose': True, 'fusion_threshold': 1}).result()
+        self.is_completed(result_verbose)
+        self.assertTrue('results' in result_verbose.as_dict(), 
+                        msg="results must exist in result")
+        self.assertTrue('metadata' in result_verbose.as_dict()['results'][0], 
+                        msg="metadata must exist in results[0]")
+        self.assertTrue('fusion_verbose' in result_verbose.as_dict()['results'][0]['metadata'], 
+                        msg="fusion must work for satevector")
+
+        result_disabled = self.SIMULATOR.run(qobj, backend_options={'fusion_enable': False, 'fusion_verbose': True, 'fusion_threshold': 1}).result()
+        self.is_completed(result_disabled)
+        self.assertTrue('results' in result_disabled.as_dict(), 
+                        msg="results must exist in result")
+        self.assertTrue('metadata' in result_disabled.as_dict()['results'][0], 
+                        msg="metadata must exist in results[0]")
+        self.assertTrue('fusion_verbose' not in result_disabled.as_dict()['results'][0]['metadata'], 
+                        msg="fusion must not work with fusion_enable is False")
+
+        result_default = self.SIMULATOR.run(qobj, backend_options={'fusion_verbose': True}).result()
+        self.is_completed(result_default)
+        self.assertTrue('results' in result_default.as_dict(), 
+                        msg="results must exist in result")
+        self.assertTrue('metadata' in result_default.as_dict()['results'][0], 
+                        msg="metadata must exist in results[0]")
+        self.assertTrue('fusion_verbose' not in result_default.as_dict()['results'][0]['metadata'], 
+                        msg="fusion must not work by default for satevector")
+
+
+    def test_fusion_operations(self):
+        """Test Fusion enable/disable option"""
+        shots = 100
+        
+        qr = QuantumRegister(10)
+        cr = ClassicalRegister(10)
+        circuit = QuantumCircuit(qr, cr)
+        
+        for i in range(10):
+            circuit.h(qr[i])
+            circuit.barrier(qr)
+            
+        circuit.x(qr[0])
+        circuit.barrier(qr)
+        circuit.x(qr[1])
+        circuit.barrier(qr)
+        circuit.x(qr[0])
+        circuit.barrier(qr)
+        circuit.x(qr[1])
+        circuit.barrier(qr)
+        circuit.cx(qr[2], qr[3])
+        circuit.barrier(qr)
+        circuit.u3(0.1,0.1,0.1,qr[3])
+        circuit.barrier(qr)
+        circuit.u3(0.1,0.1,0.1,qr[3])
+        circuit.barrier(qr)
+        
+        circuit.x(qr[0])
+        circuit.barrier(qr)
+        circuit.x(qr[1])
+        circuit.barrier(qr)
+        circuit.x(qr[0])
+        circuit.barrier(qr)
+        circuit.x(qr[1])
+        circuit.barrier(qr)
+        circuit.cx(qr[2], qr[3])
+        circuit.barrier(qr)
+        circuit.u3(0.1,0.1,0.1,qr[3])
+        circuit.barrier(qr)
+        circuit.u3(0.1,0.1,0.1,qr[3])
+        circuit.barrier(qr)
+        
+        circuit.x(qr[0])
+        circuit.barrier(qr)
+        circuit.x(qr[1])
+        circuit.barrier(qr)
+        circuit.x(qr[0])
+        circuit.barrier(qr)
+        circuit.x(qr[1])
+        circuit.barrier(qr)
+        circuit.cx(qr[2], qr[3])
+        circuit.barrier(qr)
+        circuit.u3(0.1,0.1,0.1,qr[3])
+        circuit.barrier(qr)
+        circuit.u3(0.1,0.1,0.1,qr[3])
+        circuit.barrier(qr)
+        
+        circuit.measure(qr, cr)
+
+        qobj = compile([circuit], self.SIMULATOR, shots=shots, seed=1)
+ 
+        result_fusion = self.SIMULATOR.run(qobj, backend_options={'fusion_enable': True, 'fusion_verbose': True, 'fusion_threshold': 1}).result()
+        self.is_completed(result_fusion)
+        
+        result_nonfusion = self.SIMULATOR.run(qobj, backend_options={'fusion_enable': False, 'fusion_verbose': True, 'fusion_threshold': 1}).result()
+        self.is_completed(result_nonfusion)
+
+        self.assertDictAlmostEqual(result_fusion.get_counts(circuit), result_nonfusion.get_counts(circuit), delta=0.0, msg="fusion x-x-x was failed")
+      
\ No newline at end of file
diff --git a/test/terra/backends/test_qasm_simulator.py b/test/terra/backends/test_qasm_simulator.py
index 7ead370860..b9ade9e51e 100644
--- a/test/terra/backends/test_qasm_simulator.py
+++ b/test/terra/backends/test_qasm_simulator.py
@@ -27,6 +27,7 @@
 from test.terra.backends.qasm_simulator.qasm_algorithms import QasmAlgorithmTestsMinimalBasis
 from test.terra.backends.qasm_simulator.qasm_extra import QasmExtraTests
 from test.terra.backends.qasm_simulator.qasm_thread_management import QasmThreadManagementTests
+from test.terra.backends.qasm_simulator.qasm_fusion import QasmFusionTests
 
 
 class TestQasmSimulator(common.QiskitAerTestCase,
@@ -45,7 +46,8 @@ class TestQasmSimulator(common.QiskitAerTestCase,
                         QasmAlgorithmTestsWaltzBasis,
                         QasmAlgorithmTestsMinimalBasis,
                         QasmExtraTests,
-                        QasmThreadManagementTests):
+                        QasmThreadManagementTests,
+                        QasmFusionTests):
     """QasmSimulator automatic method tests."""