From 76f5896fded50285345d0066e9f623c24c77bd7c Mon Sep 17 00:00:00 2001
From: hedaoyuan
Date: Fri, 4 Nov 2016 17:42:34 +0800
Subject: [PATCH 1/4] fix floating-point overflow problem of tanh

---
 paddle/cuda/include/hl_base.h                |  9 ++
 paddle/cuda/src/hl_avx_functions.cc          |  2 +
 paddle/cuda/src/hl_cpu_functions.cc          |  4 +-
 paddle/gserver/tests/test_LayerGrad.cpp      |  2 +-
 paddle/gserver/tests/test_RecurrentLayer.cpp |  2 +-
 paddle/math/BaseMatrix.cu                    |  5 +-
 paddle/math/MathFunctions.cpp                |  5 +-
 paddle/math/tests/CMakeLists.txt             |  1 +
 paddle/math/tests/test_FPException.cpp       | 92 ++++++++++++++++++++
 9 files changed, 117 insertions(+), 5 deletions(-)
 create mode 100644 paddle/math/tests/test_FPException.cpp

diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 1fe2774cc5a29..02fa6bc3ace32 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -209,6 +209,15 @@ typedef struct {
 #define HL_FLOAT_MIN 2.2250738585072014e-308
 #endif
+
+/**
+ * The maximum input value for exp, used to avoid the overflow problem.
+ *
+ * Currently only used by the tanh function.
+ */
+#define EXP_MAX_INPUT 40.0
+
+
 /**
  * @brief DIVUP(x, y) is similar to ceil(x / y).
  * @note For CUDA, DIVUP will be used to specify
diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/cuda/src/hl_avx_functions.cc
index 2d471206f61f2..08976180fff5b 100644
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
@@ -38,7 +38,9 @@ namespace hppl {
   }
 
   __m256 tanh(const __m256 a) {
+    __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
     __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+    tmp = _mm256_min_ps(tmp, max);
     tmp = exp(tmp);
     return _mm256_sub_ps(
         _mm256_div_ps(_mm256_set1_ps(2.0f),
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index 3fd6b278d0537..5f45f4c9d53ed 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -30,7 +30,9 @@ namespace hppl {
   }
 
   real tanh(const real a) {
-    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
+    real tmp = -2 * a;
+    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+    return (2.0 / (1.0 + exp(tmp))) - 1.0;
   }
 
   real linear(const real a) {
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index bf2c2e0499941..46f36db1a9177 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -996,7 +996,7 @@ TEST(Layer, LstmLayer) {
   TestConfig config;
   config.layerConfig.set_type("lstmemory");
   config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_type("tanh");
   config.layerConfig.set_active_state_type("sigmoid");
   config.layerConfig.set_active_gate_type("sigmoid");
   config.biasSize = 28;
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 9b933b153d158..1c8497e8c526f 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -369,7 +369,7 @@ TEST(Layer, LstmLayer) {
   LayerConfig layerConfig;
   layerConfig.set_type("lstmemory");
   layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("sigmoid");
+  layerConfig.set_active_state_type("tanh");
   layerConfig.set_active_gate_type("sigmoid");
   layerConfig.add_inputs();
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 8b888b1ee5e46..bc65edf39ce23 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -625,7 +625,10 @@ void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
   applyBinary(binary::SquareDerivative<T>(), b);
 }
 
-DEFINE_MATRIX_BINARY_OP(Tanh, b = 2.0 / (1.0 + exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(Tanh,
+                        T tmp = -2 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<>
 void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
   applyBinary(binary::Tanh<real>(), b);
 }
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index da493379e3a37..6fa760de1fa6a 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -160,7 +160,10 @@ void vLog1p(const int n, const T* a, T* r) {
       binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 
-DEFINE_MATRIX_BINARY_OP(vTanh, b = 2.0 / (1.0 + std::exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(vTanh,
+                        T tmp = -2 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template <class T>
 void vTanh(const int n, const T* a, T* r) {
   hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index eb72f11e1c653..247be983ba329 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -13,3 +13,4 @@ add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
+add_simple_unittest(test_FPException)
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
new file mode 100644
index 0000000000000..322e69d7ee090
--- /dev/null
+++ b/paddle/math/tests/test_FPException.cpp
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+/**
+ * This test is about floating point calculation exceptions.
+ * Paddle catches FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions.
+ *
+ * Some exceptions occur in the middle of a set of formulas
+ * and can be circumvented by some tricks.
+ * For example, when calculating tanh
+ *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+ *
+ * if the result of (-2 * a) is too large,
+ * an FE_OVERFLOW exception occurs when calculating exp.
+ * But the result of tanh itself cannot overflow,
+ * so we can add a trick to prevent exp from computing an excessive value.
+ */
+#include <fenv.h>
+#include <gtest/gtest.h>
+#include "paddle/math/Matrix.h"
+
+using namespace paddle;  // NOLINT
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+    }
+  }
+}
+
+template <typename Matrix>
+void testTanh(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->tanh(*B);
+}
+
+template <typename Matrix>
+void testSigmoid(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->sigmoid(*B);
+}
+
+TEST(fp, overflow) {
+  for (auto illegal : {-90.0, 90.0}) {
+    LOG(INFO) << " illegal=" << illegal;
+    testTanh<CpuMatrix>(illegal);
+    testSigmoid<CpuMatrix>(illegal);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+  return RUN_ALL_TESTS();
+}

From 2a936928358c4b81dd4419e93b4ece68f01b6f5a Mon Sep 17 00:00:00 2001
From: hedaoyuan
Date: Fri, 4 Nov 2016 18:22:06 +0800
Subject: [PATCH 2/4] fix compile bug in osx

---
 paddle/cuda/src/hl_cpu_functions.cc    | 2 +-
 paddle/math/BaseMatrix.cu              | 2 +-
 paddle/math/MathFunctions.cpp          | 2 +-
 paddle/math/tests/test_FPException.cpp | 1 +
 4 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc
index 5f45f4c9d53ed..b8352c2d537fb 100644
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@@ -30,7 +30,7 @@ namespace hppl {
   }
 
   real tanh(const real a) {
-    real tmp = -2 * a;
+    real tmp = -2.0 * a;
     tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
     return (2.0 / (1.0 + exp(tmp))) - 1.0;
   }
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index bc65edf39ce23..d81b99e544158 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -626,7 +626,7 @@ void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
 }
 
 DEFINE_MATRIX_BINARY_OP(Tanh,
-                        T tmp = -2 * a;
+                        T tmp = -2.0 * a;
                         tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
                         b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<>
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index 6fa760de1fa6a..673e6221cb917 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -161,7 +161,7 @@ void vLog1p(const int n, const T* a, T* r) {
 }
 
 DEFINE_MATRIX_BINARY_OP(vTanh,
-                        T tmp = -2 * a;
+                        T tmp = -2.0 * a;
                         tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
                         b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template <class T>
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
index 322e69d7ee090..b86b392ef5481 100644
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
@@ -32,6 +32,7 @@ limitations under the License. */
 #include <fenv.h>
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Excepts.h"

From 8c2ad8da22c2a778bd5c5516d2ebb3b9c2501db6 Mon Sep 17 00:00:00 2001
From: hedaoyuan
Date: Fri, 4 Nov 2016 19:29:35 +0800
Subject: [PATCH 3/4] remove errno, paddle trapping floating point exceptions
 with feenableexcept

---
 paddle/math/Matrix.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index aaeae98f0d28b..40bbad22df852 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3406,9 +3406,7 @@ void CpuMatrix::tanh(Matrix& output) {
   size_t dim = getWidth();
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(output.getWidth(), dim);
-  errno = 0;
   vTanh(numSamples * dim, getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vTanh error";
 }
 
 void CpuMatrix::tanhDerivative(Matrix& output) {
@@ -3430,10 +3428,8 @@ void CpuMatrix::softrelu(Matrix& output) {
       out[j] = x;
     }
   }
-  errno = 0;
   vExp(numSamples * dim, output.getData(), output.getData());
   vLog1p(numSamples * dim, output.getData(), output.getData());
-  CHECK_EQ(errno, 0) << "vExp+vLog1p error";
 }
 
 void CpuMatrix::softreluDerivative(Matrix& output) {
@@ -3448,9 +3444,7 @@ void CpuMatrix::softreluDerivative(Matrix& output) {
   MatrixPtr tmpMat = Matrix::create(numSamples, dim);
   real* tmp = tmpMat->getData();
 
-  errno = 0;
   vExp(size, output.getData(), tmpMat->getData());
-  CHECK_EQ(errno, 0) << "vExp error";
 
   for (size_t i = 0; i < size; ++i) {
     grad[i] *= (1.0 - 1.0 / tmp[i]);
@@ -3473,10 +3467,7 @@ void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
     out[i] = p2 * in[i];
   }
 
-  // out = tanh(out)
-  errno = 0;
   vTanh(numSamples * dim, out, out);
-  CHECK_EQ(errno, 0) << "vTanh error";
 
   // out = p1 * out
   for (size_t i = 0; i < numSamples * dim; ++i) {

From 250238598a73c58fd79fd501b08caf2aae0614ea Mon Sep 17 00:00:00 2001
From: hedaoyuan
Date: Tue, 8 Nov 2016 13:13:36 +0800
Subject: [PATCH 4/4] small fix

---
 paddle/math/tests/test_FPException.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
index b86b392ef5481..174278c2aaac4 100644
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
@@ -48,6 +48,7 @@ void SetTensorValue(Matrix& matrix, real value) {
     } else if (typeid(matrix) == typeid(GpuMatrix)) {
      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
     } else {
+      LOG(FATAL) << "should not reach here";
     }
   }
 }
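
Note for reviewers: the failure mode and the clamp can be exercised outside Paddle with a short standalone program. The sketch below is illustrative only; it is not part of the patch series, and the file and function names (tanh_overflow_demo.cpp, naiveTanh, safeTanh) are made up. It assumes single-precision arithmetic, which is Paddle's default `real` and the case where exp(180) overflows; in double precision the same blow-up only appears for inputs beyond roughly +-355.

// tanh_overflow_demo.cpp -- standalone sketch, not part of the patch series.
// Reproduces the FE_OVERFLOW this series fixes and shows the EXP_MAX_INPUT
// clamp. Build with: g++ -std=c++11 -O0 tanh_overflow_demo.cpp && ./a.out
#include <cfenv>
#include <cmath>
#include <cstdio>

static const float kExpMaxInput = 40.0f;  // mirrors EXP_MAX_INPUT in hl_base.h

// Pre-patch formulation: for a = -90 the argument of exp is 180, and
// exp(180.0f) overflows a float, raising FE_OVERFLOW -- even though the
// final tanh value is a perfectly representable -1.
float naiveTanh(float a) {
  return 2.0f / (1.0f + std::exp(-2.0f * a)) - 1.0f;
}

// Post-patch formulation: clamp the exponent first. exp(40) is about 2.35e17,
// well within float range, and 2 / (1 + exp(40)) - 1 is already -1 to float
// precision, so the clamp does not change any representable result.
float safeTanh(float a) {
  float tmp = -2.0f * a;
  tmp = (tmp > kExpMaxInput) ? kExpMaxInput : tmp;
  return 2.0f / (1.0f + std::exp(tmp)) - 1.0f;
}

int main() {
  volatile float x = -90.0f;  // volatile keeps the compiler from folding exp()

  std::feclearexcept(FE_ALL_EXCEPT);
  float naive = naiveTanh(x);
  int naiveOverflow = std::fetestexcept(FE_OVERFLOW);

  std::feclearexcept(FE_ALL_EXCEPT);
  float safe = safeTanh(x);
  int safeOverflow = std::fetestexcept(FE_OVERFLOW);

  // Both calls return -1, but only the naive one raises FE_OVERFLOW. Under
  // feenableexcept(FE_OVERFLOW), as installed in test_FPException.cpp, the
  // naive version would instead die with SIGFPE.
  std::printf("naive = %f (FE_OVERFLOW: %s)\n", naive,
              naiveOverflow ? "raised" : "clear");
  std::printf("safe  = %f (FE_OVERFLOW: %s)\n", safe,
              safeOverflow ? "raised" : "clear");
  return 0;
}

Only the upper clamp is needed: for a large positive input, -2 * a is a large negative exponent and exp merely underflows toward zero, and FE_UNDERFLOW is not among the exceptions the test traps.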