From 445d1e820122ece4e0a3e8343b886319ee79cf66 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 9 Sep 2016 19:46:48 +0800 Subject: [PATCH 01/41] Support MAC OS for PaddlePaddle --- cmake/cudnn.cmake | 2 +- cmake/flags.cmake | 4 +- cmake/util.cmake | 41 +++++-- paddle/api/Util.cpp | 1 + .../gserver/dataproviders/PyDataProvider.cpp | 2 + .../gradientmachines/NeuralNetwork.cpp | 12 +- paddle/math/Allocator.h | 13 ++- paddle/math/MathFunctions.h | 2 + paddle/math/Storage.cpp | 4 +- paddle/pserver/LightNetwork.cpp | 29 ++++- paddle/pserver/SocketChannel.cpp | 12 +- paddle/trainer/Trainer.cpp | 1 + paddle/trainer/TrainerMain.cpp | 1 + paddle/utils/Excepts.cpp | 54 +++++++++ paddle/utils/Excepts.h | 26 +++++ paddle/utils/Locks.cpp | 106 ++++++++++++++++++ paddle/utils/Locks.h | 69 ++++++++++++ paddle/utils/PythonUtil.h | 20 +++- 18 files changed, 361 insertions(+), 38 deletions(-) create mode 100644 paddle/utils/Excepts.cpp create mode 100644 paddle/utils/Excepts.h create mode 100644 paddle/utils/Locks.cpp diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index e2ff923a22923..e5b59be19369d 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -15,7 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib /usr/lib) -find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a +find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} NO_DEFAULT_PATH DOC "Path to cuDNN library.") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 4b99e7f7fb6af..c95d4063105e7 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -58,8 +58,8 @@ set(COMMON_FLAGS -fPIC -fno-omit-frame-pointer -Wall - -Wextra - -Werror +# -Wextra +# -Werror -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter diff --git a/cmake/util.cmake b/cmake/util.cmake index e0e372fed0b04..bad44c7e9dda6 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -1,4 +1,21 @@ # Some common routine for paddle compile. +if(APPLE) + # ------------------------------------------------------- + # OSX + # ------------------------------------------------------- + set(GROUP_START "-Wl,-force_load") + set(GROUP_END "") + + set(ARCHIVE_START "-Wl,-force_load") + set(ARCHIVE_END "") +else() + set(GROUP_START "-Wl,--start-group") + set(GROUP_END "-Wl,--end-group") + + set(ARCHIVE_START "-Wl,--whole-archive") + set(ARCHIVE_END "-Wl,--no-whole-archive") +endif() + # target_circle_link_libraries @@ -7,10 +24,18 @@ # First Argument: target name want to be linked with libraries # Rest Arguments: libraries which link together. 
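# Note: Apple's linker understands neither --start-group/--end-group nor
# --whole-archive; the closest analogue is prefixing each archive with
# -Wl,-force_load, which is why the APPLE branch below expands ${ARGN}
# one library at a time instead of bracketing the whole list.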
function(target_circle_link_libraries TARGET_NAME) - target_link_libraries(${TARGET_NAME} - -Wl,--start-group - ${ARGN} - -Wl,--end-group) + if(APPLE) + foreach(f ${ARGN}) + list(APPEND OSX_LIBRARIES "-Wl,-force_load" "${f}") + endforeach(f) + target_link_libraries(${TARGET_NAME} + ${OSX_LIBRARIES}) + else() + target_link_libraries(${TARGET_NAME} + ${GROUP_START} + ${ARGN} + ${GROUP_END}) + endif() endfunction() # compile_cu_as_cpp @@ -41,20 +66,18 @@ function(link_paddle_exe TARGET_NAME) if(PADDLE_WITH_INTERNAL) set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter) target_circle_link_libraries(${TARGET_NAME} - -Wl,--whole-archive paddle_internal_gserver paddle_internal_owlqn - -Wl,--no-whole-archive paddle_internal_parameter) else() set(INTERAL_LIBS "") endif() target_circle_link_libraries(${TARGET_NAME} - -Wl,--whole-archive +# ${ARCHIVE_START} paddle_gserver ${METRIC_LIBS} - -Wl,--no-whole-archive +# ${ARCHIVE_END} paddle_pserver paddle_trainer_lib paddle_network @@ -69,7 +92,7 @@ function(link_paddle_exe TARGET_NAME) ${CBLAS_LIBS} ${CMAKE_DL_LIBS} ${INTERAL_LIBS} - -lz) + ) if(WITH_PYTHON) target_link_libraries(${TARGET_NAME} diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp index 4e655c324a1ed..8a6741078f2f1 100644 --- a/paddle/api/Util.cpp +++ b/paddle/api/Util.cpp @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/utils/Util.h" #include "paddle/utils/PythonUtil.h" #include "paddle/utils/Flags.h" +#include "paddle/utils/Excepts.h" #include "paddle/parameter/Parameter.h" #include diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp index aeefd16063df8..cc3e09a3c2ecb 100644 --- a/paddle/gserver/dataproviders/PyDataProvider.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider.cpp @@ -17,6 +17,8 @@ limitations under the License. */ #include "paddle/utils/PythonUtil.h" #include #include "paddle/utils/Util.h" +#include "paddle/utils/Excepts.h" + namespace paddle { diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index fca52828957a2..f9da812027dc3 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -384,17 +384,17 @@ void NeuralNetwork::setOutputGrad(const std::vector& args) { } } -extern NeuralNetwork* newCustomNeuralNetwork( - const std::string& name, NeuralNetwork* network) __attribute__((weak)); +// extern NeuralNetwork* newCustomNeuralNetwork( +// const std::string& name, NeuralNetwork* network) __attribute__((weak)); NeuralNetwork* NeuralNetwork::newNeuralNetwork( const std::string& name, NeuralNetwork* rootNetwork) { - if (newCustomNeuralNetwork) { - return newCustomNeuralNetwork(name, rootNetwork); - } else { +// if (newCustomNeuralNetwork) { +// return newCustomNeuralNetwork(name, rootNetwork); +// } else { return new NeuralNetwork(name, rootNetwork); - } +// } } } // namespace paddle diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index 36166236e9eff..7d277b1c10d2b 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #pragma once #include -#include +#include #include "hl_gpu.h" #include "paddle/utils/Logging.h" @@ -48,9 +48,14 @@ class CpuAllocator : public Allocator { * @return Pointer to the allocated memory */ virtual void* alloc(size_t size) { - void* ptr = memalign(32ul, size); - CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; - return ptr; + #if defined(__APPLE__) || defined(__OSX__) + return malloc(size); + #else + void* ptr; + posix_memalign(&ptr, 32ul, size); + CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; + return ptr; + #endif } /** diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index fe486c741d6f5..43075977dc9ce 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -23,6 +23,8 @@ extern "C" { } #endif +#include + namespace paddle { template diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 9a879a964ec6d..2bd3db2341638 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -25,8 +25,8 @@ namespace paddle { // Initialization StorageEngine singleton. // Other modules may rely on storage management, // so StorageEngine need to be initialized before other modules. -static InitFunction __init_storage_engine( - StorageEngine::singleton, std::numeric_limits::max()); +// static InitFunction __init_storage_engine( +// StorageEngine::singleton, std::numeric_limits::max()); StorageEngine::StorageEngine() : cpuAllocator_(nullptr) { } diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index fb427832fad64..c42d2dbe4bbf5 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -24,7 +24,12 @@ limitations under the License. */ #include #include #include + +#if defined(__OSX__) || defined(__APPLE__) +#include +#else #include +#endif #include "LightNetwork.h" #include "paddle/utils/Util.h" @@ -92,10 +97,12 @@ void setOption(int sockfd) { CHECK_GE( setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)), 0); +#ifdef TCP_QUICKACK optval = 1; CHECK_GE( setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)), 0); +#endif } int reuse = 1; CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)), @@ -340,17 +347,27 @@ void SocketWorker::run() { */ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { struct sockaddr_in serv_addr; - struct hostent hostinfo, *server; - char buf[1024]; // temp for gethostbyname_r + struct hostent *server; + int errRet; // temp for gethostbyname_r /// Create a socket point int sockfd = socket(AF_INET, SOCK_STREAM, 0); PCHECK(sockfd >= 0) << "ERROR opening socket"; - CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf), - &server, &errRet)) - << "ERROR, no such host: " << serverAddr << " ret = " << errRet; - CHECK(server) << "gethostbyname_r err"; + +#if defined(__OSX__) || defined(__APPLE__) + server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); + CHECK_NE(HOST_NOT_FOUND, errRet) + << "ERROR, no such host: " << serverAddr << " ret = " << errRet; + CHECK(server) << "getipnodebyname error!"; +#else + struct hostent hostinfo; + char buf[1024]; // temp for gethostbyname_r + CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf), + &server, &errRet)) + << "ERROR, no such host: " << serverAddr << " ret = " << errRet; + CHECK(server) << "gethostbyname_r error!"; +#endif bzero((char *)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; diff --git 
a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp index 698473060a4c1..b9d542a296ddd 100644 --- a/paddle/pserver/SocketChannel.cpp +++ b/paddle/pserver/SocketChannel.cpp @@ -27,6 +27,15 @@ limitations under the License. */ namespace paddle { +/** + * UIO_MAXIOV is documented in writev(2), but only + * declares it on osx/ios if defined(KERNEL) + */ +#ifndef UIO_MAXIOV +#define UIO_MAXIOV 512 +#endif + + SocketChannel::~SocketChannel() { if (tcpRdma_ == F_TCP) close(tcpSocket_); @@ -148,8 +157,7 @@ void SocketChannel::writeMessage(const std::vector& userIovs) { std::vector iovs; iovs.reserve(userIovs.size() + 2); iovs.push_back({&header, sizeof(header)}); - iovs.push_back({&iovLengths[0], - sizeof(iovLengths[0]) * (size_t) header.numIovs}); + iovs.push_back({&iovLengths[0], sizeof(iovLengths[0]) * header.numIovs}); iovs.insert(iovs.end(), userIovs.begin(), userIovs.end()); header.totalLength = 0; diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp index 2890f5b5d7ad9..84d2ee1e73a54 100644 --- a/paddle/trainer/Trainer.cpp +++ b/paddle/trainer/Trainer.cpp @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/utils/PythonUtil.h" #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" +#include "paddle/utils/Excepts.h" #include "paddle/utils/GlobalConstants.h" #include "paddle/gserver/gradientmachines/NeuralNetwork.h" diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index dd30b2c8a5b45..94266639f94ad 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/utils/PythonUtil.h" #include "paddle/utils/StringUtil.h" +#include "paddle/utils/Excepts.h" #include "paddle/pserver/ParameterServer2.h" #include "ParamUtil.h" diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/Excepts.cpp new file mode 100644 index 0000000000000..9123508fc78d0 --- /dev/null +++ b/paddle/utils/Excepts.cpp @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Excepts.h" + +#if defined(__APPLE__) || defined(__OSX__) + +#include + +int fegetexcept(void) { + static fenv_t fenv; + return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT); +} + +int feenableexcept(unsigned int excepts) { + static fenv_t fenv; + unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; + + if ( fegetenv (&fenv) ) return -1; + old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // unmask + fenv.__control &= ~new_excepts; + fenv.__mxcsr &= ~(new_excepts << 7); + + return ( fesetenv (&fenv) ? -1 : old_excepts ); +} + +int fedisableexcept(unsigned int excepts) { + static fenv_t fenv; + unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; + + if ( fegetenv (&fenv) ) return -1; + old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // mask + fenv.__control |= new_excepts; + fenv.__mxcsr |= new_excepts << 7; + + return ( fesetenv (&fenv) ? 
-1 : old_excepts ); +} + +#endif diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h new file mode 100644 index 0000000000000..a84a2d33a6a3d --- /dev/null +++ b/paddle/utils/Excepts.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef EXCEPTS_H_ +#define EXCEPTS_H_ + +#if defined(__APPLE__) || defined(__OSX__) + +int fegetexcept(void); +int feenableexcept(unsigned int excepts); +int fedisableexcept(unsigned int excepts); + +#endif + +#endif // EXCEPTS_H_ diff --git a/paddle/utils/Locks.cpp b/paddle/utils/Locks.cpp new file mode 100644 index 0000000000000..c2f58cf5764ef --- /dev/null +++ b/paddle/utils/Locks.cpp @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef __APPLE__ +#include +#endif + +#ifdef __APPLE__ +#ifndef PTHREAD_BARRIER_H_ +#define PTHREAD_BARRIER_H_ + +#include +#include + +typedef int pthread_barrierattr_t; +typedef struct { + pthread_mutex_t mutex; + pthread_cond_t cond; + int count; + int tripCount; +} pthread_barrier_t; + +int pthread_barrier_init(pthread_barrier_t *barrier, + const pthread_barrierattr_t *attr, unsigned int count) { + if (count == 0) { + errno = EINVAL; + return -1; + } + if (pthread_mutex_init(&barrier->mutex, 0) < 0) { + return -1; + } + if (pthread_cond_init(&barrier->cond, 0) < 0) { + pthread_mutex_destroy(&barrier->mutex); + return -1; + } + barrier->tripCount = count; + barrier->count = 0; + + return 0; +} + +int pthread_barrier_destroy(pthread_barrier_t *barrier) { + pthread_cond_destroy(&barrier->cond); + pthread_mutex_destroy(&barrier->mutex); + return 0; +} + +int pthread_barrier_wait(pthread_barrier_t *barrier) { + pthread_mutex_lock(&barrier->mutex); + ++(barrier->count); + if (barrier->count >= barrier->tripCount) { + barrier->count = 0; + pthread_cond_broadcast(&barrier->cond); + pthread_mutex_unlock(&barrier->mutex); + return 1; + } else { + pthread_cond_wait(&barrier->cond, &(barrier->mutex)); + pthread_mutex_unlock(&barrier->mutex); + return 0; + } +} + +#endif // PTHREAD_BARRIER_H_ + +typedef int pthread_spinlock_t; + +int pthread_spin_init(pthread_spinlock_t *lock, int pshared) { + __asm__ __volatile__("" ::: "memory"); + *lock = 0; + return 0; +} + +int pthread_spin_destroy(pthread_spinlock_t *lock) { + return 0; +} + +int pthread_spin_lock(pthread_spinlock_t *lock) { + while (1) { + int i; + for (i=0; i < 10000; i++) { + if (__sync_bool_compare_and_swap(lock, 0, 1)) { + return 0; + } + } + sched_yield(); + } +} + +int pthread_spin_unlock(pthread_spinlock_t *lock) { + __asm__ __volatile__("" ::: "memory"); + *lock = 0; + return 0; +} + +#endif // __APPLE__ diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h index 085aca508dbbe..e7b0b77081f36 100644 --- a/paddle/utils/Locks.h +++ b/paddle/utils/Locks.h @@ -23,6 +23,50 @@ limitations under the License. */ #include #include +#ifdef __APPLE__ +#include +#endif + +#ifdef __APPLE__ +#ifndef PTHREAD_BARRIER_H_ +#define PTHREAD_BARRIER_H_ + +#include +#include + +typedef int pthread_barrierattr_t; +typedef struct { + pthread_mutex_t mutex; + pthread_cond_t cond; + int count; + int tripCount; +} pthread_barrier_t; + + +extern int pthread_barrier_init(pthread_barrier_t *barrier, + const pthread_barrierattr_t *attr, + unsigned int count); + +extern int pthread_barrier_destroy(pthread_barrier_t *barrier); + +extern int pthread_barrier_wait(pthread_barrier_t *barrier); + +#endif // PTHREAD_BARRIER_H_ + +typedef int pthread_spinlock_t; + +extern int pthread_spin_init(pthread_spinlock_t *lock, int pshared); + +extern int pthread_spin_destroy(pthread_spinlock_t *lock); + +extern int pthread_spin_lock(pthread_spinlock_t *lock); + +extern int pthread_spin_unlock(pthread_spinlock_t *lock); + +#endif + + + namespace paddle { /** @@ -117,6 +161,29 @@ class SpinLock { /** * A simple wapper of semaphore which can only be shared in the same process. 
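 * On macOS the wrapper is backed by Grand Central Dispatch semaphores
 * (dispatch_semaphore_create / wait / signal), because unnamed POSIX
 * semaphores from sem_init() are not implemented there; hence the
 * #ifdef __APPLE__ branch that follows.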
*/ + +#ifdef __APPLE__ + +class Semaphore { +public: + explicit Semaphore(int initValue = 0) { + sem_ = dispatch_semaphore_create(initValue); + } + + ~Semaphore() { dispatch_release(sem_); } + bool timeWait(struct timespec* ts) { + dispatch_time_t m = dispatch_walltime(ts, 0); + return (0 == dispatch_semaphore_wait(sem_, m)); + } + void wait() { dispatch_semaphore_wait(sem_, DISPATCH_TIME_FOREVER); } + void post() { dispatch_semaphore_signal(sem_);} + +protected: + dispatch_semaphore_t sem_; +}; + +#else + class Semaphore { public: /** @@ -153,6 +220,8 @@ class Semaphore { sem_t sem_; }; +#endif + static_assert(sizeof(SpinLock) == 64, "Wrong padding"); /** diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index 4467fd784ec4e..397229d803df9 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -18,6 +18,12 @@ limitations under the License. */ #ifndef PADDLE_NO_PYTHON // must include the following two blocks, otherwise, // gcc compiler may produce warning +#ifdef __APPLE__ +#define _POSIX_SOURCE +#define _POSIX_C_SOURCE 200809L +#define _XOPEN_SOURCE 700 +#endif + #ifdef _POSIX_C_SOURCE #define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE #undef _POSIX_C_SOURCE @@ -28,12 +34,14 @@ limitations under the License. */ #endif #include #include -#ifndef _POSIX_C_SOURCE -#warning "no _POSIX_C_SOURCE defined in Python.h" -#endif -#ifndef _XOPEN_SOURCE -#warning "no _XOPEN_SOURCE defined in Python.h" -#endif + +// #ifndef _POSIX_C_SOURCE +// #warning "no _POSIX_C_SOURCE defined in Python.h" +// #endif +// #ifndef _XOPEN_SOURCE +// #warning "no _XOPEN_SOURCE defined in Python.h" +// #endif + #endif #include "paddle/utils/Util.h" From ad6cadeb9a5206ea9ea21679267a14687a278f80 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 9 Sep 2016 19:50:55 +0800 Subject: [PATCH 02/41] replace linux/tcp by netinet/tcp --- paddle/pserver/LightNetwork.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index c42d2dbe4bbf5..5dc04ee6c2ef1 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include @@ -25,12 +26,6 @@ limitations under the License. */ #include #include -#if defined(__OSX__) || defined(__APPLE__) -#include -#else -#include -#endif - #include "LightNetwork.h" #include "paddle/utils/Util.h" #include "paddle/utils/StringUtil.h" From 3f5ce64cab69a98c524d3d481b11e97516b4eb69 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 9 Sep 2016 19:50:55 +0800 Subject: [PATCH 03/41] fix StorageEngine::singleton and std::function type unmatched. --- paddle/math/Storage.cpp | 4 ++-- paddle/pserver/LightNetwork.cpp | 7 +------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 2bd3db2341638..0403c3521cf54 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -25,8 +25,8 @@ namespace paddle { // Initialization StorageEngine singleton. // Other modules may rely on storage management, // so StorageEngine need to be initialized before other modules. 
-// static InitFunction __init_storage_engine( -// StorageEngine::singleton, std::numeric_limits::max()); +static InitFunction __init_storage_engine([](){StorageEngine::singleton();}, + std::numeric_limits::max()); StorageEngine::StorageEngine() : cpuAllocator_(nullptr) { } diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index c42d2dbe4bbf5..5dc04ee6c2ef1 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include @@ -25,12 +26,6 @@ limitations under the License. */ #include #include -#if defined(__OSX__) || defined(__APPLE__) -#include -#else -#include -#endif - #include "LightNetwork.h" #include "paddle/utils/Util.h" #include "paddle/utils/StringUtil.h" From a3941cbc1f2b1087af7a5869b516ab29be3aaa36 Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 9 Sep 2016 21:22:40 +0800 Subject: [PATCH 04/41] remove weak attribute for internal FPGA --- paddle/gserver/gradientmachines/NeuralNetwork.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index f9da812027dc3..903922204343e 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -384,17 +384,10 @@ void NeuralNetwork::setOutputGrad(const std::vector& args) { } } -// extern NeuralNetwork* newCustomNeuralNetwork( -// const std::string& name, NeuralNetwork* network) __attribute__((weak)); - NeuralNetwork* NeuralNetwork::newNeuralNetwork( const std::string& name, NeuralNetwork* rootNetwork) { -// if (newCustomNeuralNetwork) { -// return newCustomNeuralNetwork(name, rootNetwork); -// } else { return new NeuralNetwork(name, rootNetwork); -// } } } // namespace paddle From 38166e29d7e1e3e682d90b107bead6599b9fbb73 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 10 Sep 2016 07:46:53 +0800 Subject: [PATCH 05/41] Add default openblas path on MAC OS --- cmake/cblas.cmake | 10 ++++++---- cmake/util.cmake | 28 ++++++++++++++-------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 617bd7ea7162b..5568f927572f5 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -44,8 +44,8 @@ set(ATLAS_LIB_SEARCH_PATHS /usr/lib /usr/lib/blas/atlas /usr/lib/atlas - /usr/lib/atlas-base # special for ubuntu 14.04. - ) + /usr/lib/atlas-base) # special for ubuntu 14.04. + find_path(ATLAS_INC_DIR NAMES cblas.h PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 @@ -65,12 +65,14 @@ set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") set(OPENBLAS_INCLUDE_SEARCH_PATHS ${OPENBLAS_ROOT}/include /usr/include - /usr/include/openblas) + /usr/include/openblas + /usr/local/opt/openblas/include) set(OPENBLAS_LIB_SEARCH_PATHS ${OPENBLAS_ROOT}/lib /usr/lib /usr/lib/blas/openblas - /usr/lib/openblas) + /usr/lib/openblas + /usr/local/opt/openblas/lib) find_path(OPENBLAS_INC_DIR NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) diff --git a/cmake/util.cmake b/cmake/util.cmake index bad44c7e9dda6..f3227d27c53c2 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -1,12 +1,9 @@ -# Some common routine for paddle compile. 
+# MAC OS does not contain start-up and whole-archive args if(APPLE) - # ------------------------------------------------------- - # OSX - # ------------------------------------------------------- - set(GROUP_START "-Wl,-force_load") + set(GROUP_START "") set(GROUP_END "") - set(ARCHIVE_START "-Wl,-force_load") + set(ARCHIVE_START "") set(ARCHIVE_END "") else() set(GROUP_START "-Wl,--start-group") @@ -17,6 +14,7 @@ else() endif() +# Some common routine for paddle compile. # target_circle_link_libraries # Link libraries to target which has circle dependencies. @@ -25,15 +23,16 @@ endif() # Rest Arguments: libraries which link together. function(target_circle_link_libraries TARGET_NAME) if(APPLE) - foreach(f ${ARGN}) - list(APPEND OSX_LIBRARIES "-Wl,-force_load" "${f}") - endforeach(f) + foreach(arg ${ARGN}) + list(APPEND OSX_LIBRARIES "-Wl,-force_load" "${arg}") + endforeach() target_link_libraries(${TARGET_NAME} - ${OSX_LIBRARIES}) + ${OSX_LIBRARIES} -lz) else() target_link_libraries(${TARGET_NAME} ${GROUP_START} ${ARGN} + -lz ${GROUP_END}) endif() endfunction() @@ -66,18 +65,20 @@ function(link_paddle_exe TARGET_NAME) if(PADDLE_WITH_INTERNAL) set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter) target_circle_link_libraries(${TARGET_NAME} + ${ARCHIVE_START} paddle_internal_gserver paddle_internal_owlqn + ${ARCHIVE_END} paddle_internal_parameter) else() set(INTERAL_LIBS "") endif() target_circle_link_libraries(${TARGET_NAME} -# ${ARCHIVE_START} + ${ARCHIVE_START} paddle_gserver ${METRIC_LIBS} -# ${ARCHIVE_END} + ${ARCHIVE_END} paddle_pserver paddle_trainer_lib paddle_network @@ -91,8 +92,7 @@ function(link_paddle_exe TARGET_NAME) ${CMAKE_THREAD_LIBS_INIT} ${CBLAS_LIBS} ${CMAKE_DL_LIBS} - ${INTERAL_LIBS} - ) + ${INTERAL_LIBS}) if(WITH_PYTHON) target_link_libraries(${TARGET_NAME} From 58f74e2ca17fd6b417355e2686ce00f3ff898fbd Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 10 Sep 2016 09:05:57 +0800 Subject: [PATCH 06/41] Add main entry for unit test files and replace memalign by posix_memalign --- paddle/math/Allocator.h | 4 ---- paddle/math/tests/test_SIMDFunctions.cpp | 6 ++++-- paddle/math/tests/test_perturbation.cpp | 5 +++++ paddle/parameter/tests/test_common.cpp | 10 ++++++---- paddle/utils/tests/test_StringUtils.cpp | 5 +++++ 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index 7d277b1c10d2b..ca8eadbc1aa42 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -48,14 +48,10 @@ class CpuAllocator : public Allocator { * @return Pointer to the allocated memory */ virtual void* alloc(size_t size) { - #if defined(__APPLE__) || defined(__OSX__) - return malloc(size); - #else void* ptr; posix_memalign(&ptr, 32ul, size); CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; return ptr; - #endif } /** diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp index 631d0516cf409..bae5d8c684d89 100644 --- a/paddle/math/tests/test_SIMDFunctions.cpp +++ b/paddle/math/tests/test_SIMDFunctions.cpp @@ -24,7 +24,7 @@ limitations under the License. 
*/ #include #include -#include +#include #include static constexpr size_t VECTOR_LEN = 3072; @@ -37,7 +37,9 @@ static std::mt19937 RandomEngine(time(0)); inline static std::unique_ptr NewVector(size_t len = VECTOR_LEN, size_t align = ALIGN) { - return std::unique_ptr((float*)memalign(align, len * sizeof(float))); + float* ptr; + posix_memalign((void**)&ptr, align, len * sizeof(float)); + return std::unique_ptr(ptr); } inline static std::unique_ptr NewRandomVector(size_t len = VECTOR_LEN, diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp index 4fa9bc72013da..51e346fef91bf 100644 --- a/paddle/math/tests/test_perturbation.cpp +++ b/paddle/math/tests/test_perturbation.cpp @@ -249,4 +249,9 @@ TEST_F(PerturbationTest, scale_test) { } } +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + #endif diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp index 3db96ccf941e3..4f92aec1d9671 100644 --- a/paddle/parameter/tests/test_common.cpp +++ b/paddle/parameter/tests/test_common.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include +#include #include #include @@ -124,9 +124,11 @@ void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer, TEST_F(CommonTest, sgdUpdate) { const size_t alignHeader[] = {0, 2, 3, 5, 7, 8}; for (auto& size : sizeVec_) { - real* gradientBuffer = (real*)memalign(32, sizeof(real) * size); - real* valueBuffer = (real*)memalign(32, sizeof(real) * size); - real* momentumBuffer = (real*)memalign(32, sizeof(real) * size); + real *gradientBuffer, *valueBuffer, *momentumBuffer; + posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size); + posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size); + posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size); + for (size_t i = 0; i < size; i++) { gradientBuffer[i] = 1.0; valueBuffer[i] = 2.0; diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp index b8636709e9b42..95290005ae983 100644 --- a/paddle/utils/tests/test_StringUtils.cpp +++ b/paddle/utils/tests/test_StringUtils.cpp @@ -22,3 +22,8 @@ TEST(StringUtil, to) { ASSERT_DEATH(paddle::str::to("12.45x23"), ".*"); ASSERT_DEATH(paddle::str::to(""), ".*"); } + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From 87f96f873aaea36387c3914fdb6f85d6ff86af06 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 10 Sep 2016 10:00:53 +0800 Subject: [PATCH 07/41] fix dynamic load PaddlePaddle for Mac OS --- paddle/cuda/src/hl_dso_loader.cc | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index 3558b163b5ae0..eee9984e07326 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -69,23 +69,40 @@ static inline void GetDsoHandleWithSearchPath( CHECK(nullptr != *dso_handle) << "For Gpu version of PaddlePaddle, it couldn't find CUDA library: " - << dlPath.c_str() << " Please make sure you already specify its path." - << "Note: for training data on Cpu using Gpu version of PaddlePaddle," - << "you must specify libcudart.so via LD_LIBRARY_PATH."; + << dlPath.c_str() << ". Please make sure you already specify its path. 
" + << "Note: for training data on Cpu using Gpu version of PaddlePaddle, " + << "you must specify libcudart via export LD_LIBRARY_PATH for Linux or " + << "export DYLD_LIBRARY_PATH for MAC OS."; } void GetCublasDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); +#else GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); +#endif } void GetCudnnDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); +#else GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); +#endif } void GetCudartDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle); +#else GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle); +#endif } void GetCurandDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); +#else GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); +#endif } From b664ca0321f139ad33577066d52a1d4f5e868a28 Mon Sep 17 00:00:00 2001 From: liaogang Date: Sat, 10 Sep 2016 15:36:13 +0800 Subject: [PATCH 08/41] auto-tuning SND/REV buff size on MAC OS --- paddle/pserver/LightNetwork.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index 5dc04ee6c2ef1..ff2875fc702ff 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -79,6 +79,7 @@ std::string getIpAddr(std::string &device) { * @note adjust some default sock option for better performance */ void setOption(int sockfd) { +#if !defined(__APPLE__) && !defined(__OSX__) int sendSize = FLAGS_sock_send_buf_size; int recvSize = FLAGS_sock_recv_buf_size; CHECK_GE( @@ -87,6 +88,8 @@ void setOption(int sockfd) { CHECK_GE( setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)), 0); +#endif + if (FLAGS_small_messages) { int optval = 1; CHECK_GE( From 70cceb0cdd1335fbba38a45d82e1333961a2659a Mon Sep 17 00:00:00 2001 From: liaogang Date: Sun, 11 Sep 2016 14:13:59 +0800 Subject: [PATCH 09/41] fix compile paddle swig bug on MAC OS --- paddle/api/paddle_ld_flags.py | 17 +++++++++++++++-- paddle/setup.py.in | 23 +++++++++++++++++++---- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py index 21b4ca1dd6171..bc1afc5898e82 100644 --- a/paddle/api/paddle_ld_flags.py +++ b/paddle/api/paddle_ld_flags.py @@ -15,6 +15,19 @@ try: from paddle_api_config import * import os.path + import platform + + system = platform.system().lower() + is_osx = (system == 'darwin') + is_win = (system == 'windows') + is_lin = (system == 'linux') + + if is_lin: + whole_start = "-Wl,--whole-archive" + whole_end = "-Wl,--no-whole-archive" + elif is_osx: + whole_start = "" + whole_end = "" LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"] PARENT_LIB_DIRS = ['proto'] @@ -56,9 +69,9 @@ def parent_dir_str(self): def libs_str(self): libs = [ - "-Wl,--whole-archive", + whole_start, "-lpaddle_gserver", - "-Wl,--no-whole-archive", + whole_end, "-lpaddle_pserver", "-lpaddle_trainer_lib", "-lpaddle_network", diff --git a/paddle/setup.py.in b/paddle/setup.py.in index da86eb795dc58..02ea9067431c6 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -17,6 
+17,14 @@ from setuptools import setup, Extension import numpy as np import api.paddle_ld_flags +import platform + +system = platform.system().lower() + +is_osx = (system == 'darwin') +is_win = (system == 'windows') +is_lin = (system == 'linux') + # The extra links will passed from COMAKE # because generate paddle LDFLAGS is too complicated to do in setup.py @@ -34,17 +42,24 @@ try: except: pass +if is_lin == True: + extra_links = ["-Xlinker", '-start-group'] + extra_links + ["-Xlinker", "-end-group"] +elif is_osx == True: + extra_links = ["-Wl,-all_load"] + extra_links + +include_dirs = [np.get_include(), "../"] # include numpy and paddle. + setup(name="py_paddle", version="@PADDLE_VERSION@", ext_modules=[ Extension('py_paddle._swig_paddle', # Build SWIG Extension. - ['Paddle_wrap.cxx'], - extra_link_args=["-Xlinker", '-start-group'] + - extra_links + ["-Xlinker", "-end-group"] + ['Paddle_wrap.cxx'], + include_dirs = include_dirs, + extra_link_args = extra_links ) ], packages=['py_paddle'], - include_dirs = [np.get_include(), "../"], # include numpy and paddle. + include_dirs = include_dirs, install_requires = [ 'numpy>=1.8.0', # The numpy is required. 'protobuf>=2.4.1' # The paddle protobuf version From c7ece60e2d3715a1b37973fd503eea0848a795d5 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 12 Sep 2016 19:43:12 +0800 Subject: [PATCH 10/41] add gettid syscall for MAC OS --- paddle/utils/Stat.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp index 14aae6909d404..aab5446a98c0b 100644 --- a/paddle/utils/Stat.cpp +++ b/paddle/utils/Stat.cpp @@ -23,10 +23,14 @@ namespace paddle { // return the thread id used by glog pid_t getTID() { -#ifndef __NR_gettid -#define __NR_gettid 224 -#endif - pid_t tid = syscall(__NR_gettid); + #if defined(__APPLE__) || defined(__OSX__) + pid_t tid = syscall(SYS_thread_selfid); + #elif defined(__LINUX__) + #ifndef __NR_gettid + #define __NR_gettid 224 + #endif + pid_t tid = syscall(__NR_gettid); + #endif CHECK_NE(tid, -1); return tid; } From 8d0214193e79c3b68e06ab460acb5a37c705aea8 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 12 Sep 2016 19:45:32 +0800 Subject: [PATCH 11/41] fix unit test bug when only one gpu --- paddle/trainer/tests/test_Trainer.cpp | 8 ++++++-- paddle/trainer/tests/test_TrainerOnePass.cpp | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp index 8ca9be71de9ac..2044279c2151f 100644 --- a/paddle/trainer/tests/test_Trainer.cpp +++ b/paddle/trainer/tests/test_Trainer.cpp @@ -62,7 +62,11 @@ TEST(checkGradient, multiGpu) { } } -TEST(checkGradient, parallel) { checkGradientTest(configFile4, true, true); } +TEST(checkGradient, parallel) { + if (hl_get_device_count() >= 2) { + checkGradientTest(configFile4, true, true); + } +} TEST(checkGradient, multiParallel) { FLAGS_allow_only_one_model_on_one_gpu = false; @@ -90,7 +94,7 @@ TEST(checkGradient, multi) { TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); } TEST(checkGradient, chunk) { - EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py")); + EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py")); checkGradientTest(configFile3, false, false); #ifndef PADDLE_ONLY_CPU checkGradientTest(configFile3, true, true); diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp index 6d8b8e0ca5c98..4554b94485f99 100644 --- 
a/paddle/trainer/tests/test_TrainerOnePass.cpp +++ b/paddle/trainer/tests/test_TrainerOnePass.cpp @@ -82,7 +82,11 @@ TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); } TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); } -TEST(trainerOnePass, parallel) { trainerOnePassTest(configFile2, true, true); } +TEST(trainerOnePass, parallel) { + if (hl_get_device_count() >= 2) { + trainerOnePassTest(configFile2, true, true); + } +} #endif // 2. test average_window. From d3eef0c9af6dec65a51b042f0bb1dbbdb030caaa Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 12 Sep 2016 21:31:54 +0800 Subject: [PATCH 12/41] reduce data_layer size of unit test to avoid cuda out of memory on MAC OS --- paddle/gserver/tests/concat_table_a.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/tests/concat_table_a.conf b/paddle/gserver/tests/concat_table_a.conf index 2e3c518883e20..a8ff70f883318 100644 --- a/paddle/gserver/tests/concat_table_a.conf +++ b/paddle/gserver/tests/concat_table_a.conf @@ -16,9 +16,9 @@ from paddle.trainer_config_helpers import * -settings(batch_size=1000) +settings(batch_size=300) -data = data_layer(name ="input", size=100000) +data = data_layer(name ="input", size=10000) # emb1 is equal to emb2, note that bias_attr=false # and act=LinearActivation() in default. From eb5cf64b5e7f1b12a6d48057b630067dfd73851c Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 12 Sep 2016 21:32:38 +0800 Subject: [PATCH 13/41] reduce data_layer size of unit test to avoid cuda out of memory on MAC OS --- paddle/gserver/tests/concat_table_b.conf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/tests/concat_table_b.conf b/paddle/gserver/tests/concat_table_b.conf index 6da24a5fbc55c..95d7c10f7b0cd 100644 --- a/paddle/gserver/tests/concat_table_b.conf +++ b/paddle/gserver/tests/concat_table_b.conf @@ -16,9 +16,9 @@ from paddle.trainer_config_helpers import * -settings(batch_size=1000) +settings(batch_size=300) -data = data_layer(name ="input", size=100000) +data = data_layer(name ="input", size=10000) proj1 = table_projection(input=data, size=128) From 0ad0e4ac4831c12187b6a2f0005742e84b85df8d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 12 Sep 2016 21:50:25 +0800 Subject: [PATCH 14/41] Add Wall Werror --- cmake/flags.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index c95d4063105e7..4b99e7f7fb6af 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -58,8 +58,8 @@ set(COMMON_FLAGS -fPIC -fno-omit-frame-pointer -Wall -# -Wextra -# -Werror + -Wextra + -Werror -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter From 738b6894bce2b84c87431f1f84edbde7be4f4f24 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 12 Sep 2016 22:13:25 +0800 Subject: [PATCH 15/41] Make whole-archive to less library * Also make use cmake find to zlib. * circle link in osx, use reverse link all libs instead. But maybe osx just don't care circle link. 
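For orientation, a sketch of how the reworked helper is invoked after this commit. The target name is hypothetical; ARCHIVE_START/ARCHIVE_END are the markers the function actually recognizes, and the paddle_* names are libraries the patch itself links:

# Hypothetical usage sketch. Archives between ARCHIVE_START and ARCHIVE_END
# are force-loaded: the helper expands them to
# -Wl,--whole-archive ... -Wl,--no-whole-archive on Linux and to a
# per-archive -Wl,-force_load on macOS, which has no group flags.
target_circle_link_libraries(my_trainer  # hypothetical target
    ARCHIVE_START
    paddle_gserver                       # must be linked in whole
    ARCHIVE_END
    paddle_pserver
    paddle_network)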
--- CMakeLists.txt | 1 + cmake/util.cmake | 70 +++++++++++++++++++++++++++++------------------- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 007f1f18bb655..99c6c0d373052 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,7 @@ find_package(CUDA QUIET) find_package(Protobuf REQUIRED) find_package(PythonLibs 2.7 REQUIRED) find_package(PythonInterp 2.7 REQUIRED) +find_package(ZLIB REQUIRED) find_package(NumPy) find_package(Threads REQUIRED) find_package(Glog) diff --git a/cmake/util.cmake b/cmake/util.cmake index f3227d27c53c2..a91e1d5643014 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -1,19 +1,3 @@ -# MAC OS does not contain start-up and whole-archive args -if(APPLE) - set(GROUP_START "") - set(GROUP_END "") - - set(ARCHIVE_START "") - set(ARCHIVE_END "") -else() - set(GROUP_START "-Wl,--start-group") - set(GROUP_END "-Wl,--end-group") - - set(ARCHIVE_START "-Wl,--whole-archive") - set(ARCHIVE_END "-Wl,--no-whole-archive") -endif() - - # Some common routine for paddle compile. # target_circle_link_libraries @@ -23,17 +7,46 @@ endif() # Rest Arguments: libraries which link together. function(target_circle_link_libraries TARGET_NAME) if(APPLE) + set(LIBS) + set(inArchive OFF) + set(libsInArgn) + foreach(arg ${ARGN}) - list(APPEND OSX_LIBRARIES "-Wl,-force_load" "${arg}") + if(${arg} STREQUAL "ARCHIVE_START") + set(inArchive ON) + elseif(${arg} STREQUAL "ARCHIVE_END") + set(inArchive OFF) + else() + if(inArchive) + list(APPEND LIBS "-Wl,-force_load") + endif() + list(APPEND LIBS ${arg}) + list(APPEND libsInArgn ${arg}) + endif() endforeach() + + list(REVERSE libsInArgn) target_link_libraries(${TARGET_NAME} - ${OSX_LIBRARIES} -lz) - else() + ${LIBS} + ${libsInArgn}) + + else() # LINUX + set(LIBS) + + foreach(arg ${ARGN}) + if(${arg} STREQUAL "ARCHIVE_START") + list(APPEND LIBS "-Wl,--whole-archive") + elseif(${arg} STREQUAL "ARCHIVE_END") + list(APPEND LIBS "-Wl,--no-whole-archive") + else() + list(APPEND LIBS ${arg}) + endif() + endforeach() + target_link_libraries(${TARGET_NAME} - ${GROUP_START} - ${ARGN} - -lz - ${GROUP_END}) + "-Wl,--start-group" + ${LIBS} + "-Wl,--end-group") endif() endfunction() @@ -65,20 +78,20 @@ function(link_paddle_exe TARGET_NAME) if(PADDLE_WITH_INTERNAL) set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter) target_circle_link_libraries(${TARGET_NAME} - ${ARCHIVE_START} + ARCHIVE_START paddle_internal_gserver paddle_internal_owlqn - ${ARCHIVE_END} + ARCHIVE_END paddle_internal_parameter) else() set(INTERAL_LIBS "") endif() target_circle_link_libraries(${TARGET_NAME} - ${ARCHIVE_START} + ARCHIVE_START paddle_gserver ${METRIC_LIBS} - ${ARCHIVE_END} + ARCHIVE_END paddle_pserver paddle_trainer_lib paddle_network @@ -92,7 +105,8 @@ function(link_paddle_exe TARGET_NAME) ${CMAKE_THREAD_LIBS_INIT} ${CBLAS_LIBS} ${CMAKE_DL_LIBS} - ${INTERAL_LIBS}) + ${INTERAL_LIBS} + ${ZLIB_LIBRARIES}) if(WITH_PYTHON) target_link_libraries(${TARGET_NAME} From db13fddcc4882cd5505e1c628770131b47cd70e1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 13 Sep 2016 10:27:37 +0800 Subject: [PATCH 16/41] Clean locks.h, Add arch Locks.cpp --- .gitignore | 1 + paddle/utils/CMakeLists.txt | 14 +++- paddle/utils/Locks.cpp | 106 ------------------------- paddle/utils/Locks.h | 126 ++++++++---------------------- paddle/utils/arch/linux/Locks.cpp | 85 ++++++++++++++++++++ paddle/utils/arch/osx/Locks.cpp | 112 ++++++++++++++++++++++++++ 6 files changed, 239 insertions(+), 205 deletions(-) delete mode 100644 
paddle/utils/Locks.cpp create mode 100644 paddle/utils/arch/linux/Locks.cpp create mode 100644 paddle/utils/arch/osx/Locks.cpp diff --git a/.gitignore b/.gitignore index 00368ede67d3d..801c76325c92e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *.DS_Store build/ +*.user diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 3c08f1e3055f8..0557b01e36f07 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -2,12 +2,18 @@ file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_SOURCES . *.cpp) - +if(APPLE) + file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp) +else() + file(GLOB UTIL_ARCH_SOURCES . arch/linux/*.cpp) +endif() add_library(paddle_utils STATIC - ${UTIL_SOURCES}) + ${UTIL_SOURCES} + ${UTIL_ARCH_SOURCES}) add_style_check_target(paddle_utils ${UTIL_HEADERS}) -add_style_check_target(paddle_utils ${UTIL_SOURCES}) +add_style_check_target(paddle_utils ${UTIL_SOURCES} + ${UTIL_ARCH_SOURCES}) add_dependencies(paddle_utils gen_proto_cpp) if(WITH_TESTING) add_subdirectory(tests) -endif() \ No newline at end of file +endif() diff --git a/paddle/utils/Locks.cpp b/paddle/utils/Locks.cpp deleted file mode 100644 index c2f58cf5764ef..0000000000000 --- a/paddle/utils/Locks.cpp +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef __APPLE__ -#include -#endif - -#ifdef __APPLE__ -#ifndef PTHREAD_BARRIER_H_ -#define PTHREAD_BARRIER_H_ - -#include -#include - -typedef int pthread_barrierattr_t; -typedef struct { - pthread_mutex_t mutex; - pthread_cond_t cond; - int count; - int tripCount; -} pthread_barrier_t; - -int pthread_barrier_init(pthread_barrier_t *barrier, - const pthread_barrierattr_t *attr, unsigned int count) { - if (count == 0) { - errno = EINVAL; - return -1; - } - if (pthread_mutex_init(&barrier->mutex, 0) < 0) { - return -1; - } - if (pthread_cond_init(&barrier->cond, 0) < 0) { - pthread_mutex_destroy(&barrier->mutex); - return -1; - } - barrier->tripCount = count; - barrier->count = 0; - - return 0; -} - -int pthread_barrier_destroy(pthread_barrier_t *barrier) { - pthread_cond_destroy(&barrier->cond); - pthread_mutex_destroy(&barrier->mutex); - return 0; -} - -int pthread_barrier_wait(pthread_barrier_t *barrier) { - pthread_mutex_lock(&barrier->mutex); - ++(barrier->count); - if (barrier->count >= barrier->tripCount) { - barrier->count = 0; - pthread_cond_broadcast(&barrier->cond); - pthread_mutex_unlock(&barrier->mutex); - return 1; - } else { - pthread_cond_wait(&barrier->cond, &(barrier->mutex)); - pthread_mutex_unlock(&barrier->mutex); - return 0; - } -} - -#endif // PTHREAD_BARRIER_H_ - -typedef int pthread_spinlock_t; - -int pthread_spin_init(pthread_spinlock_t *lock, int pshared) { - __asm__ __volatile__("" ::: "memory"); - *lock = 0; - return 0; -} - -int pthread_spin_destroy(pthread_spinlock_t *lock) { - return 0; -} - -int pthread_spin_lock(pthread_spinlock_t *lock) { - while (1) { - int i; - for (i=0; i < 10000; i++) { - if (__sync_bool_compare_and_swap(lock, 0, 1)) { - return 0; - } - } - sched_yield(); - } -} - -int pthread_spin_unlock(pthread_spinlock_t *lock) { - __asm__ __volatile__("" ::: "memory"); - *lock = 0; - return 0; -} - -#endif // __APPLE__ diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h index e7b0b77081f36..1fc0363d34597 100644 --- a/paddle/utils/Locks.h +++ b/paddle/utils/Locks.h @@ -16,56 +16,11 @@ limitations under the License. */ #pragma once #include -#include #include -#include - #include #include -#ifdef __APPLE__ -#include -#endif - -#ifdef __APPLE__ -#ifndef PTHREAD_BARRIER_H_ -#define PTHREAD_BARRIER_H_ - -#include -#include - -typedef int pthread_barrierattr_t; -typedef struct { - pthread_mutex_t mutex; - pthread_cond_t cond; - int count; - int tripCount; -} pthread_barrier_t; - - -extern int pthread_barrier_init(pthread_barrier_t *barrier, - const pthread_barrierattr_t *attr, - unsigned int count); - -extern int pthread_barrier_destroy(pthread_barrier_t *barrier); - -extern int pthread_barrier_wait(pthread_barrier_t *barrier); - -#endif // PTHREAD_BARRIER_H_ - -typedef int pthread_spinlock_t; - -extern int pthread_spin_init(pthread_spinlock_t *lock, int pshared); - -extern int pthread_spin_destroy(pthread_spinlock_t *lock); - -extern int pthread_spin_lock(pthread_spinlock_t *lock); - -extern int pthread_spin_unlock(pthread_spinlock_t *lock); - -#endif - - +#include "DisableCopy.h" namespace paddle { @@ -142,58 +97,44 @@ class ReadLockGuard { * which means it will keep trying to lock until lock on successfully. * The SpinLock disable copy. 
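 * The implementation now hides behind a SpinLockPrivate pointer with
 * per-platform definitions: pthread_spinlock_t on Linux and OSSpinLock on
 * macOS, in the paddle/utils/arch/linux/Locks.cpp and
 * paddle/utils/arch/osx/Locks.cpp files this commit introduces.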
*/ +class SpinLockPrivate; class SpinLock { public: - SpinLock() { pthread_spin_init(&lock_, 0); } - ~SpinLock() { pthread_spin_destroy(&lock_); } - SpinLock(const SpinLock&) = delete; - SpinLock& operator=(const SpinLock&) = delete; + DISABLE_COPY(SpinLock); + SpinLock(); + ~SpinLock(); // std::mutext interface - void lock() { pthread_spin_lock(&lock_); } - void unlock() { pthread_spin_unlock(&lock_); } + void lock(); + void unlock(); -protected: - pthread_spinlock_t lock_; - char padding_[64 - sizeof(pthread_spinlock_t)]; +private: + SpinLockPrivate* m; }; /** * A simple wapper of semaphore which can only be shared in the same process. */ - -#ifdef __APPLE__ - +class SemaphorePrivate; class Semaphore { public: - explicit Semaphore(int initValue = 0) { - sem_ = dispatch_semaphore_create(initValue); - } - - ~Semaphore() { dispatch_release(sem_); } - bool timeWait(struct timespec* ts) { - dispatch_time_t m = dispatch_walltime(ts, 0); - return (0 == dispatch_semaphore_wait(sem_, m)); - } - void wait() { dispatch_semaphore_wait(sem_, DISPATCH_TIME_FOREVER); } - void post() { dispatch_semaphore_signal(sem_);} - -protected: - dispatch_semaphore_t sem_; -}; + //! Disable copy & assign + Semaphore(const Semaphore& other) = delete; + Semaphore& operator= (const Semaphore&& other) = delete; -#else + //! Enable move. + Semaphore(Semaphore&& other): m(std::move(other.m)) { + } -class Semaphore { public: /** * @brief Construct Function. * @param[in] initValue the initial value of the * semaphore, default 0. */ - explicit Semaphore(int initValue = 0) { sem_init(&sem_, 0, initValue); } + explicit Semaphore(int initValue = 0); - ~Semaphore() { sem_destroy(&sem_); } + ~Semaphore(); /** * @brief The same as wait(), except if the decrement can not @@ -203,43 +144,38 @@ class Semaphore { * @return ture if the decrement proceeds before ts, * else return false. */ - bool timeWait(struct timespec* ts) { return (0 == sem_timedwait(&sem_, ts)); } + bool timeWait(struct timespec* ts); /** * @brief decrement the semaphore. If the semaphore's value is 0, then call blocks. */ - void wait() { sem_wait(&sem_); } + void wait(); /** * @brief increment the semaphore. If the semaphore's value * greater than 0, wake up a thread blocked in wait(). */ - void post() { sem_post(&sem_); } + void post(); -protected: - sem_t sem_; +private: + SemaphorePrivate* m; }; -#endif - -static_assert(sizeof(SpinLock) == 64, "Wrong padding"); - /** * A simple wrapper of thread barrier. * The ThreadBarrier disable copy. */ +class ThreadBarrierPrivate; class ThreadBarrier { public: + DISABLE_COPY(ThreadBarrier); + /** * @brief Construct Function. Initialize the barrier should * wait for count threads in wait(). */ - explicit ThreadBarrier(int count) { - pthread_barrier_init(&barrier_, NULL, count); - } - ~ThreadBarrier() { pthread_barrier_destroy(&barrier_); } - ThreadBarrier(const ThreadBarrier&) = delete; - ThreadBarrier& operator=(const ThreadBarrier&) = delete; + explicit ThreadBarrier(int count); + ~ThreadBarrier(); /** * @brief . @@ -247,10 +183,10 @@ class ThreadBarrier { * then wake up all the count - 1 threads and continue run together. * Else block the thread until waked by other thread . 
*/ - void wait() { pthread_barrier_wait(&barrier_); } + void wait(); -protected: - pthread_barrier_t barrier_; +private: + ThreadBarrierPrivate* m; }; /** diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp new file mode 100644 index 0000000000000..939a04dc0fc2c --- /dev/null +++ b/paddle/utils/arch/linux/Locks.cpp @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/util/Locks.h" +#include +#include + +namespace paddle { +class SemaphorePrivate { +public: + sem_t sem; +}; + +Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) { + sem_init(&m->sem, 0, initValue); +} + +Semaphore::~Semaphore() { + sem_destroy(&m->sem); +} + +bool Semaphore::timeWait(struct timespec* ts) { + return (0 == sem_timedwait(&m->sem, ts)); +} + +void Semaphore::wait() { + sem_wait(&m->sem); +} + +void Semaphore::post() { + sem_post(&m->sem); +} + + +class SpinLockPrivate { +public: + inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); } + inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); } + pthread_spinlock_t lock_; + char padding_[64 - sizeof(pthread_spinlock_t)]; +}; + +SpinLock::SpinLock():m(new SpinLockPrivate()) {} + + +SpinLock::~SpinLock() { delete m; } + +void SpinLock::lock() { + pthread_spin_lock(&m->lock_); +} + +void SpinLock::unlock() { + pthread_spin_unlock(&m->lock_); +} + +class ThreadBarrierPrivate { +public: + pthread_barrier_t barrier_; +}; + +ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate()) { + pthread_barrier_init(&m->barrier_, nullptr, count); +} + +ThreadBarrier::~ThreadBarrier() { + pthread_barrier_destroy(&m->barrier_); + delete m; +} + +void ThreadBarrier::wait() { + pthread_barrier_wait(&m->barrier_); +} + +} // namespace paddle \ No newline at end of file diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp new file mode 100644 index 0000000000000..5e0411624fd60 --- /dev/null +++ b/paddle/utils/arch/osx/Locks.cpp @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/utils/Locks.h" +#include "paddle/utils/Logging.h" +#include +#include +namespace paddle { +class SemaphorePrivate { +public: + ~SemaphorePrivate() { + dispatch_release(sem); + } + + dispatch_semaphore_t sem; +}; + +Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) { + m->sem = dispatch_semaphore_create(initValue); +} + +Semaphore::~Semaphore() { + delete m; +} + +bool Semaphore::timeWait(timespec *ts) { + dispatch_time_t tm = dispatch_walltime(ts, 0); + return (0 == dispatch_semaphore_wait(m->sem, tm)); +} + +void Semaphore::wait() { + dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER); +} + +void Semaphore::post() { + dispatch_semaphore_signal(m->sem); +} + +class SpinLockPrivate { +public: + SpinLockPrivate(): lock_(0) {} + + OSSpinLock lock_; + char padding_[64 - sizeof(OSSpinLock)]; // Padding to cache line size +}; + +SpinLock::SpinLock(): m(new SpinLockPrivate()) {} +SpinLock::~SpinLock() { delete m; } + +void SpinLock::lock() { + OSSpinLockLock(&m->lock_); +} + +void SpinLock::unlock() { + OSSpinLockUnlock(&m->lock_); +} + + +class ThreadBarrierPrivate { +public: + pthread_mutex_t mutex; + pthread_cond_t cond; + int count; + int tripCount; + + inline explicit ThreadBarrierPrivate(int cnt):count(0), tripCount(cnt) { + CHECK_NE(cnt, 0); + CHECK_GE(pthread_mutex_init(&mutex, 0), 0); + CHECK_GE(pthread_cond_init(&cond, 0), 0); + } + + inline ~ThreadBarrierPrivate() { + pthread_cond_destroy(&cond); + pthread_mutex_destroy(&mutex); + } + + /** + * @brief wait + * @return true if the last wait + */ + inline bool wait() { + pthread_mutex_lock(&mutex); + ++count; + if (count > tripCount) { + count = 0; + pthread_cond_broadcast(&cond); + pthread_mutex_unlock(&mutex); + return true; + } else { + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + return false; + } + } +}; + +ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::~ThreadBarrier() { delete m; } +void ThreadBarrier::wait() { m->wait(); } + +} // namespace paddle From a238b11f835653a86341d26738c95035aebd271d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 13 Sep 2016 11:14:58 +0800 Subject: [PATCH 17/41] Fix linux compile --- cmake/util.cmake | 5 +++-- paddle/math/Allocator.h | 2 +- paddle/math/tests/test_SIMDFunctions.cpp | 2 +- paddle/parameter/tests/test_common.cpp | 8 +++++--- paddle/utils/Stat.cpp | 2 +- paddle/utils/arch/linux/Locks.cpp | 6 +++--- 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/cmake/util.cmake b/cmake/util.cmake index a91e1d5643014..5f2f4a075cc57 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -104,9 +104,10 @@ function(link_paddle_exe TARGET_NAME) ${PROTOBUF_LIBRARY} ${CMAKE_THREAD_LIBS_INIT} ${CBLAS_LIBS} - ${CMAKE_DL_LIBS} ${INTERAL_LIBS} - ${ZLIB_LIBRARIES}) + ${ZLIB_LIBRARIES} + ${CMAKE_DL_LIBS} + ) if(WITH_PYTHON) target_link_libraries(${TARGET_NAME} diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index ca8eadbc1aa42..f7aa60380f23e 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -49,7 +49,7 @@ class CpuAllocator : public Allocator { */ virtual void* alloc(size_t size) { void* ptr; - posix_memalign(&ptr, 32ul, size); + CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; return ptr; } diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp index bae5d8c684d89..491b0cda7b9e1 100644 --- a/paddle/math/tests/test_SIMDFunctions.cpp +++ 
b/paddle/math/tests/test_SIMDFunctions.cpp @@ -38,7 +38,7 @@ static std::mt19937 RandomEngine(time(0)); inline static std::unique_ptr NewVector(size_t len = VECTOR_LEN, size_t align = ALIGN) { float* ptr; - posix_memalign((void**)&ptr, align, len * sizeof(float)); + CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0); return std::unique_ptr(ptr); } diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp index 4f92aec1d9671..1a22abf7cf801 100644 --- a/paddle/parameter/tests/test_common.cpp +++ b/paddle/parameter/tests/test_common.cpp @@ -125,9 +125,11 @@ TEST_F(CommonTest, sgdUpdate) { const size_t alignHeader[] = {0, 2, 3, 5, 7, 8}; for (auto& size : sizeVec_) { real *gradientBuffer, *valueBuffer, *momentumBuffer; - posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size); - posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size); - posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size); + CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size), + 0); + CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0); + CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size), + 0); for (size_t i = 0; i < size; i++) { gradientBuffer[i] = 1.0; diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp index aab5446a98c0b..ff6e8ade2cd48 100644 --- a/paddle/utils/Stat.cpp +++ b/paddle/utils/Stat.cpp @@ -25,7 +25,7 @@ namespace paddle { pid_t getTID() { #if defined(__APPLE__) || defined(__OSX__) pid_t tid = syscall(SYS_thread_selfid); - #elif defined(__LINUX__) + #else #ifndef __NR_gettid #define __NR_gettid 224 #endif diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp index 939a04dc0fc2c..347ae64c26dfd 100644 --- a/paddle/utils/arch/linux/Locks.cpp +++ b/paddle/utils/arch/linux/Locks.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/util/Locks.h" +#include "paddle/utils/Locks.h" #include #include @@ -35,7 +35,7 @@ bool Semaphore::timeWait(struct timespec* ts) { } void Semaphore::wait() { - sem_wait(&m->sem); + sem_wait(&m->sem); } void Semaphore::post() { @@ -82,4 +82,4 @@ void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); } -} // namespace paddle \ No newline at end of file +} // namespace paddle From 55a1a7588be825072299ec198a00970df98639d8 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 13 Sep 2016 14:23:38 +0800 Subject: [PATCH 18/41] Disable a unittest will use large memory by grep --- paddle/math/tests/test_perturbation.cpp | 5 ----- paddle/utils/tests/CMakeLists.txt | 8 +++++--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp index 51e346fef91bf..4fa9bc72013da 100644 --- a/paddle/math/tests/test_perturbation.cpp +++ b/paddle/math/tests/test_perturbation.cpp @@ -249,9 +249,4 @@ TEST_F(PerturbationTest, scale_test) { } } -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - #endif diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt index be59a785ecf36..5b31cd393dd1f 100644 --- a/paddle/utils/tests/CMakeLists.txt +++ b/paddle/utils/tests/CMakeLists.txt @@ -9,6 +9,8 @@ add_executable( test_CustomStackTracePrint.cpp ) link_paddle_exe(test_CustomStackTracePrint) -add_test(NAME test_CustomStackTracePrint - COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +if(NOT APPLE) + add_test(NAME test_CustomStackTracePrint + COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif() From 1df0c7b1495c6cf2c13b7e8d3b2fc965fdf08748 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 14 Sep 2016 09:46:58 +0800 Subject: [PATCH 19/41] avoid link failed on unit test for mac os --- paddle/math/tests/test_CpuGpuVector.cpp | 6 ++++++ paddle/math/tests/test_matrixCompare.cpp | 5 +++++ paddle/math/tests/test_perturbation.cpp | 6 ++++++ paddle/math/tests/test_sparseMatrixCompare.cpp | 6 ++++++ 4 files changed, 23 insertions(+) diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp index 7b50b020cda93..61b424e3c6647 100644 --- a/paddle/math/tests/test_CpuGpuVector.cpp +++ b/paddle/math/tests/test_CpuGpuVector.cpp @@ -84,4 +84,10 @@ int main(int argc, char** argv) { return ret; } +#else + +int main(int argc, char const* argv[]) { + return 0; +} + #endif diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index fe8eacc2efbc5..ac50e7b7499d8 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -1851,5 +1851,10 @@ int main(int argc, char** argv) { initMain(argc, argv); return RUN_ALL_TESTS(); } +#else + +int main(int argc, char const* argv[]) { + return 0; +} #endif diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp index 51e346fef91bf..050f2ca9ced80 100644 --- a/paddle/math/tests/test_perturbation.cpp +++ b/paddle/math/tests/test_perturbation.cpp @@ -254,4 +254,10 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } +#else + +int main(int argc, char const* argv[]) { + return 0; +} + #endif diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp index 
6048dd8112229..b3467e4982e24 100644 --- a/paddle/math/tests/test_sparseMatrixCompare.cpp +++ b/paddle/math/tests/test_sparseMatrixCompare.cpp @@ -178,4 +178,10 @@ int main(int argc, char** argv) { return ret; } +#else + +int main(int argc, char const* argv[]) { + return 0; +} + #endif From d8366a67fa91586f93d737a0af78422074914829 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 14 Sep 2016 11:30:30 +0800 Subject: [PATCH 20/41] fix bug on thread barrier --- paddle/math/tests/test_perturbation.cpp | 6 ++++++ paddle/utils/arch/osx/Locks.cpp | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp index 51e346fef91bf..050f2ca9ced80 100644 --- a/paddle/math/tests/test_perturbation.cpp +++ b/paddle/math/tests/test_perturbation.cpp @@ -254,4 +254,10 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } +#else + +int main(int argc, char const* argv[]) { + return 0; +} + #endif diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp index 5e0411624fd60..8fe482ddddd3c 100644 --- a/paddle/utils/arch/osx/Locks.cpp +++ b/paddle/utils/arch/osx/Locks.cpp @@ -49,7 +49,7 @@ void Semaphore::post() { class SpinLockPrivate { public: - SpinLockPrivate(): lock_(0) {} + SpinLockPrivate(): lock_(OS_SPINLOCK_INIT) {} OSSpinLock lock_; char padding_[64 - sizeof(OSSpinLock)]; // Padding to cache line size @@ -92,7 +92,7 @@ class ThreadBarrierPrivate { inline bool wait() { pthread_mutex_lock(&mutex); ++count; - if (count > tripCount) { + if (count >= tripCount) { count = 0; pthread_cond_broadcast(&cond); pthread_mutex_unlock(&mutex); From 0ba302f7d7680de974690567c003f07154b81d4b Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 14 Sep 2016 14:51:15 +0800 Subject: [PATCH 21/41] fix bug on paddle api when WITH_DOUBLE --- paddle/api/Matrix.cpp | 32 +++++++++++++++--------------- paddle/api/PaddleAPI.h | 45 +++++++++++++++++++++--------------------- paddle/api/Util.cpp | 4 ++-- paddle/api/Vector.cpp | 30 ++++++++++++++-------------- 4 files changed, 56 insertions(+), 55 deletions(-) diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp index 6a79f83495a56..c40a47f3accf9 100644 --- a/paddle/api/Matrix.cpp +++ b/paddle/api/Matrix.cpp @@ -44,7 +44,7 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) { return m; } -Matrix* Matrix::createDense(const std::vector& data, size_t height, +Matrix* Matrix::createDense(const std::vector& data, size_t height, size_t width, bool useGpu) { auto m = new Matrix(); m->m->mat = paddle::Matrix::create(height, width, useGpu); @@ -52,7 +52,7 @@ Matrix* Matrix::createDense(const std::vector& data, size_t height, return m; } -Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2, +Matrix* Matrix::createCpuDenseFromNumpy(real* data, int dim1, int dim2, bool copy) { auto m = new Matrix(); if (copy) { @@ -64,7 +64,7 @@ Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2, return m; } -Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) { +Matrix* Matrix::createGpuDenseFromNumpy(real* data, int dim1, int dim2) { auto m = new Matrix(); m->m->mat = paddle::Matrix::create(dim1, dim2, false, true); m->m->mat->copyFrom(data, dim1 * dim2); @@ -86,7 +86,7 @@ size_t Matrix::getHeight() const { return m->mat->getHeight(); } size_t Matrix::getWidth() const { return m->mat->getWidth(); } -float Matrix::get(size_t x, size_t y) const throw(RangeError) { +real Matrix::get(size_t x, size_t y) const 
throw(RangeError) { if (x > this->getWidth() || y > this->getHeight()) { RangeError e; throw e; @@ -94,8 +94,8 @@ float Matrix::get(size_t x, size_t y) const throw(RangeError) { return m->mat->getElement(x, y); } -void Matrix::set(size_t x, size_t y, float val) throw(RangeError, - UnsupportError) { +void Matrix::set(size_t x, size_t y, real val) throw(RangeError, + UnsupportError) { if (x > this->getWidth() || y > this->getHeight()) { RangeError e; throw e; @@ -193,10 +193,10 @@ FloatArray Matrix::getData() const { auto rawMat = m->mat.get(); if (dynamic_cast(rawMat->getMemoryHandle().get())) { // is gpu. then copy data - float* data = rawMat->getData(); + real* data = rawMat->getData(); size_t len = rawMat->getElementCnt(); - float* cpuData = new float[len]; - hl_memcpy_device2host(cpuData, data, len * sizeof(float)); + real* cpuData = new real[len]; + hl_memcpy_device2host(cpuData, data, len * sizeof(real)); FloatArray ret_val(cpuData, len); ret_val.needFree = true; return ret_val; @@ -208,7 +208,7 @@ FloatArray Matrix::getData() const { void Matrix::sparseCopyFrom( const std::vector& rows, const std::vector& cols, - const std::vector& vals) throw(UnsupportError) { + const std::vector& vals) throw(UnsupportError) { auto cpuSparseMat = std::dynamic_pointer_cast(m->mat); if (cpuSparseMat != nullptr) { @@ -217,7 +217,7 @@ void Matrix::sparseCopyFrom( // <<" ValSize = "<copyFrom(const_cast&>(rows), const_cast&>(cols), - const_cast&>(vals)); + const_cast&>(vals)); } else { UnsupportError e; throw e; @@ -226,7 +226,7 @@ void Matrix::sparseCopyFrom( void* Matrix::getSharedPtr() const { return &m->mat; } -void Matrix::toNumpyMatInplace(float** view_data, int* dim1, +void Matrix::toNumpyMatInplace(real** view_data, int* dim1, int* dim2) throw(UnsupportError) { auto cpuMat = std::dynamic_pointer_cast(m->mat); if (cpuMat) { @@ -237,9 +237,9 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1, throw UnsupportError(); } } -void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, +void Matrix::copyToNumpyMat(real** view_m_data, int* dim1, int* dim2) throw(UnsupportError) { - static_assert(sizeof(paddle::real) == sizeof(float), + static_assert(sizeof(paddle::real) == sizeof(real), "Currently PaddleAPI only support for single " "precision version of paddle."); if (this->isSparse()) { @@ -247,7 +247,7 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, } else { *dim1 = m->mat->getHeight(); *dim2 = m->mat->getWidth(); - *view_m_data = new float[(*dim1) * (*dim2)]; + *view_m_data = new real[(*dim1) * (*dim2)]; if (auto cpuMat = dynamic_cast(m->mat.get())) { auto src = cpuMat->getData(); auto dest = *view_m_data; @@ -264,7 +264,7 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, } } -void Matrix::copyFromNumpyMat(float* data, int dim1, +void Matrix::copyFromNumpyMat(real* data, int dim1, int dim2) throw(UnsupportError, RangeError) { if (isSparse()) { throw UnsupportError(); diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 79487c4cf4d41..69f3240a77974 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include "paddle/utils/GlobalConstants.h" +#include "paddle/utils/TypeDefs.h" /// Import PaddlePaddle's enumeration into global namespace. using namespace paddle::enumeration_wrapper; // NOLINT @@ -55,10 +56,10 @@ class UnsupportError {}; /// This type will map to python's list of float. 
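/// A small usage sketch (illustrative, not from this patch): the receiver
/// owns `buf` only when `needFree` is true, e.g.
///   FloatArray arr = mat.getData();
///   // ... consume arr.buf[0 .. arr.length) ...
///   if (arr.needFree) delete[] arr.buf;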
struct FloatArray { - const float* buf; + const real* buf; const size_t length; bool needFree; // true if the buf is dynamic alloced. - FloatArray(const float* b, const size_t l); + FloatArray(const real* b, const size_t l); }; /// This type will map to python's list of int @@ -71,11 +72,11 @@ struct IntArray { /// This type will map to python's list of (int, float) struct IntWithFloatArray { - const float* valBuf; + const real* valBuf; const int* idxBuf; const size_t length; bool needFree; - IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false); + IntWithFloatArray(const real* v, const int* i, size_t l, bool f = false); }; enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 }; @@ -121,7 +122,7 @@ class Matrix { * @param data list of float should be passed in python. * @note the value will be copy into a new matrix. */ - static Matrix* createDense(const std::vector& data, size_t height, + static Matrix* createDense(const std::vector& data, size_t height, size_t width, bool useGpu = false); /** @@ -133,11 +134,11 @@ class Matrix { * @param copy true if copy into a new matrix, false will create * matrix inplace. */ - static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2, + static Matrix* createCpuDenseFromNumpy(real* data, int dim1, int dim2, bool copy = false); /// Create Gpu Dense Matrix from numpy matrix, dtype=float32 - static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2); + static Matrix* createGpuDenseFromNumpy(real* data, int dim1, int dim2); /** * Cast to numpy matrix. @@ -153,15 +154,15 @@ class Matrix { * numpy_mat = m.toNumpyMat() * @endcode */ - void toNumpyMatInplace(float** view_data, int* dim1, + void toNumpyMatInplace(real** view_data, int* dim1, int* dim2) throw(UnsupportError); /// Copy To numpy mat. - void copyToNumpyMat(float** view_m_data, int* dim1, + void copyToNumpyMat(real** view_m_data, int* dim1, int* dim2) throw(UnsupportError); /// Copy From Numpy Mat - void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError, + void copyFromNumpyMat(real* data, int dim1, int dim2) throw(UnsupportError, RangeError); /// return true if this matrix is sparse. @@ -180,9 +181,9 @@ class Matrix { size_t getWidth() const; - float get(size_t x, size_t y) const throw(RangeError); + real get(size_t x, size_t y) const throw(RangeError); - void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError); + void set(size_t x, size_t y, real val) throw(RangeError, UnsupportError); /// return type is list of float FloatArray getData() const; @@ -194,8 +195,8 @@ class Matrix { */ void sparseCopyFrom(const std::vector& rows, const std::vector& cols, - const std::vector& values = - std::vector()) throw(UnsupportError); + const std::vector& values = + std::vector()) throw(UnsupportError); bool isGpu() const; @@ -227,33 +228,33 @@ class Vector { * * It will create a new vector, and copy data into it. */ - static Vector* create(const std::vector& data, bool useGpu = false); + static Vector* create(const std::vector& data, bool useGpu = false); /** * Create Cpu Vector from numpy array, which dtype=float32 * * If copy is false, it will create vector inplace. 
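   * In other words, with copy=false the Vector aliases the numpy buffer
   * (so the numpy array must outlive the Vector), while copy=true
   * duplicates the data into storage owned by Paddle.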
*/ - static Vector* createCpuVectorFromNumpy(float* data, int dim, + static Vector* createCpuVectorFromNumpy(real* data, int dim, bool copy = false); /// Create Gpu Vector from numpy array, which dtype=float32 - static Vector* createGpuVectorFromNumpy(float* data, int dim); + static Vector* createGpuVectorFromNumpy(real* data, int dim); /// Cast to numpy array inplace. - void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError); + void toNumpyArrayInplace(real** view_data, int* dim1) throw(UnsupportError); /// Copy to numpy array. - void copyToNumpyArray(float** view_m_data, int* dim1); + void copyToNumpyArray(real** view_m_data, int* dim1); /// Copy from numpy array. - void copyFromNumpyArray(float* data, int dim); + void copyFromNumpyArray(real* data, int dim); /// __getitem__ in python - float get(const size_t idx) const throw(RangeError, UnsupportError); + real get(const size_t idx) const throw(RangeError, UnsupportError); /// __setitem__ in python - void set(const size_t idx, float val) throw(RangeError, UnsupportError); + void set(const size_t idx, real val) throw(RangeError, UnsupportError); /// Return is GPU vector or not. bool isGpu() const; diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp index 8a6741078f2f1..fe89a62cd3908 100644 --- a/paddle/api/Util.cpp +++ b/paddle/api/Util.cpp @@ -31,13 +31,13 @@ void initPaddle(int argc, char** argv) { feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); } -FloatArray::FloatArray(const float* b, const size_t l) +FloatArray::FloatArray(const real* b, const size_t l) : buf(b), length(l), needFree(false) {} IntArray::IntArray(const int* b, const size_t l, bool f) : buf(b), length(l), needFree(f) {} -IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l, +IntWithFloatArray::IntWithFloatArray(const real* v, const int* i, size_t l, bool f) : valBuf(v), idxBuf(i), length(l), needFree(f) {} diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp index 1affc1a5fefb8..b61eb7934b781 100644 --- a/paddle/api/Vector.cpp +++ b/paddle/api/Vector.cpp @@ -140,7 +140,7 @@ struct VectorPrivate { paddle::VectorPtr vec; void safeAccessData(const size_t idx, - const std::function& func) const + const std::function& func) const throw(RangeError, UnsupportError) { auto cpuVec = std::dynamic_pointer_cast(vec); if (cpuVec != nullptr) { @@ -170,7 +170,7 @@ Vector* Vector::createZero(size_t sz, bool useGpu) { return retVec; } -Vector* Vector::create(const std::vector& data, bool useGpu) { +Vector* Vector::create(const std::vector& data, bool useGpu) { auto retVec = new Vector(); retVec->m->vec = paddle::Vector::create(data.size(), useGpu); retVec->m->vec->copyFrom(data.data(), data.size()); @@ -188,7 +188,7 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) { } } -Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) { +Vector* Vector::createCpuVectorFromNumpy(real* data, int dim, bool copy) { CHECK_GT(dim, 0); auto retVec = new Vector(); if (copy) { @@ -200,7 +200,7 @@ Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) { return retVec; } -Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) { +Vector* Vector::createGpuVectorFromNumpy(real* data, int dim) { CHECK_GT(dim, 0); auto retVec = new Vector(); retVec->m->vec = paddle::Vector::create((size_t)dim, true); @@ -208,7 +208,7 @@ Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) { return retVec; } -void Vector::toNumpyArrayInplace(float** view_data, +void Vector::toNumpyArrayInplace(real** 
view_data, int* dim1) throw(UnsupportError) { auto v = std::dynamic_pointer_cast(m->vec); if (v != nullptr) { @@ -219,20 +219,20 @@ void Vector::toNumpyArrayInplace(float** view_data, } } -void Vector::copyToNumpyArray(float** view_m_data, int* dim1) { +void Vector::copyToNumpyArray(real** view_m_data, int* dim1) { *dim1 = m->vec->getSize(); - *view_m_data = new float[*dim1]; + *view_m_data = new real[*dim1]; if (auto cpuVec = dynamic_cast(m->vec.get())) { - std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1)); + std::memcpy(*view_m_data, cpuVec->getData(), sizeof(real) * (*dim1)); } else if (auto gpuVec = dynamic_cast(m->vec.get())) { hl_memcpy_device2host(*view_m_data, gpuVec->getData(), - sizeof(float) * (*dim1)); + sizeof(real) * (*dim1)); } else { LOG(INFO) << "Unexpected situation"; } } -void Vector::copyFromNumpyArray(float* data, int dim) { +void Vector::copyFromNumpyArray(real* data, int dim) { m->vec->resize(dim); m->vec->copyFrom(data, dim); } @@ -241,15 +241,15 @@ bool Vector::isGpu() const { return std::dynamic_pointer_cast(m->vec) != nullptr; } -float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) { - float r; - m->safeAccessData(idx, [&](float& o) { r = o; }); +real Vector::get(const size_t idx) const throw(RangeError, UnsupportError) { + real r; + m->safeAccessData(idx, [&](real& o) { r = o; }); return r; } -void Vector::set(const size_t idx, float val) throw(RangeError, +void Vector::set(const size_t idx, real val) throw(RangeError, UnsupportError) { - m->safeAccessData(idx, [&](float& o) { o = val; }); + m->safeAccessData(idx, [&](real& o) { o = val; }); } size_t Vector::getSize() const { return m->vec->getSize(); } From 8a379fce1651640f36bbc979653c62aeedcc1c9c Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 14 Sep 2016 14:52:33 +0800 Subject: [PATCH 22/41] add sse instructions support for double on MAC OS --- paddle/cuda/include/hl_matrix_type.cuh | 2 ++ paddle/cuda/include/hl_sse_matrix_kernel.cuh | 3 +++ 2 files changed, 5 insertions(+) diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 85b60cc313fa7..6917f36290141 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -27,6 +27,8 @@ typedef float4 vecType; typedef double2 vecType; #endif #else +#include +#include #include #ifndef HPPL_TYPE_DOUBLE typedef __m128 vecType; diff --git a/paddle/cuda/include/hl_sse_matrix_kernel.cuh b/paddle/cuda/include/hl_sse_matrix_kernel.cuh index d774150c21e61..c90d49e4adeb5 100644 --- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh +++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh @@ -25,6 +25,9 @@ limitations under the License. 
*/ #define VECTOR_LEN 4 #define VECTOR_SET _mm_set_ps1 #else +#if defined(__APPLE__) || defined(__OSX__) +#define _mm_set_pd1 _mm_set1_pd +#endif /* number of double in vector */ #define VECTOR_LEN 2 #define VECTOR_SET _mm_set_pd1 From 1f8c21978d750c3e541dd91cd43cdb232b4ee36e Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 14 Sep 2016 14:54:04 +0800 Subject: [PATCH 23/41] replace fabsf() using std::abs --- paddle/math/tests/test_matrixUtil.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h index 1310e509877a0..fa682164aa864 100644 --- a/paddle/math/tests/test_matrixUtil.h +++ b/paddle/math/tests/test_matrixUtil.h @@ -124,8 +124,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a, if (a->getValueType() == FLOAT_VALUE) { real aVal = a->getValue()[r]; real bVal = b->getValue()[r]; - if (fabs(aVal - bVal) > err) { - if ((fabsf(aVal - bVal) / fabsf(aVal)) > (err / 10.0f)) { + if (std::abs(aVal - bVal) > err) { + if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { LOG(INFO) << "a=" << aVal << "\t" << "b=" << bVal; count++; } @@ -141,8 +141,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a, if (a->getValueType() == FLOAT_VALUE) { real aVal = a->getValue()[r]; real bVal = b->getValue()[r]; - if (fabs(aVal - bVal) > err) { - if ((fabsf(aVal - bVal) / fabsf(aVal)) > (err / 10.0f)) { + if (std::abs(aVal - bVal) > err) { + if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { count++; } } @@ -173,8 +173,8 @@ void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) { for (int j = 0; j < width; j++) { real a = data1[i * width + j]; real b = data2[i * width + j]; - if (fabs(a - b) > err) { - if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) { + if (std::abs(a - b) > err) { + if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) { count++; } } From 75beeaf743ae717a91aab5f2a6c5cd43f771b557 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 19 Sep 2016 13:37:31 +0800 Subject: [PATCH 24/41] Fix unit test stack trace bug on MAC OS --- paddle/utils/tests/test_CustomStackTrace.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp index 26ca4c678a650..3e665021471cb 100644 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -45,6 +45,7 @@ void testNormalImpl(const std::function 0) { startBarrier.wait(); + sleep(1); doneBarrier.wait(); ASSERT_TRUE(tracer.empty()); } From 4e37b226f4bc79709d81c2d00cabaac02876d53b Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 19 Sep 2016 13:40:44 +0800 Subject: [PATCH 25/41] Revise member variable in private barrier class --- paddle/utils/arch/osx/Locks.cpp | 35 +++++++++++++++++---------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp index 8fe482ddddd3c..47e44e9d7c114 100644 --- a/paddle/utils/arch/osx/Locks.cpp +++ b/paddle/utils/arch/osx/Locks.cpp @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include namespace paddle { + class SemaphorePrivate { public: ~SemaphorePrivate() { @@ -69,20 +70,20 @@ void SpinLock::unlock() { class ThreadBarrierPrivate { public: - pthread_mutex_t mutex; - pthread_cond_t cond; - int count; - int tripCount; + pthread_mutex_t mutex_; + pthread_cond_t cond_; + int count_; + int tripCount_; - inline explicit ThreadBarrierPrivate(int cnt):count(0), tripCount(cnt) { + inline explicit ThreadBarrierPrivate(int cnt):count_(0), tripCount_(cnt) { CHECK_NE(cnt, 0); - CHECK_GE(pthread_mutex_init(&mutex, 0), 0); - CHECK_GE(pthread_cond_init(&cond, 0), 0); + CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); + CHECK_GE(pthread_cond_init(&cond_, 0), 0); } inline ~ThreadBarrierPrivate() { - pthread_cond_destroy(&cond); - pthread_mutex_destroy(&mutex); + pthread_cond_destroy(&cond_); + pthread_mutex_destroy(&mutex_); } /** @@ -90,16 +91,16 @@ class ThreadBarrierPrivate { * @return true if the last wait */ inline bool wait() { - pthread_mutex_lock(&mutex); - ++count; - if (count >= tripCount) { - count = 0; - pthread_cond_broadcast(&cond); - pthread_mutex_unlock(&mutex); + pthread_mutex_lock(&mutex_); + ++count_; + if (count_ >= tripCount_) { + count_ = 0; + pthread_cond_broadcast(&cond_); + pthread_mutex_unlock(&mutex_); return true; } else { - pthread_cond_wait(&cond, &mutex); - pthread_mutex_unlock(&mutex); + pthread_cond_wait(&cond_, &mutex_); + pthread_mutex_unlock(&mutex_); return false; } } From 2d13462a2ccb358cc0b09ddb1b12ef23a68e9742 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 19 Sep 2016 17:14:11 +0800 Subject: [PATCH 26/41] Fix incompatible on CUDA atomicAdd operation --- paddle/cuda/include/hl_device_functions.cuh | 49 ++++++++++++--------- paddle/cuda/include/hl_gpu_lstm.cuh | 6 +-- paddle/cuda/src/hl_cuda_lstm.cu | 6 +-- paddle/cuda/src/hl_cuda_matrix.cu | 4 +- paddle/cuda/src/hl_cuda_sequence.cu | 2 +- paddle/cuda/src/hl_cuda_sparse.cuh | 10 ++--- paddle/cuda/src/hl_table_apply.cu | 2 +- 7 files changed, 44 insertions(+), 35 deletions(-) diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/cuda/include/hl_device_functions.cuh index 27e3f450c5c1c..88d950d6c1713 100755 --- a/paddle/cuda/include/hl_device_functions.cuh +++ b/paddle/cuda/include/hl_device_functions.cuh @@ -16,28 +16,37 @@ limitations under the License. 
*/ #ifndef HL_DEVICE_FUNCTIONS_CUH_ #define HL_DEVICE_FUNCTIONS_CUH_ -namespace hppl { - -static __inline__ __device__ double atomicAdd(double* address, double val) { - // NOLINTNEXTLINE - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - old = atomicCAS(address_as_ull, - assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - } while (assumed != old); - - return __longlong_as_double(old); -} +namespace paddle { + +template +inline __device__ T paddleAtomicAdd(T* address, T val); -} // namespace hppl +template <> +inline __device__ float paddleAtomicAdd(float* address, float val) { + return atomicAdd(address, val); +} -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 -using hppl::atomicAdd; +template <> +inline __device__ double paddleAtomicAdd(double* address, double val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 + return atomicAdd(address, val); +#else + // NOLINTNEXTLINE + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + + return __longlong_as_double(old); #endif +} +} // namespace paddle + #endif /* HL_DEVICE_FUNCTIONS_CUH_ */ diff --git a/paddle/cuda/include/hl_gpu_lstm.cuh b/paddle/cuda/include/hl_gpu_lstm.cuh index 2ca33f2b13a1f..07806e11c18a2 100644 --- a/paddle/cuda/include/hl_gpu_lstm.cuh +++ b/paddle/cuda/include/hl_gpu_lstm.cuh @@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op, if (isBatch) { if (value.prevStateValue) { - if (grad.checkIgGrad) atomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad); - if (grad.checkFgGrad) atomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad); + if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad); + if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad); } - if (grad.checkOgGrad) atomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad); + if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad); } else { if (value.prevStateValue) { if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad; diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu index 64699c9f6d450..cf009620bf69d 100644 --- a/paddle/cuda/src/hl_cuda_lstm.cu +++ b/paddle/cuda/src/hl_cuda_lstm.cu @@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue, /* TODO: Temporary save & merger in another kernel */ if (frameIdy == 1) { - if (checkIgGrad) atomicAdd(checkIgGrad+frameIdx, rCheckGrad); + if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad); } else if (frameIdy == 2) { - if (checkFgGrad) atomicAdd(checkFgGrad+frameIdx, rCheckGrad); + if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad); } else if (frameIdy == 3) { - if (checkOgGrad) atomicAdd(checkOgGrad+frameIdx, rCheckGrad); + if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad); } } diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index ecc44944e4fa1..38e4f16217c2a 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -623,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad, prevGradY[index] += scale * grad[ty] * prevOutX[index] * reciprocal; } else { - 
atomicAdd(prevGradY + index, + paddle::paddleAtomicAdd(prevGradY + index, scale * grad[ty] * prevOutX[index] * reciprocal); } } @@ -640,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad, (prevOutX[index] * reciprocalXY - prevOutY[index] * reciprocalSquareSumY); } else { - atomicAdd(prevGradY + index, output[ty] * grad[ty] * + paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] * (prevOutX[index] * reciprocalXY - prevOutY[index] * reciprocalSquareSumY)); } diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index f88a2682fd060..e028880156e5b 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output, if (AddRow == 0) { outputData[i] += tableData[i]; } else { - atomicAdd(&tableData[i], outputData[i]); + paddle::paddleAtomicAdd(&tableData[i], outputData[i]); } } } diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh index becb6c66492c1..db5c9ce979885 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cuh +++ b/paddle/cuda/src/hl_cuda_sparse.cuh @@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d, if (index_n_t < dimN) { real tmp; tmp = alpha*a_r*b_r[n]; - atomicAdd(C_d_r, tmp); + paddle::paddleAtomicAdd(C_d_r, tmp); C_d_r += CU_CSC_MUL_DENSE_THREAD_X; index_n_t += CU_CSC_MUL_DENSE_THREAD_X; } @@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d, if (index_n_t < dimN) { real tmp; tmp = alpha*a_r*b_r[n]; - atomicAdd(C_d_r, tmp); + paddle::paddleAtomicAdd(C_d_r, tmp); C_d_r += CU_CSC_MUL_DENSE_THREAD_X; index_n_t += CU_CSC_MUL_DENSE_THREAD_X; } @@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d, for (int n=0; n < CU_DM_CSR_N; n++) { if (index_m_t++ < dimM) { tmp = alpha * b_r * a_r[n]; - atomicAdd(C_d_r, tmp); + paddle::paddleAtomicAdd(C_d_r, tmp); C_d_r += dimN; } } @@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d, for (int n=0; n < CU_DM_CSR_N; n++) { if (index_m_t++ < dimM) { tmp = alpha * b_r * a_r[n]; - atomicAdd(C_d_r, tmp); + paddle::paddleAtomicAdd(C_d_r, tmp); C_d_r += dimN; } } @@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val, for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) { int colIdx = csr_col[idx]; real val = csr_val[idx]; - atomicAdd(a_val + colIdx, val); + paddle::paddleAtomicAdd(a_val + colIdx, val); } } diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu index 05335c5f835fc..52ee4610edf67 100644 --- a/paddle/cuda/src/hl_table_apply.cu +++ b/paddle/cuda/src/hl_table_apply.cu @@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo, real *tab = table + tableId * ldt; for (int i = idx; i < dim; i += blockDimX) { if (AddRow) { - atomicAdd(&tab[i], out[i]); + paddle::paddleAtomicAdd(&tab[i], out[i]); } else { out[i] += tab[i]; } From 75cbf5ea9636ef74ec2682fd221db9a6ce5ec719 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 20 Sep 2016 12:02:33 +0800 Subject: [PATCH 27/41] add gitignore for VS CODE on MAC OS --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 801c76325c92e..7e21ba0b750df 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ *.DS_Store build/ *.user + +.vscode +.idea \ No newline at end of file From 2daa05c0b5a14d9b02267a7fff1d48f409f0cb50 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 20 Sep 2016 16:09:05 +0800 Subject: [PATCH 28/41] add build on MAC OSX docs --- 
doc/build/build_from_source.md | 142 +++++++++++++++++++++++++++++++--
 1 file changed, 136 insertions(+), 6 deletions(-)

diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md
index a191d31318aa6..c71ff260f8d0a 100644
--- a/doc/build/build_from_source.md
+++ b/doc/build/build_from_source.md
@@ -1,7 +1,11 @@
 Build and Install
 =================

-## Requirement
+* [1. Requirement](#Requirement)
+* [2. Build on Ubuntu](#ubuntu)
+* [3. Build on Mac OS X](#mac)
+
+## <a name="Requirement">Requirement</a>

 ### Dependencies
@@ -28,7 +32,7 @@ PaddlePaddle also support some build options, you have to install related librar

 - **WITH_STYLE_CHECK**: Style check for source code

-## Building on Ubuntu14.04
+## <a name="ubuntu">Building on Ubuntu14.04</a>

 ### Install Dependencies
@@ -44,7 +48,7 @@ sudo apt-get install libgflags-dev
 sudo apt-get install libgtest-dev
 sudo pip install wheel
 pushd /usr/src/gtest
-cmake .
+cmake ..
 make
 sudo cp *.a /usr/lib
 popd
@@ -102,19 +106,19 @@ Here are some examples of cmake command with different options:

 **only cpu**

 ```bash
-cmake -DWITH_GPU=OFF -DWITH_DOC=OFF
+cmake -DWITH_GPU=OFF -DWITH_DOC=OFF ..
 ```

 **gpu**

 ```bash
-cmake -DWITH_GPU=ON -DWITH_DOC=OFF
+cmake -DWITH_GPU=ON -DWITH_DOC=OFF ..
 ```

 **gpu with doc and swig**

 ```bash
-cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
+cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON ..
 ```

 Finally, you can download the source code and build:
@@ -139,3 +143,129 @@ And if you set WITH_SWIG_PY=ON, you have to install related python predict api a
 ```bash
 pip install /opt/paddle/share/wheels/*.whl
 ```
+## <a name="mac">Building on Mac OS X</a>
+
+### Prerequisites
+This guide is based on Mac OS X 10.11 (El Capitan). Note that if you are running an up-to-date version of OS X,
+you will already have Python 2.7.10 and Numpy 1.8 installed.
+
+The best option is to use the package manager Homebrew to handle installations and upgrades for you.
+To install Homebrew, first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command:
+
+```bash
+# install brew
+/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+# install pip
+easy_install pip
+```
+
+### Install Dependencies
+
+- **CPU Dependencies**
+
+```bash
+# Install fundamental dependencies
+brew install glog gflags cmake protobuf openblas

+# Install google test on Mac OS X
+# Download gtest 1.7.0
+wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz
+tar -xvf release-1.7.0.tar.gz && cd googletest-release-1.7.0
+# Build gtest
+mkdir build && cd build && cmake ..
+make
+# Install gtest library
+sudo cp -r ../include/gtest /usr/local/include/
+sudo cp lib*.a /usr/local/lib
+```
+
+
+- **GPU Dependencies (optional)**
+
+If you need to build the GPU version, the first thing you need is a machine with an NVIDIA GPU and CUDA installed,
+and you also need to install cuDNN.
+
+You can download the CUDA toolkit and cuDNN from the NVIDIA website:
+
+```bash
+https://developer.nvidia.com/cuda-downloads
+https://developer.nvidia.com/cudnn
+```
+You can copy the cuDNN files into the CUDA toolkit directory, for instance:
+
+```bash
+sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local
+sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib/libcudnn*
+```
+Then you need to set the DYLD\_LIBRARY\_PATH, CUDA\_HOME and PATH environment variables in ~/.bashrc.
+
+```bash
+# assuming CUDA is installed under the default /usr/local/cuda
+export CUDA_HOME=/usr/local/cuda
+export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
+export PATH=/usr/local/cuda/bin:$PATH
+```
+
+- **Python Dependencies (optional)**
+
+If you want to compile PaddlePaddle with the Python prediction API, you need to add -DWITH_SWIG_PY=ON to the cmake command and install swig first:
+
+```bash
+brew install swig
+```
+
+- **Doc Dependencies (optional)**
+
+If you want to compile PaddlePaddle with documentation, you need to add -DWITH_DOC=ON to the cmake command and install these first:
+
+```bash
+pip install 'sphinx>=1.4.0'
+pip install sphinx_rtd_theme breathe recommonmark
+brew install doxygen
+```
+
+### Build and Install
+
+CMake searches the system default paths for dependent libraries first.
+After installing some optional libraries, the corresponding build options will be turned on automatically (for instance, glog, gtest and gflags).
+If a library is not found, you have to set the following variables manually via the CMake command (CUDNN_ROOT, ATLAS_ROOT, MKL_ROOT, OPENBLAS_ROOT).
+
+Here are some examples of the CMake command with different options:
+
+**only cpu**
+
+```bash
+cmake -DWITH_GPU=OFF -DWITH_DOC=OFF ..
+```
+
+**gpu**
+
+```bash
+cmake -DWITH_GPU=ON -DWITH_DOC=OFF ..
+```
+
+**gpu with doc and swig**
+
+```bash
+cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON ..
+```
+
+Finally, you can download the source code and build:
+
+```bash
+git clone https://github.com/baidu/Paddle paddle
+cd paddle
+mkdir build
+cd build
+# you can add build options here, such as:
+cmake -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<installation path> ..
+# please use sudo make install, if you want
+# to install PaddlePaddle into the system
+make -j `sysctl -n hw.ncpu` && make install
+# add the PaddlePaddle installation path to PATH
+export PATH=<installation path>/bin:$PATH
+```
+**Note**
+
+If you set WITH_SWIG_PY=ON, you also have to install the Python prediction API at the same time:
+
+```bash
+sudo pip install /opt/paddle/share/wheels/*.whl
+```
\ No newline at end of file
From 536bf7d87f22ba2dbc84288a35341af523c9169d Mon Sep 17 00:00:00 2001
From: liaogang
Date: Tue, 20 Sep 2016 20:23:58 +0800
Subject: [PATCH 29/41] Fix PoolAllocator unordered_map bug on Mac OS X

* use std::map to replace unordered_map on Mac

---
 paddle/math/PoolAllocator.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h
index 22af0eb893753..a50deece3a853 100644
--- a/paddle/math/PoolAllocator.h
+++ b/paddle/math/PoolAllocator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <memory>
 #include <mutex>
 #include <unordered_map>
+#include <map>
 #include "Allocator.h"

 namespace paddle {
@@ -52,7 +53,11 @@ class PoolAllocator {
   void printAll();
   std::unique_ptr<Allocator> allocator_;
   std::mutex mutex_;
+#if defined(__APPLE__) || defined(__OSX__)
+  std::map<size_t, std::vector<void*>> pool_;
+#else
   std::unordered_map<size_t, std::vector<void*>> pool_;
+#endif
   size_t sizeLimit_;
   size_t poolMemorySize_;
   std::string name_;
From 8b1c76af3c0ffdcc548bbf952f00f6b0a5186871 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Tue, 20 Sep 2016 20:31:01 +0800
Subject: [PATCH 30/41] Comment out LOG(INFO) in createPythonClass

* it made the unit test fail.
--- paddle/utils/PythonUtil.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp index 78c3a80674f9c..9ee7a29aad0b6 100644 --- a/paddle/utils/PythonUtil.cpp +++ b/paddle/utils/PythonUtil.cpp @@ -144,12 +144,12 @@ PyObjectPtr createPythonClass( const std::map& kwargs) { PyGuard guard; PyObjectPtr pyModule(PyImport_ImportModule(moduleName.c_str())); - LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str(); + // LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str(); CHECK_PY(pyModule) << "Import module " << moduleName << " failed."; PyObjectPtr pyDict(PyModule_GetDict(pyModule.get())); CHECK_PY(pyDict) << "Get Dict failed."; PyObjectPtr pyClass(PyDict_GetItemString(pyDict.get(), className.c_str())); - LOG(INFO) << "createPythonClass className.c_str():" << className.c_str(); + // LOG(INFO) << "createPythonClass className.c_str():" << className.c_str(); CHECK_PY(pyClass) << "Import class " << className << " failed."; PyObjectPtr argsObjectList(PyTuple_New(args.size())); for (size_t i = 0; i < args.size(); ++i) { From 7ff8e76229325b6f5dcb11dce83fff30493ea6bf Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 20 Sep 2016 20:51:02 +0800 Subject: [PATCH 31/41] Shrink batch size on unit test for Mac OS X --- paddle/gserver/tests/test_LayerGrad.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 5c80eb546cfaf..3150c31e4900c 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -50,7 +50,7 @@ TEST(Operator, dot_mul) { TEST(Projection, context) { for (auto contextStart : {-5, -3, -1, 0, 3}) { for (auto contextLength : {1, 2, 5, 7}) { - for (auto batchSize : {1, 2, 5, 20, 100}) { + for (auto batchSize : {1, 2, 5, 20, 50}) { for (auto trainablePadding : {false, true}) { LOG(INFO) << " contextStart=" << contextStart << " contextLength=" << contextLength From aaed5cfccc6460b32c7884ac1a07391bb7b3d869 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 20 Sep 2016 22:26:45 +0800 Subject: [PATCH 32/41] revert real into float for swig API --- paddle/api/Matrix.cpp | 34 ++++++++++++++++---------------- paddle/api/PaddleAPI.h | 44 +++++++++++++++++++++--------------------- paddle/api/Util.cpp | 4 ++-- paddle/api/Vector.cpp | 30 ++++++++++++++-------------- 4 files changed, 56 insertions(+), 56 deletions(-) diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp index c40a47f3accf9..9ae3716fa862c 100644 --- a/paddle/api/Matrix.cpp +++ b/paddle/api/Matrix.cpp @@ -44,7 +44,7 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) { return m; } -Matrix* Matrix::createDense(const std::vector& data, size_t height, +Matrix* Matrix::createDense(const std::vector& data, size_t height, size_t width, bool useGpu) { auto m = new Matrix(); m->m->mat = paddle::Matrix::create(height, width, useGpu); @@ -52,7 +52,7 @@ Matrix* Matrix::createDense(const std::vector& data, size_t height, return m; } -Matrix* Matrix::createCpuDenseFromNumpy(real* data, int dim1, int dim2, +Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2, bool copy) { auto m = new Matrix(); if (copy) { @@ -64,7 +64,7 @@ Matrix* Matrix::createCpuDenseFromNumpy(real* data, int dim1, int dim2, return m; } -Matrix* Matrix::createGpuDenseFromNumpy(real* data, int dim1, int dim2) { +Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) { auto m = 
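  // (Illustrative aside, not from the patch: a hypothetical SWIG-side
  // caller passes a float32 numpy array here, which is why the exported
  // signature stays `float*` even for a WITH_DOUBLE build of paddle.)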
new Matrix(); m->m->mat = paddle::Matrix::create(dim1, dim2, false, true); m->m->mat->copyFrom(data, dim1 * dim2); @@ -86,7 +86,7 @@ size_t Matrix::getHeight() const { return m->mat->getHeight(); } size_t Matrix::getWidth() const { return m->mat->getWidth(); } -real Matrix::get(size_t x, size_t y) const throw(RangeError) { +float Matrix::get(size_t x, size_t y) const throw(RangeError) { if (x > this->getWidth() || y > this->getHeight()) { RangeError e; throw e; @@ -94,7 +94,7 @@ real Matrix::get(size_t x, size_t y) const throw(RangeError) { return m->mat->getElement(x, y); } -void Matrix::set(size_t x, size_t y, real val) throw(RangeError, +void Matrix::set(size_t x, size_t y, float val) throw(RangeError, UnsupportError) { if (x > this->getWidth() || y > this->getHeight()) { RangeError e; @@ -193,10 +193,10 @@ FloatArray Matrix::getData() const { auto rawMat = m->mat.get(); if (dynamic_cast(rawMat->getMemoryHandle().get())) { // is gpu. then copy data - real* data = rawMat->getData(); + float* data = rawMat->getData(); size_t len = rawMat->getElementCnt(); - real* cpuData = new real[len]; - hl_memcpy_device2host(cpuData, data, len * sizeof(real)); + float* cpuData = new float[len]; + hl_memcpy_device2host(cpuData, data, len * sizeof(float)); FloatArray ret_val(cpuData, len); ret_val.needFree = true; return ret_val; @@ -208,7 +208,7 @@ FloatArray Matrix::getData() const { void Matrix::sparseCopyFrom( const std::vector& rows, const std::vector& cols, - const std::vector& vals) throw(UnsupportError) { + const std::vector& vals) throw(UnsupportError) { auto cpuSparseMat = std::dynamic_pointer_cast(m->mat); if (cpuSparseMat != nullptr) { @@ -217,7 +217,7 @@ void Matrix::sparseCopyFrom( // <<" ValSize = "<copyFrom(const_cast&>(rows), const_cast&>(cols), - const_cast&>(vals)); + const_cast&>(vals)); } else { UnsupportError e; throw e; @@ -226,7 +226,7 @@ void Matrix::sparseCopyFrom( void* Matrix::getSharedPtr() const { return &m->mat; } -void Matrix::toNumpyMatInplace(real** view_data, int* dim1, +void Matrix::toNumpyMatInplace(float** view_data, int* dim1, int* dim2) throw(UnsupportError) { auto cpuMat = std::dynamic_pointer_cast(m->mat); if (cpuMat) { @@ -237,9 +237,9 @@ void Matrix::toNumpyMatInplace(real** view_data, int* dim1, throw UnsupportError(); } } -void Matrix::copyToNumpyMat(real** view_m_data, int* dim1, +void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, int* dim2) throw(UnsupportError) { - static_assert(sizeof(paddle::real) == sizeof(real), + static_assert(sizeof(float) == sizeof(float), "Currently PaddleAPI only support for single " "precision version of paddle."); if (this->isSparse()) { @@ -247,16 +247,16 @@ void Matrix::copyToNumpyMat(real** view_m_data, int* dim1, } else { *dim1 = m->mat->getHeight(); *dim2 = m->mat->getWidth(); - *view_m_data = new real[(*dim1) * (*dim2)]; + *view_m_data = new float[(*dim1) * (*dim2)]; if (auto cpuMat = dynamic_cast(m->mat.get())) { auto src = cpuMat->getData(); auto dest = *view_m_data; - std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); + std::memcpy(dest, src, sizeof(float) * (*dim1) * (*dim2)); } else if (auto gpuMat = dynamic_cast(m->mat.get())) { auto src = gpuMat->getData(); auto dest = *view_m_data; hl_memcpy_device2host(dest, src, - sizeof(paddle::real) * (*dim1) * (*dim2)); + sizeof(float) * (*dim1) * (*dim2)); } else { LOG(WARNING) << "Unexpected Situation"; throw UnsupportError(); @@ -264,7 +264,7 @@ void Matrix::copyToNumpyMat(real** view_m_data, int* dim1, } } -void Matrix::copyFromNumpyMat(real* data, 
int dim1, +void Matrix::copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError, RangeError) { if (isSparse()) { throw UnsupportError(); diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 69f3240a77974..b3140617af188 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -56,10 +56,10 @@ class UnsupportError {}; /// This type will map to python's list of float. struct FloatArray { - const real* buf; + const float* buf; const size_t length; bool needFree; // true if the buf is dynamic alloced. - FloatArray(const real* b, const size_t l); + FloatArray(const float* b, const size_t l); }; /// This type will map to python's list of int @@ -72,11 +72,11 @@ struct IntArray { /// This type will map to python's list of (int, float) struct IntWithFloatArray { - const real* valBuf; + const float* valBuf; const int* idxBuf; const size_t length; bool needFree; - IntWithFloatArray(const real* v, const int* i, size_t l, bool f = false); + IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false); }; enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 }; @@ -122,7 +122,7 @@ class Matrix { * @param data list of float should be passed in python. * @note the value will be copy into a new matrix. */ - static Matrix* createDense(const std::vector& data, size_t height, + static Matrix* createDense(const std::vector& data, size_t height, size_t width, bool useGpu = false); /** @@ -134,11 +134,11 @@ class Matrix { * @param copy true if copy into a new matrix, false will create * matrix inplace. */ - static Matrix* createCpuDenseFromNumpy(real* data, int dim1, int dim2, + static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2, bool copy = false); /// Create Gpu Dense Matrix from numpy matrix, dtype=float32 - static Matrix* createGpuDenseFromNumpy(real* data, int dim1, int dim2); + static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2); /** * Cast to numpy matrix. @@ -154,15 +154,15 @@ class Matrix { * numpy_mat = m.toNumpyMat() * @endcode */ - void toNumpyMatInplace(real** view_data, int* dim1, + void toNumpyMatInplace(float** view_data, int* dim1, int* dim2) throw(UnsupportError); /// Copy To numpy mat. - void copyToNumpyMat(real** view_m_data, int* dim1, + void copyToNumpyMat(float** view_m_data, int* dim1, int* dim2) throw(UnsupportError); /// Copy From Numpy Mat - void copyFromNumpyMat(real* data, int dim1, int dim2) throw(UnsupportError, + void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError, RangeError); /// return true if this matrix is sparse. @@ -181,9 +181,9 @@ class Matrix { size_t getWidth() const; - real get(size_t x, size_t y) const throw(RangeError); + float get(size_t x, size_t y) const throw(RangeError); - void set(size_t x, size_t y, real val) throw(RangeError, UnsupportError); + void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError); /// return type is list of float FloatArray getData() const; @@ -195,8 +195,8 @@ class Matrix { */ void sparseCopyFrom(const std::vector& rows, const std::vector& cols, - const std::vector& values = - std::vector()) throw(UnsupportError); + const std::vector& values = + std::vector()) throw(UnsupportError); bool isGpu() const; @@ -228,33 +228,33 @@ class Vector { * * It will create a new vector, and copy data into it. 
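   * Because the data is copied, the input list does not need to stay
   * alive after the call returns.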
*/ - static Vector* create(const std::vector& data, bool useGpu = false); + static Vector* create(const std::vector& data, bool useGpu = false); /** * Create Cpu Vector from numpy array, which dtype=float32 * * If copy is false, it will create vector inplace. */ - static Vector* createCpuVectorFromNumpy(real* data, int dim, + static Vector* createCpuVectorFromNumpy(float* data, int dim, bool copy = false); /// Create Gpu Vector from numpy array, which dtype=float32 - static Vector* createGpuVectorFromNumpy(real* data, int dim); + static Vector* createGpuVectorFromNumpy(float* data, int dim); /// Cast to numpy array inplace. - void toNumpyArrayInplace(real** view_data, int* dim1) throw(UnsupportError); + void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError); /// Copy to numpy array. - void copyToNumpyArray(real** view_m_data, int* dim1); + void copyToNumpyArray(float** view_m_data, int* dim1); /// Copy from numpy array. - void copyFromNumpyArray(real* data, int dim); + void copyFromNumpyArray(float* data, int dim); /// __getitem__ in python - real get(const size_t idx) const throw(RangeError, UnsupportError); + float get(const size_t idx) const throw(RangeError, UnsupportError); /// __setitem__ in python - void set(const size_t idx, real val) throw(RangeError, UnsupportError); + void set(const size_t idx, float val) throw(RangeError, UnsupportError); /// Return is GPU vector or not. bool isGpu() const; diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp index fe89a62cd3908..8a6741078f2f1 100644 --- a/paddle/api/Util.cpp +++ b/paddle/api/Util.cpp @@ -31,13 +31,13 @@ void initPaddle(int argc, char** argv) { feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); } -FloatArray::FloatArray(const real* b, const size_t l) +FloatArray::FloatArray(const float* b, const size_t l) : buf(b), length(l), needFree(false) {} IntArray::IntArray(const int* b, const size_t l, bool f) : buf(b), length(l), needFree(f) {} -IntWithFloatArray::IntWithFloatArray(const real* v, const int* i, size_t l, +IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l, bool f) : valBuf(v), idxBuf(i), length(l), needFree(f) {} diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp index b61eb7934b781..1affc1a5fefb8 100644 --- a/paddle/api/Vector.cpp +++ b/paddle/api/Vector.cpp @@ -140,7 +140,7 @@ struct VectorPrivate { paddle::VectorPtr vec; void safeAccessData(const size_t idx, - const std::function& func) const + const std::function& func) const throw(RangeError, UnsupportError) { auto cpuVec = std::dynamic_pointer_cast(vec); if (cpuVec != nullptr) { @@ -170,7 +170,7 @@ Vector* Vector::createZero(size_t sz, bool useGpu) { return retVec; } -Vector* Vector::create(const std::vector& data, bool useGpu) { +Vector* Vector::create(const std::vector& data, bool useGpu) { auto retVec = new Vector(); retVec->m->vec = paddle::Vector::create(data.size(), useGpu); retVec->m->vec->copyFrom(data.data(), data.size()); @@ -188,7 +188,7 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) { } } -Vector* Vector::createCpuVectorFromNumpy(real* data, int dim, bool copy) { +Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) { CHECK_GT(dim, 0); auto retVec = new Vector(); if (copy) { @@ -200,7 +200,7 @@ Vector* Vector::createCpuVectorFromNumpy(real* data, int dim, bool copy) { return retVec; } -Vector* Vector::createGpuVectorFromNumpy(real* data, int dim) { +Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) { CHECK_GT(dim, 0); auto retVec = new Vector(); 
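  // Note: the next two lines allocate the GPU-side buffer and then copy
  // the numpy data host-to-device; unlike the CPU variant there is no
  // zero-copy (inplace) option for GPU vectors.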
retVec->m->vec = paddle::Vector::create((size_t)dim, true); @@ -208,7 +208,7 @@ Vector* Vector::createGpuVectorFromNumpy(real* data, int dim) { return retVec; } -void Vector::toNumpyArrayInplace(real** view_data, +void Vector::toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError) { auto v = std::dynamic_pointer_cast(m->vec); if (v != nullptr) { @@ -219,20 +219,20 @@ void Vector::toNumpyArrayInplace(real** view_data, } } -void Vector::copyToNumpyArray(real** view_m_data, int* dim1) { +void Vector::copyToNumpyArray(float** view_m_data, int* dim1) { *dim1 = m->vec->getSize(); - *view_m_data = new real[*dim1]; + *view_m_data = new float[*dim1]; if (auto cpuVec = dynamic_cast(m->vec.get())) { - std::memcpy(*view_m_data, cpuVec->getData(), sizeof(real) * (*dim1)); + std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1)); } else if (auto gpuVec = dynamic_cast(m->vec.get())) { hl_memcpy_device2host(*view_m_data, gpuVec->getData(), - sizeof(real) * (*dim1)); + sizeof(float) * (*dim1)); } else { LOG(INFO) << "Unexpected situation"; } } -void Vector::copyFromNumpyArray(real* data, int dim) { +void Vector::copyFromNumpyArray(float* data, int dim) { m->vec->resize(dim); m->vec->copyFrom(data, dim); } @@ -241,15 +241,15 @@ bool Vector::isGpu() const { return std::dynamic_pointer_cast(m->vec) != nullptr; } -real Vector::get(const size_t idx) const throw(RangeError, UnsupportError) { - real r; - m->safeAccessData(idx, [&](real& o) { r = o; }); +float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) { + float r; + m->safeAccessData(idx, [&](float& o) { r = o; }); return r; } -void Vector::set(const size_t idx, real val) throw(RangeError, +void Vector::set(const size_t idx, float val) throw(RangeError, UnsupportError) { - m->safeAccessData(idx, [&](real& o) { o = val; }); + m->safeAccessData(idx, [&](float& o) { o = val; }); } size_t Vector::getSize() const { return m->vec->getSize(); } From b8d26ff4d7ceb2a7c78741e98992024fe9254ac2 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 21 Sep 2016 17:10:15 +0800 Subject: [PATCH 33/41] fix float except bugs --- paddle/gserver/dataproviders/PyDataProvider.cpp | 3 +-- paddle/math/PoolAllocator.h | 4 ---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp index cc3e09a3c2ecb..1332c0ab635b6 100644 --- a/paddle/gserver/dataproviders/PyDataProvider.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider.cpp @@ -46,7 +46,6 @@ PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu, } void PyDataProvider::loadData(const std::vector& fileList) { - int feFlag = fegetexcept(); VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_; classInstance_ = createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_); @@ -57,7 +56,7 @@ void PyDataProvider::loadData(const std::vector& fileList) { std::string headerInfo = std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); parseHeaderData(headerInfo); - feenableexcept(feFlag); + feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW); } void PyDataProvider::parseHeaderData(const std::string& headerData) { diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h index a50deece3a853..aca8ffb0ab42e 100644 --- a/paddle/math/PoolAllocator.h +++ b/paddle/math/PoolAllocator.h @@ -53,11 +53,7 @@ class PoolAllocator { void printAll(); std::unique_ptr allocator_; std::mutex mutex_; -#if defined(__APPLE__) || 
defined(__OSX__) - std::map> pool_; -#else std::unordered_map> pool_; -#endif size_t sizeLimit_; size_t poolMemorySize_; std::string name_; From 32b55573292b40452f99616db40f2f8c8d7809da Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 23 Sep 2016 21:43:53 +0800 Subject: [PATCH 34/41] Add thread Barrier unit test --- paddle/utils/tests/CMakeLists.txt | 1 + paddle/utils/tests/test_ThreadBarrier.cpp | 68 +++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 paddle/utils/tests/test_ThreadBarrier.cpp diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt index 5b31cd393dd1f..51f1889392845 100644 --- a/paddle/utils/tests/CMakeLists.txt +++ b/paddle/utils/tests/CMakeLists.txt @@ -3,6 +3,7 @@ add_simple_unittest(test_Logging) add_simple_unittest(test_Thread) add_simple_unittest(test_StringUtils) add_simple_unittest(test_CustomStackTrace) +add_simple_unittest(test_ThreadBarrier) add_executable( test_CustomStackTracePrint diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp new file mode 100644 index 0000000000000..241cdda7bd1c9 --- /dev/null +++ b/paddle/utils/tests/test_ThreadBarrier.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
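The float-exception fix in PATCH 33 replaces the mask saved with `fegetexcept()` by an unconditional `feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW)` once the Python class has been created. This is the more robust pattern: `fegetexcept()` is a glibc extension in any case, and re-arming the same trap set that `initPaddle()` installs keeps behaviour deterministic regardless of what the interpreter or numpy did to the floating-point environment in between (that rationale is inferred from the diff; the commit message only says "fix float except bugs"). A minimal self-contained sketch of the pattern, using a hypothetical `ScopedFpTrapMask` guard that is not a class in the tree:

```cpp
#include <fenv.h>
#include <cstdio>

#if defined(__linux__)
// RAII guard: mask floating-point traps while third-party code runs, then
// re-arm a fixed, known-good set on exit. Restoring a mask saved earlier
// with fegetexcept() is fragile, because the code in between (here, the
// Python interpreter and numpy) may already have changed it.
class ScopedFpTrapMask {
public:
  ScopedFpTrapMask() { fedisableexcept(FE_ALL_EXCEPT); }
  ~ScopedFpTrapMask() {
    feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
  }
};
#else
// feenableexcept()/fedisableexcept() are glibc extensions; on OS X this
// guard degenerates to a no-op, matching how the port compiles the trap
// handling out on that platform.
class ScopedFpTrapMask {};
#endif

int main() {
  {
    ScopedFpTrapMask mask;
    // ...load the Python data provider, import numpy, parse headers...
  }
  std::puts("FP traps re-armed");
  return 0;
}
```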
*/ + +#include +#include +#include +#include "paddle/utils/Logging.h" +#include "paddle/utils/CommandLineParser.h" +#include "paddle/utils/Util.h" +#include "paddle/utils/Locks.h" + +P_DEFINE_int32(test_thread_num, 100, "testing thread number"); + +void testNormalImpl(size_t thread_num, + const std::function&, + paddle::ThreadBarrier&)>& callback) { + std::mutex mutex; + std::set tids; + paddle::ThreadBarrier barrier(thread_num); + + std::vector threads; + threads.reserve(thread_num); + for (int32_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&thread_num, &mutex, + &tids, &barrier, &callback]{ + callback(thread_num, mutex, tids, barrier); + }); + } + + for (auto& thread : threads) { + thread.join(); + } +} + +TEST(ThreadBarrier, normalTest) { + for (auto &thread_num : {10, 30, 50 , 100 , 300, 1000}) { + testNormalImpl(thread_num, + [](size_t thread_num, std::mutex& mutex, + std::set& tids, + paddle::ThreadBarrier& barrier){ + { + std::lock_guard guard(mutex); + tids.insert(std::this_thread::get_id()); + } + barrier.wait(); + // Check whether all threads reach this point or not + CHECK_EQ(tids.size(), thread_num); + }); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + paddle::initMain(argc, argv); + return RUN_ALL_TESTS(); +} From 1d4bc47805252b7b8859cc6d65bbad508aa95028 Mon Sep 17 00:00:00 2001 From: liaogang Date: Mon, 26 Sep 2016 17:27:43 +0800 Subject: [PATCH 35/41] support gettid() on MAC OS X --- paddle/cuda/src/hl_cuda_device.cc | 13 ++++++++++++- paddle/utils/Thread.h | 13 ++++++++++++- paddle/utils/ThreadLocal.h | 10 +++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index f07538d6ba713..acd8e2fe6afb4 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -209,7 +209,18 @@ __thread cudaStream_t default_stream = 0; __thread bool g_sync_flag = true; bool hl_start_flag = false; -#define gettid() syscall(SYS_gettid) +inline pid_t gettid() { +#if defined(__APPLE__) || defined(__OSX__) + pid_t tid = syscall(SYS_thread_selfid); +#else + #ifndef __NR_gettid + #define __NR_gettid 224 + #endif + pid_t tid = syscall(__NR_gettid); +#endif + CHECK_NE(tid, -1); + return tid; +} void hl_init(int device) { CHECK(hl_start_flag) diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h index 3e1d95ab1fcde..f1352e75d73a0 100644 --- a/paddle/utils/Thread.h +++ b/paddle/utils/Thread.h @@ -18,7 +18,18 @@ limitations under the License. 
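The new `test_ThreadBarrier` exercises `paddle::ThreadBarrier` from `paddle/utils/Locks.h`, which matters for this port because OS X does not implement the optional POSIX `pthread_barrier_t`. The usual way to emulate a reusable barrier is a mutex plus a condition variable with a generation counter. The sketch below shows that construction in plain C++11; it illustrates the technique and is not the implementation in Locks.h:

```cpp
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// Generation-counted barrier: the generation guards against spurious
// wakeups and lets the same barrier object be reused across rounds.
class CondVarBarrier {
public:
  explicit CondVarBarrier(std::size_t count)
      : threshold_(count), count_(count), generation_(0) {}

  void wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    std::size_t gen = generation_;
    if (--count_ == 0) {      // last thread to arrive
      ++generation_;          // open the barrier for everyone waiting
      count_ = threshold_;    // reset for the next round
      cv_.notify_all();
    } else {
      cv_.wait(lock, [this, gen] { return gen != generation_; });
    }
  }

private:
  std::mutex mutex_;
  std::condition_variable cv_;
  const std::size_t threshold_;
  std::size_t count_;
  std::size_t generation_;
};

int main() {
  const int kThreads = 4;
  CondVarBarrier barrier(kThreads);
  std::vector<std::thread> threads;
  for (int i = 0; i < kThreads; ++i) {
    threads.emplace_back([&barrier, i] {
      barrier.wait();  // nobody passes until all four have arrived
      std::printf("thread %d passed the barrier\n", i);
    });
  }
  for (auto& t : threads) t.join();
  return 0;
}
```

The generation counter is what makes reuse safe: a spuriously woken thread re-checks the generation rather than the arrival count, which the last arrival has already reset for the next round.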
*/ #include #include -inline pid_t gettid() { return syscall(SYS_gettid); } +inline pid_t gettid() { +#if defined(__APPLE__) || defined(__OSX__) + pid_t tid = syscall(SYS_thread_selfid); +#else + #ifndef __NR_gettid + #define __NR_gettid 224 + #endif + pid_t tid = syscall(__NR_gettid); +#endif + CHECK_NE(tid, -1); + return tid; +} #include "Queue.h" #include "ThreadLocal.h" diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index e782868f69a5d..686a1a99a4aa0 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -156,7 +156,15 @@ class ThreadLocalD { static void dataDestructor(void* p) { delete (T*)p; } void updateMap(T* p) { - pid_t tid = syscall(SYS_gettid); +#if defined(__APPLE__) || defined(__OSX__) + pid_t tid = syscall(SYS_thread_selfid); +#else + #ifndef __NR_gettid + #define __NR_gettid 224 + #endif + pid_t tid = syscall(__NR_gettid); +#endif + CHECK_NE(tid, -1); std::lock_guard guard(mutex_); auto ret = threadMap_.insert(std::make_pair(tid, p)); if (!ret.second) { From a8df411192f9831a2c99916dbd88db409dbcf01b Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Sep 2016 15:56:29 +0800 Subject: [PATCH 36/41] Replace random_shuffle using shuffle. * reduce trainer count for unit test on MAC OSX --- paddle/gserver/dataproviders/DataProviderGroup.h | 3 ++- paddle/gserver/dataproviders/ProtoDataProvider.cpp | 3 ++- paddle/math/Matrix.cpp | 3 ++- paddle/trainer/tests/test_CompareSparse.cpp | 6 +++--- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h index decbde6c91758..0689f90f3e7dd 100644 --- a/paddle/gserver/dataproviders/DataProviderGroup.h +++ b/paddle/gserver/dataproviders/DataProviderGroup.h @@ -65,7 +65,8 @@ void DataProviderGroup::reset() { provider_ = nullptr; // shuffle file list - std::random_shuffle(fileList_.begin(), fileList_.end()); + std::shuffle(fileList_.begin(), fileList_.end(), + ThreadLocalRandomEngine::get()); startLoader(); DataProvider::reset(); diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp index b0c14c85b2d81..344644755f240 100644 --- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp +++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp @@ -374,7 +374,8 @@ void ProtoDataProvider::reset() { } void ProtoDataProvider::shuffle() { - std::random_shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end()); + std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(), + ThreadLocalRandomEngine::get()); } /* diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 1b7f9ac5dac16..e351bede724ac 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -2514,7 +2514,8 @@ void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, for (int k = 0; k < blockNum_; ++k) { blockSeq.push_back(k); } - std::random_shuffle(blockSeq.begin(), blockSeq.end()); + std::shuffle(blockSeq.begin(), blockSeq.end(), + ThreadLocalRandomEngine::get()); } std::vector& localBufRows = *localBufRows_; int* cols = a->getCols(); diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp index 3070682c0a2ef..ff37d7b364840 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/trainer/tests/test_CompareSparse.cpp @@ -146,12 +146,12 @@ TEST(compareSparse, remote_cpu) { TEST(compareSparse, cpu10_local_vs_remote) { FLAGS_local = 1; // disable remote sparse update in 
parameter config std::vector localParameters = - trainerOnePassTest(configFile1, true, 10); + trainerOnePassTest(configFile1, true, 2); FLAGS_local = 0; // will enable remote sparse update FLAGS_ports_num_for_sparse = 5; std::vector remoteParameters = - trainerOnePassTest(configFile1, true, 10); + trainerOnePassTest(configFile1, true, 2); compareValue(localParameters, remoteParameters); } @@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) { FLAGS_parallel_nn = useGpu; LOG(INFO) << " local=" << local << " useGpu=" << useGpu; - int trainerCount = useGpu ? numGpu : 10; + int trainerCount = useGpu ? numGpu : 2; std::vector parameters = trainerOnePassTest(configFile1, true, trainerCount, useGpu); compareValue(getDenseParameters(), parameters, eps); From 0072ef50bfa5574ad4d084968b8fbbb0380549c6 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Sep 2016 18:00:39 +0800 Subject: [PATCH 37/41] Fix compile check type failed in linux --- paddle/gserver/tests/test_PyDataProvider2.cpp | 2 +- paddle/utils/tests/test_ThreadBarrier.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index c5fe31b29187f..e75e53ab7f431 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -321,7 +321,7 @@ TEST(PyDataProvider2, input_order) { if (!realBatchSize) { break; } - ASSERT_EQ(batch.getStreams().size(), 2); + ASSERT_EQ(batch.getStreams().size(), (size_t)2); for (size_t i = 0; i < realBatchSize; ++i) { ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0); ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1); diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp index 241cdda7bd1c9..90bd6c21bc8e5 100644 --- a/paddle/utils/tests/test_ThreadBarrier.cpp +++ b/paddle/utils/tests/test_ThreadBarrier.cpp @@ -32,7 +32,7 @@ void testNormalImpl(size_t thread_num, std::vector threads; threads.reserve(thread_num); - for (int32_t i = 0; i < thread_num; ++i) { + for (size_t i = 0; i < thread_num; ++i) { threads.emplace_back([&thread_num, &mutex, &tids, &barrier, &callback]{ callback(thread_num, mutex, tids, barrier); From eaedef89d0e65a378f1253509c014d4f7d6a197e Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 28 Sep 2016 18:01:25 +0800 Subject: [PATCH 38/41] Update build doc --- doc/build/build_from_source.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md index c71ff260f8d0a..6f9e03f2c28f1 100644 --- a/doc/build/build_from_source.md +++ b/doc/build/build_from_source.md @@ -48,7 +48,7 @@ sudo apt-get install libgflags-dev sudo apt-get install libgtest-dev sudo pip install wheel pushd /usr/src/gtest -cmake .. +cmake . make sudo cp *.a /usr/lib popd @@ -106,19 +106,19 @@ Here are some examples of cmake command with different options: **only cpu** ```bash -cmake -DWITH_GPU=OFF -DWITH_DOC=OFF .. +cmake -DWITH_GPU=OFF -DWITH_DOC=OFF ``` **gpu** ```bash -cmake -DWITH_GPU=ON -DWITH_DOC=OFF .. +cmake -DWITH_GPU=ON -DWITH_DOC=OFF ``` **gpu with doc and swig** ```bash -cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON .. +cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON ``` Finally, you can download source code and build: @@ -232,19 +232,19 @@ Here are some examples of CMake command with different options: **only cpu** ```bash -cmake -DWITH_GPU=OFF -DWITH_DOC=OFF .. 
+cmake -DWITH_GPU=OFF -DWITH_DOC=OFF ``` **gpu** ```bash -cmake -DWITH_GPU=ON -DWITH_DOC=OFF .. +cmake -DWITH_GPU=ON -DWITH_DOC=OFF ``` **gpu with doc and swig** ```bash -cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON .. +cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON ``` Finally, you can download source code and build: From cbb904356cec0b90effe46892ae3af2b073af9e4 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 29 Sep 2016 15:30:21 +0800 Subject: [PATCH 39/41] Remove main function in some unittest. --- paddle/math/tests/test_CpuGpuVector.cpp | 6 ------ paddle/math/tests/test_matrixCompare.cpp | 5 ----- paddle/math/tests/test_perturbation.cpp | 11 ----------- paddle/math/tests/test_sparseMatrixCompare.cpp | 6 ------ paddle/utils/tests/test_StringUtils.cpp | 5 ----- 5 files changed, 33 deletions(-) diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp index 61b424e3c6647..7b50b020cda93 100644 --- a/paddle/math/tests/test_CpuGpuVector.cpp +++ b/paddle/math/tests/test_CpuGpuVector.cpp @@ -84,10 +84,4 @@ int main(int argc, char** argv) { return ret; } -#else - -int main(int argc, char const* argv[]) { - return 0; -} - #endif diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index ac50e7b7499d8..fe8eacc2efbc5 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -1851,10 +1851,5 @@ int main(int argc, char** argv) { initMain(argc, argv); return RUN_ALL_TESTS(); } -#else - -int main(int argc, char const* argv[]) { - return 0; -} #endif diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp index 050f2ca9ced80..4fa9bc72013da 100644 --- a/paddle/math/tests/test_perturbation.cpp +++ b/paddle/math/tests/test_perturbation.cpp @@ -249,15 +249,4 @@ TEST_F(PerturbationTest, scale_test) { } } -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - -#else - -int main(int argc, char const* argv[]) { - return 0; -} - #endif diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp index b3467e4982e24..6048dd8112229 100644 --- a/paddle/math/tests/test_sparseMatrixCompare.cpp +++ b/paddle/math/tests/test_sparseMatrixCompare.cpp @@ -178,10 +178,4 @@ int main(int argc, char** argv) { return ret; } -#else - -int main(int argc, char const* argv[]) { - return 0; -} - #endif diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp index 95290005ae983..b8636709e9b42 100644 --- a/paddle/utils/tests/test_StringUtils.cpp +++ b/paddle/utils/tests/test_StringUtils.cpp @@ -22,8 +22,3 @@ TEST(StringUtil, to) { ASSERT_DEATH(paddle::str::to("12.45x23"), ".*"); ASSERT_DEATH(paddle::str::to(""), ".*"); } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} From 8ddc5faac162137678761d5c38fd9e80b70e87c7 Mon Sep 17 00:00:00 2001 From: liaogang Date: Thu, 29 Sep 2016 19:38:07 +0800 Subject: [PATCH 40/41] Update Mac OS X port * follow comments to fix bugs --- cmake/cblas.cmake | 4 +- cmake/util.cmake | 4 +- doc/build/build_from_source.md | 385 ++++++++++-------- paddle/api/Matrix.cpp | 8 +- .../gradientmachines/NeuralNetwork.cpp | 9 +- paddle/trainer/tests/test_Trainer.cpp | 4 + paddle/utils/PythonUtil.cpp | 4 +- paddle/utils/PythonUtil.h | 7 - paddle/utils/Stat.cpp | 18 +- paddle/utils/Thread.h | 20 +- paddle/utils/ThreadLocal.cpp | 10 +- 
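On the `std::shuffle` changes in PATCH 36 above: `std::random_shuffle` was deprecated in C++14 and removed in C++17, and it draws from an unspecified global source whose behaviour differs between libstdc++ and libc++, a real concern once the code also builds on OS X. Passing the engine explicitly, as the patch does with `ThreadLocalRandomEngine::get()`, makes seeding and thread ownership visible at the call site. A self-contained sketch, with a fixed-seed `std::mt19937` standing in for the thread-local engine:

```cpp
#include <algorithm>
#include <iostream>
#include <random>
#include <vector>

int main() {
  std::vector<int> fileIds{0, 1, 2, 3, 4, 5, 6, 7};

  // std::random_shuffle draws from an unspecified global source; with
  // std::shuffle the engine, and therefore the seeding and the thread
  // that owns it, is explicit at the call site.
  std::mt19937 engine(12345);  // stand-in for a per-thread engine
  std::shuffle(fileIds.begin(), fileIds.end(), engine);

  for (int id : fileIds) std::cout << id << ' ';
  std::cout << '\n';
  return 0;
}
```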
paddle/utils/Util.cpp | 13 + paddle/utils/Util.h | 5 + 13 files changed, 259 insertions(+), 232 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 5568f927572f5..529b4b9d15d09 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -44,8 +44,8 @@ set(ATLAS_LIB_SEARCH_PATHS /usr/lib /usr/lib/blas/atlas /usr/lib/atlas - /usr/lib/atlas-base) # special for ubuntu 14.04. - + /usr/lib/atlas-base # special for ubuntu 14.04. + ) find_path(ATLAS_INC_DIR NAMES cblas.h PATHS ${ATLAS_INCLUDE_SEARCH_PATHS}) find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 diff --git a/cmake/util.cmake b/cmake/util.cmake index 4e9efd3c187b0..d776c3ae49952 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -24,7 +24,9 @@ function(target_circle_link_libraries TARGET_NAME) list(APPEND libsInArgn ${arg}) endif() endforeach() - + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + list(APPEND LIBS "-undefined dynamic_lookup") + endif() list(REVERSE libsInArgn) target_link_libraries(${TARGET_NAME} ${LIBS} diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md index 6f9e03f2c28f1..f9899086bf060 100644 --- a/doc/build/build_from_source.md +++ b/doc/build/build_from_source.md @@ -1,148 +1,189 @@ -Build and Install +Installing from Sources ================= -* [1. Requirement](#Requirement) -* [2. Build on Ubuntu](#ubuntu) -* [3. Build on Mac OS X](#mac) +* [1. Download and Setup](#download) +* [2. Requirements](#requirements) +* [3. Build on Ubuntu](#ubuntu) +* [4. Build on Mac OS X](#mac) -## Requirement +## Download and Setup +You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle). -### Dependents +```bash +git clone https://github.com/baidu/Paddle paddle +``` -- **CMake**: required for 2.8+ version -- **g++**: a recent c++ compiler supporting c++11, >= 4.6, < 5 -- **BLAS library**: such as openBLAS, MKL, ATLAS -- **protobuf**: required for 2.4+ version, 3.x is not supported -- **python**: currently only 2.7 version is supported +## Requirements -### Optional +To compile the source code, your computer must be equipped with GCC >=4.6 or Clang Compiler. +### Dependencies -PaddlePaddle also support some build options, you have to install related libraries. +- **CMake**: version >= 2.8 +- **BLAS**: MKL, OpenBlas or ATLAS +- **protobuf**: version >= 2.4, **Note: 3.x is not supported** +- **python**: only python 2.7 is supported currently -- **WITH_GPU**: Compile with gpu mode - - The GPU version works best with Cuda Toolkit 7.5 and cuDNN v5 - - Other versions Cuda Toolkit 6.5, 7.0 and cuDNN v2, v3, v4 are also supported - - Note: to utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa -- **WITH_DOUBLE**: Compile with double precision, otherwise use single precision -- **WITH_GLOG**: Compile with glog, otherwise use a log implement internally -- **WITH_GFLAGS**: Compile with gflags, otherwise use a flag implement internally -- **WITH_TESTING**: Compile with gtest and run unittest for PaddlePaddle -- **WITH_DOC**: Compile with documentation -- **WITH_SWIG_PY**: Compile with python predict api -- **WITH_STYLE_CHECK**: Style check for source code +### Options +PaddlePaddle supports some build options. To enable it, first you need to install the related libraries. -## Building on Ubuntu14.04 + Optional | Description + ------------ | :----------- + **WITH_GPU** | Compile with GPU mode. + **WITH_DOUBLE** | Compile with double precision floating-point, default: single precision. | + **WITH_GLOG** | Compile with glog. 
If not found, default: an internal log implementation. + **WITH_GFLAGS** | Compile with gflags. If not found, default: an internal flag implementation. + **WITH_TESTING** | Compile with gtest for PaddlePaddle's unit testing. + **WITH_DOC** | Compile to generate PaddlePaddle's docs, default: disabled (OFF). + **WITH_SWIG_PY** | Compile with python predict API, default: disabled (OFF). + **WITH_STYLE_CHECK**| Compile with code style check, default: enabled (ON). +| -### Install Dependencies +**Note:** + - The GPU version works best with Cuda Toolkit 7.5 and cuDNN v5. + - Other versions like Cuda Toolkit 6.5, 7.0, 8.0 and cuDNN v2, v3, v4 are also supported. + - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.** -- **CPU Dependencies** +As a simple example, consider the following: -```bash -# necessary -sudo apt-get update -sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git -# optional -sudo apt-get install libgoogle-glog-dev -sudo apt-get install libgflags-dev -sudo apt-get install libgtest-dev -sudo pip install wheel -pushd /usr/src/gtest -cmake . -make -sudo cp *.a /usr/lib -popd -``` - +1. **Python Dependencies(optional)** -- **GPU Dependencies(optional)** + To compile PaddlePaddle with python predict API, make sure swig installed and set `-DWITH_SWIG_PY=ON` as follows: -If you need to build GPU version, the first thing you need is a machine that has GPU and CUDA installed. -And you also need to install cuDNN. + ```bash + # install swig on ubuntu + sudo apt-get install swig + # install swig on Mac OS X + brew install swig -You can download CUDA toolkit and cuDNN from nvidia website: - -```bash -https://developer.nvidia.com/cuda-downloads -https://developer.nvidia.com/cudnn -``` -You can copy cuDNN files into the CUDA toolkit directory, such as: + # active swig in cmake + cmake .. -DWITH_SWIG_PY=ON + ``` -```bash -sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local -sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* -``` -Then you need to set LD\_LIBRARY\_PATH, CUDA\_HOME and PATH environment variables in ~/.bashrc. +2. **Doc Dependencies(optional)** -```bash -export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH -export CUDA_HOME=/usr/local/cuda -export PATH=/usr/local/cuda/bin:$PATH -``` -- **Python Dependencies(optional)** + To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows: -If you want to compile PaddlePaddle with python predict api, you need to add -DWITH_SWIG_PY=ON in cmake command and install these first: + ```bash + pip install 'sphinx>=1.4.0' + pip install sphinx_rtd_theme breathe recommonmark -```bash -sudo apt-get install swig -``` + # install doxygen on Ubuntu + sudo apt-get install doxygen + # install doxygen on Mac OS X + brew install doxygen -- **Doc Dependencies(optional)** + # active docs in cmake + cmake .. -DWITH_DOC=ON` + ``` -If you want to compile PaddlePaddle with doc, you need to add -DWITH_DOC=ON in cmake command and install these first: +## Build on Ubuntu 14.04 -```bash -pip install 'sphinx>=1.4.0' -pip install sphinx_rtd_theme breathe recommonmark -sudo apt-get install doxygen -``` +### Install Dependencies -### Build and Install +- **CPU Dependencies** -CMake will find dependent libraries in system default paths first. 
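The `WITH_DOUBLE` row above ties into the `real` to `float` substitutions made in the SWIG layer earlier in this series: the Python-facing API always exchanges `float` (numpy `float32`) buffers, so a double-precision build must be rejected at compile time rather than silently reinterpreting memory, which is what the `static_assert(sizeof(paddle::real) == sizeof(float), ...)` added to `paddle/api/Matrix.cpp` later in this patch enforces. A condensed sketch of the relationship; `PADDLE_TYPE_DOUBLE` is assumed here as the macro that `-DWITH_DOUBLE=ON` defines:

```cpp
#include <cstddef>

namespace paddle {
#ifdef PADDLE_TYPE_DOUBLE  // assumed name for what -DWITH_DOUBLE=ON defines
typedef double real;
#else
typedef float real;
#endif
}  // namespace paddle

// The Python-facing API always traffics in float (numpy float32) buffers,
// so a double-precision build must fail loudly at compile time instead of
// silently reinterpreting memory across the SWIG boundary.
static_assert(sizeof(paddle::real) == sizeof(float),
              "PaddleAPI currently supports only the single-precision build");

int main() { return 0; }
```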
After installing some optional libraries, corresponding build option will automatically be on(such as glog, gtest and gflags). And if libraries are not found, you have to set following variables manually in cmake command(CUDNN_ROOT, ATLAS_ROOT, MKL_ROOT, OPENBLAS_ROOT). + ```bash + # necessary + sudo apt-get update + sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git + # optional + sudo apt-get install libgoogle-glog-dev + sudo apt-get install libgflags-dev + sudo apt-get install libgtest-dev + sudo pip install wheel + pushd /usr/src/gtest + cmake . + make + sudo cp *.a /usr/lib + popd + ``` + +- **GPU Dependencies (optional)** -Here are some examples of cmake command with different options: + To build GPU version, you will need the following installed: -**only cpu** + 1. a CUDA-capable GPU + 2. A supported version of Linux with a gcc compiler and toolchain + 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) + 4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) -```bash -cmake -DWITH_GPU=OFF -DWITH_DOC=OFF -``` + The CUDA development environment relies on tight integration with the host development environment, + including the host compiler and C runtime libraries, and is therefore only supported on + distribution versions that have been qualified for this CUDA Toolkit release. + + After downloading cuDNN library, issue the following commands: -**gpu** + ```bash + sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local + sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* + ``` + Then you need to set LD\_LIBRARY\_PATH, CUDA\_HOME and PATH environment variables in ~/.bashrc. + + ```bash + export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/usr/local/cuda + export PATH=/usr/local/cuda/bin:$PATH + ``` + +### Build and Install + +As usual, the best option is to create build folder under paddle project directory. ```bash -cmake -DWITH_GPU=ON -DWITH_DOC=OFF +mkdir build && cd build +cmake .. ``` -**gpu with doc and swig** +CMake first check PaddlePaddle's dependecies in system default path. After installing some optional +libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags). +If still not found, you can manually set it based on CMake error information from your screen. -```bash -cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON -``` +As a simple example, consider the following: + +- **Only CPU** + + ```bash + cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF + ``` +- **GPU** + + ```bash + cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF + ``` + +- **GPU with doc and swig** + + ```bash + cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON + ``` Finally, you can download source code and build: ```bash -git clone https://github.com/baidu/Paddle paddle -cd paddle -mkdir build -cd build # you can add build option here, such as: -cmake -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX= .. +cmake .. 
-DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX= # please use sudo make install, if you want # to install PaddlePaddle into the system make -j `nproc` && make install -# PaddlePaddle installation path +# set PaddlePaddle installation path in ~/.bashrc export PATH=/bin:$PATH ``` -**Note** -And if you set WITH_SWIG_PY=ON, you have to install related python predict api at the same time: +**Note:** + +If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed. +Otherwise, PaddlePaddle will automatically install python dependencies +at first time when user run paddle commands, such as `paddle version`, `paddle train`. +It may require sudo privileges: ```bash -pip install /opt/paddle/share/wheels/*.whl +# you can run +sudo pip install /opt/paddle/share/wheels/*.whl +# or just run +sudo paddle version ``` + ## Building on Mac OS X ### Prerequisites @@ -150,7 +191,7 @@ This guide is based on Mac OS X 10.11 (El Capitan). Note that if you are running you will already have Python 2.7.10 and Numpy 1.8 installed. The best option is to use the package manager homebrew to handle installations and upgrades for you. -To install homebrew, first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command: +To install [homebrew](http://brew.sh/), first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command: ```bash # install brew @@ -163,109 +204,103 @@ easy_install pip - **CPU Dependencies** -```bash -# Install fundamental dependents -brew install glog gflags cmake protobuf openblas - -# Install google test on Mac OS X -# Download gtest 1.7.0 -wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz -tar -xvf googletest-release-1.7.0.tar.gz && cd googletest-release-1.7.0 -# Build gtest -mkdir build && cmake .. -make -# Install gtest library -sudo cp -r ../include/gtest /usr/local/include/ -sudo cp lib*.a /usr/local/lib -``` - - + ```bash + # Install fundamental dependents + brew install glog gflags cmake protobuf openblas + + # Install google test on Mac OS X + # Download gtest 1.7.0 + wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz + tar -xvf googletest-release-1.7.0.tar.gz && cd googletest-release-1.7.0 + # Build gtest + mkdir build && cmake .. + make + # Install gtest library + sudo cp -r ../include/gtest /usr/local/include/ + sudo cp lib*.a /usr/local/lib + ``` + - **GPU Dependencies(optional)** -If you need to build GPU version, the first thing you need is a machine that has NVIDIA GPU and CUDA installed. -And you also need to install cuDNN. + To build GPU version, you will need the following installed: -You can download CUDA toolkit and cuDNN from nvidia website: - -```bash -https://developer.nvidia.com/cuda-downloads -https://developer.nvidia.com/cudnn -``` -You can copy cuDNN files into the CUDA toolkit directory, for instance: - -```bash -sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local -sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* -``` -Then you need to set DYLD\_LIBRARY\_PATH, CUDA\_HOME and PATH environment variables in ~/.bashrc. + 1. a CUDA-capable GPU + 2. Mac OS X 10.11 or later + 2. the Clang compiler and toolchain installed using Xcode + 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) + 4. 
NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) -```bash -export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH -export PATH=/usr/local/cuda/bin:$PATH -``` -- **Python Dependencies(optional)** + The CUDA development environment relies on tight integration with the host development environment, + including the host compiler and C runtime libraries, and is therefore only supported on + distribution versions that have been qualified for this CUDA Toolkit release. + + 1. After downloading cuDNN library, issue the following commands: -If you want to compile PaddlePaddle with python predict API, you need to add -DWITH_SWIG_PY=ON in cmake command and install these first: + ```bash + sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local + sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* + ``` + 2. Then you need to set DYLD\_LIBRARY\_PATH, CUDA\_HOME and PATH environment variables in ~/.bashrc. -```bash -brew install swig -``` + ```bash + export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH + export PATH=/usr/local/cuda/bin:$PATH + ``` -- **Doc Dependencies(optional)** +### Build and Install -If you want to compile PaddlePaddle with doc, you need to add -DWITH_DOC=ON in cmake command and install these first: +As usual, the best option is to create build folder under paddle project directory. ```bash -pip install 'sphinx>=1.4.0' -pip install sphinx_rtd_theme breathe recommonmark -brew install doxygen +mkdir build && cd build +cmake .. ``` -### Build and Install - -CMake can find dependent libraries in system default paths firstly. -After installing some optional libraries, corresponding build option will be on automatically (for instance, glog, gtest and gflags). -If not found, you have to set following variables manually via CMake command (CUDNN_ROOT, ATLAS_ROOT, MKL_ROOT, OPENBLAS_ROOT). - -Here are some examples of CMake command with different options: +CMake first check PaddlePaddle's dependecies in system default path. After installing some optional +libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags). +If still not found, you can manually set it based on CMake error information from your screen. -**only cpu** +As a simple example, consider the following: -```bash -cmake -DWITH_GPU=OFF -DWITH_DOC=OFF -``` +- **Only CPU** -**gpu** + ```bash + cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF + ``` +- **GPU** -```bash -cmake -DWITH_GPU=ON -DWITH_DOC=OFF -``` + ```bash + cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF + ``` -**gpu with doc and swig** +- **GPU with doc and swig** -```bash -cmake -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON -``` + ```bash + cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON + ``` -Finally, you can download source code and build: +Finally, you can build PaddlePaddle: ```bash -git clone https://github.com/baidu/Paddle paddle -cd paddle -mkdir build -cd build # you can add build option here, such as: -cmake -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX= .. -# please use sudo make install, if you want -# to install PaddlePaddle into the system +cmake .. 
-DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX= +# please use sudo make install, if you want to install PaddlePaddle into the system make -j `nproc` && make install -# PaddlePaddle installation path -export PATH=/bin:$PATH +# set PaddlePaddle installation path in ~/.bashrc +export PATH=/bin:$PATH ``` -**Note** -And if you set WITH_SWIG_PY=ON, you have to install related python predict api at the same time: + +**Note:** + +If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed. +Otherwise, PaddlePaddle will automatically install python dependencies +at first time when user run paddle commands, such as `paddle version`, `paddle train`. +It may require sudo privileges: ```bash +# you can run sudo pip install /opt/paddle/share/wheels/*.whl +# or just run +sudo paddle version ``` \ No newline at end of file diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp index 9ae3716fa862c..6a79f83495a56 100644 --- a/paddle/api/Matrix.cpp +++ b/paddle/api/Matrix.cpp @@ -95,7 +95,7 @@ float Matrix::get(size_t x, size_t y) const throw(RangeError) { } void Matrix::set(size_t x, size_t y, float val) throw(RangeError, - UnsupportError) { + UnsupportError) { if (x > this->getWidth() || y > this->getHeight()) { RangeError e; throw e; @@ -239,7 +239,7 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1, } void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, int* dim2) throw(UnsupportError) { - static_assert(sizeof(float) == sizeof(float), + static_assert(sizeof(paddle::real) == sizeof(float), "Currently PaddleAPI only support for single " "precision version of paddle."); if (this->isSparse()) { @@ -251,12 +251,12 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, if (auto cpuMat = dynamic_cast(m->mat.get())) { auto src = cpuMat->getData(); auto dest = *view_m_data; - std::memcpy(dest, src, sizeof(float) * (*dim1) * (*dim2)); + std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); } else if (auto gpuMat = dynamic_cast(m->mat.get())) { auto src = gpuMat->getData(); auto dest = *view_m_data; hl_memcpy_device2host(dest, src, - sizeof(float) * (*dim1) * (*dim2)); + sizeof(paddle::real) * (*dim1) * (*dim2)); } else { LOG(WARNING) << "Unexpected Situation"; throw UnsupportError(); diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 0f497e44d4c25..3127b4dd9a2fd 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -385,10 +385,17 @@ void NeuralNetwork::setOutputGrad(const std::vector& args) { } } +extern NeuralNetwork* newCustomNerualNetwork( + const std::string& name, NeuralNetwork* network) __attribute__((weak)); + NeuralNetwork* NeuralNetwork::newNeuralNetwork( const std::string& name, NeuralNetwork* rootNetwork) { - return new NeuralNetwork(name, rootNetwork); + if (newCustomNerualNetwork) { + return newCustomNerualNetwork(name, rootNetwork); + } else { + return new NeuralNetwork(name, rootNetwork); + } } } // namespace paddle diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp index 2044279c2151f..ad2a715ef89c6 100644 --- a/paddle/trainer/tests/test_Trainer.cpp +++ b/paddle/trainer/tests/test_Trainer.cpp @@ -94,7 +94,11 @@ TEST(checkGradient, multi) { TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); } TEST(checkGradient, chunk) { +#if defined(__APPLE__) || defined (__OSX__) EXPECT_EQ(0, system("python 
trainer/tests/gen_proto_data.py")); +#else + EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py")); +#endif checkGradientTest(configFile3, false, false); #ifndef PADDLE_ONLY_CPU checkGradientTest(configFile3, true, true); diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp index 9ee7a29aad0b6..78c3a80674f9c 100644 --- a/paddle/utils/PythonUtil.cpp +++ b/paddle/utils/PythonUtil.cpp @@ -144,12 +144,12 @@ PyObjectPtr createPythonClass( const std::map& kwargs) { PyGuard guard; PyObjectPtr pyModule(PyImport_ImportModule(moduleName.c_str())); - // LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str(); + LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str(); CHECK_PY(pyModule) << "Import module " << moduleName << " failed."; PyObjectPtr pyDict(PyModule_GetDict(pyModule.get())); CHECK_PY(pyDict) << "Get Dict failed."; PyObjectPtr pyClass(PyDict_GetItemString(pyDict.get(), className.c_str())); - // LOG(INFO) << "createPythonClass className.c_str():" << className.c_str(); + LOG(INFO) << "createPythonClass className.c_str():" << className.c_str(); CHECK_PY(pyClass) << "Import class " << className << " failed."; PyObjectPtr argsObjectList(PyTuple_New(args.size())); for (size_t i = 0; i < args.size(); ++i) { diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index 2808338fbdf59..db02d1252b405 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -35,13 +35,6 @@ limitations under the License. */ #include #include -// #ifndef _POSIX_C_SOURCE -// #warning "no _POSIX_C_SOURCE defined in Python.h" -// #endif -// #ifndef _XOPEN_SOURCE -// #warning "no _XOPEN_SOURCE defined in Python.h" -// #endif - #endif #include "paddle/utils/Util.h" diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp index ff6e8ade2cd48..d7b20ca5eb2f4 100644 --- a/paddle/utils/Stat.cpp +++ b/paddle/utils/Stat.cpp @@ -13,28 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "Stat.h" - -#include // for syscall() -#include +#include "Util.h" #include #include namespace paddle { -// return the thread id used by glog -pid_t getTID() { - #if defined(__APPLE__) || defined(__OSX__) - pid_t tid = syscall(SYS_thread_selfid); - #else - #ifndef __NR_gettid - #define __NR_gettid 224 - #endif - pid_t tid = syscall(__NR_gettid); - #endif - CHECK_NE(tid, -1); - return tid; -} - StatSet globalStat("GlobalStatInfo"); void Stat::addSample(uint64_t value) { diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h index f1352e75d73a0..f6c826a1eeb65 100644 --- a/paddle/utils/Thread.h +++ b/paddle/utils/Thread.h @@ -13,24 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
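The weak-symbol hook restored in `NeuralNetwork.cpp` above lets an optional internal library override network construction with no registration code: when nothing defines the symbol, its address is null and the default constructor runs. This is also plausibly why this patch appends `-undefined dynamic_lookup` for Clang in `cmake/util.cmake`, since on Mach-O an unresolved weak reference can otherwise fail at link time. A minimal illustration; the function names are invented for the example:

```cpp
#include <cstdio>

// Weak declaration: if no other object file or library defines this symbol,
// its address is null and the fallback runs; if an optional library does
// define it, the linker binds the call to that definition instead.
// (On Mach-O, linking this with no definition present may additionally
// require -Wl,-undefined,dynamic_lookup.)
extern void customInit() __attribute__((weak));

void initialize() {
  if (customInit) {  // non-null only when some definition was linked in
    customInit();
  } else {
    std::puts("using default initialization");
  }
}

int main() {
  initialize();
  return 0;
}
```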
*/ #pragma once +#include "Util.h" #include "Logging.h" #include -#include -#include -inline pid_t gettid() { -#if defined(__APPLE__) || defined(__OSX__) - pid_t tid = syscall(SYS_thread_selfid); -#else - #ifndef __NR_gettid - #define __NR_gettid 224 - #endif - pid_t tid = syscall(__NR_gettid); -#endif - CHECK_NE(tid, -1); - return tid; -} - #include "Queue.h" #include "ThreadLocal.h" @@ -186,7 +172,7 @@ class SyncThreadPool { jobFinishBarrier_(numWorkers + 1), jobFunc_(nullptr), checkOwner_(checkOwner) { - ownerThreadId_ = ::gettid(); + ownerThreadId_ = getTID(); workers_.resize(numWorkers); start(); } @@ -210,7 +196,7 @@ class SyncThreadPool { */ void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) { if (checkOwner_) { - CHECK_EQ(ownerThreadId_, ::gettid()) + CHECK_EQ(ownerThreadId_, getTID()) << "this sync thread pool should be used in one thread"; } diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp index a4b399d144ee3..0f948f1029af8 100644 --- a/paddle/utils/ThreadLocal.cpp +++ b/paddle/utils/ThreadLocal.cpp @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "Util.h" #include "ThreadLocal.h" - -#include "Thread.h" - #include "CommandLineParser.h" P_DEFINE_bool(thread_local_rand_use_global_seed, false, @@ -31,11 +29,11 @@ unsigned int* ThreadLocalRand::getSeed() { if (!p) { // init seed if (FLAGS_thread_local_rand_use_global_seed) { p = new unsigned int(defaultSeed_); - } else if (getpid() == gettid()) { // main thread + } else if (getpid() == getTID()) { // main thread // deterministic, but differs from global srand() p = new unsigned int(defaultSeed_ - 1); } else { - p = new unsigned int(defaultSeed_ + gettid()); + p = new unsigned int(defaultSeed_ + getTID()); LOG(INFO) << "thread use undeterministic rand seed:" << *p; } seed_.set(p); @@ -51,7 +49,7 @@ std::default_random_engine& ThreadLocalRandomEngine::get() { int defaultSeed = ThreadLocalRand::getDefaultSeed(); engine->seed(FLAGS_thread_local_rand_use_global_seed ? defaultSeed - : defaultSeed + gettid()); + : defaultSeed + getTID()); engine_.set(engine); } return *engine; diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index d8c3376fb18c4..c3c76f907d40e 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -93,6 +93,19 @@ static void installProfilerSwitch() {} namespace paddle { +pid_t getTID() { + #if defined(__APPLE__) || defined(__OSX__) + pid_t tid = syscall(SYS_thread_selfid); + #else + #ifndef __NR_gettid + #define __NR_gettid 224 + #endif + pid_t tid = syscall(__NR_gettid); + #endif + CHECK_NE(tid, -1); + return tid; +} + static bool g_initialized = false; typedef std::pair> PriorityFuncPair; typedef std::vector InitFuncList; diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h index 11a03e141dec5..a059ca50a538d 100644 --- a/paddle/utils/Util.h +++ b/paddle/utils/Util.h @@ -24,6 +24,8 @@ limitations under the License. */ #include #include #include +#include // for syscall() +#include #include "CommandLineParser.h" #include "Logging.h" @@ -63,6 +65,9 @@ limitations under the License. 
*/

namespace paddle {

+// return the thread id used by glog
+pid_t getTID();
+
/**
 * return the 1-based index of the highest bit set
 *

From efea5c84607cd73b9a318915624ba16b7448f8cf Mon Sep 17 00:00:00 2001
From: liaogang
Date: Thu, 29 Sep 2016 19:52:40 +0800
Subject: [PATCH 41/41] Revise some words in build doc

---
 doc/build/build_from_source.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md
index f9899086bf060..a6090d6819162 100644
--- a/doc/build/build_from_source.md
+++ b/doc/build/build_from_source.md
@@ -135,7 +135,7 @@ mkdir build && cd build
 cmake ..
 ```

-CMake first check PaddlePaddle's dependecies in system default path. After installing some optional
+CMake first checks PaddlePaddle's dependencies in system default path. After installing some optional
 libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
 If still not found, you can manually set it based on CMake error information from your screen.

@@ -256,7 +256,7 @@ mkdir build && cd build
 cmake ..
 ```

-CMake first check PaddlePaddle's dependecies in system default path. After installing some optional
+CMake first checks PaddlePaddle's dependencies in system default path. After installing some optional
 libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
 If still not found, you can manually set it based on CMake error information from your screen.
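A closing observation on the `getTID()` helper that PATCH 40 centralizes in `Util.cpp`: the `#define __NR_gettid 224` fallback bakes in the 32-bit x86/ARM syscall number (x86-64 uses 186), so that branch is only safe on platforms whose headers never reach it; and since Darwin's `SYS_thread_selfid` returns a globally unique thread id unrelated to the pid, the `getpid() == getTID()` main-thread test in `ThreadLocal.cpp` appears unlikely to ever hold on OS X. A more defensive sketch that prefers the header-provided constants:

```cpp
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <cstdio>

// Prefer the syscall numbers the platform headers actually provide instead
// of hard-coding __NR_gettid 224 (the 32-bit x86/ARM number; on x86-64 the
// call is 186). The Apple branch must come first: Darwin also defines a
// SYS_gettid, but it names an unrelated per-thread credentials call.
static pid_t portableGetTid() {
#if defined(__APPLE__)
  return static_cast<pid_t>(syscall(SYS_thread_selfid));
#elif defined(SYS_gettid)
  return static_cast<pid_t>(syscall(SYS_gettid));
#else
#error "no known thread-id syscall for this platform"
#endif
}

int main() {
  std::printf("tid=%d\n", static_cast<int>(portableGetTid()));
  return 0;
}
```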