Commit 68060aa

Merge branch 'develop' of https://github.com/baidu/Paddle into benchmark_cfg_doc

qingqing01 committed Nov 30, 2016
2 parents: b3945f9 + 767c4e8
Showing 188 changed files with 2,152 additions and 804 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -42,7 +42,7 @@ addons:
before_install:
- |
if [ ${JOB} == "BUILD_AND_TEST" ]; then
if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
then
echo "Only markdown docs were updated, stopping build process."
exit
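A note on the widened filter above: the build is skipped only when every changed file matches one of the documentation or image extensions. A minimal sketch of the check, with a hypothetical file list standing in for `git diff --name-only $TRAVIS_COMMIT_RANGE`:

```bash
#!/bin/bash
# Hypothetical changed-file list; CI derives it from
# `git diff --name-only $TRAVIS_COMMIT_RANGE`.
changed_files='README.md docs/intro.rst benchmark/figs/alexnet-4gpu.png'

# grep -v keeps lines that do NOT match the doc/image extensions; -q exits 0
# if at least one such line exists. The leading ! therefore fires only when
# every changed file is a doc or image, and the build is skipped.
if ! printf '%s\n' $changed_files | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'; then
    echo "Only docs were updated, stopping build process."
fi
```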
8 changes: 6 additions & 2 deletions CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 9)
set(PADDLE_PATCH_VERSION 0a0)
set(PADDLE_PATCH_VERSION 0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})

set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
@@ -36,6 +36,7 @@ option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
@@ -115,7 +116,6 @@ else()
endif(WITH_AVX)

if(WITH_DSO)
set(CUDA_LIBRARIES "")
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)

@@ -135,6 +135,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)

if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)

if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
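The new `WITH_PROFILER` option mirrors `WITH_TIMER`: when it is OFF, the `PADDLE_DISABLE_PROFILER` definition compiles the GPU profiler hooks out. A sketch of a configure step that turns both on (out-of-source build layout assumed, not taken from this commit):

```bash
# Hypothetical out-of-source build; directory layout and generator assumed.
mkdir -p build && cd build
cmake .. -DWITH_PROFILER=ON -DWITH_TIMER=ON
make -j"$(nproc)"
```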
69 changes: 69 additions & 0 deletions RELEASE.md
@@ -0,0 +1,69 @@
# Release v0.9.0

## New Features:

* New Layers
  * bilinear interpolation layer.
  * spatial pyramid-pool layer.
  * de-convolution layer.
  * maxout layer.
* Support rectangular padding, stride, window, and input for the pooling operation.
* Add `--job=time` in trainer, which can be used to print timing information without the compile option `-DWITH_TIMER=ON`; see the sketch after this list.
* Expose cost_weight/nce_layer in `trainer_config_helpers`.
* Add FAQ, concepts, h-rnn docs.
* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations
* Add usage track scripts.
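A minimal sketch of the timing job referenced above; only `--job=time` comes from this release note, while the config name and the remaining flags are illustrative assumptions:

```bash
# Hypothetical invocation: trainer_config.py and the auxiliary flags are
# assumptions for illustration, not taken from this commit.
paddle train \
    --config=trainer_config.py \
    --job=time \
    --use_gpu=1 \
    --trainer_count=1
```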

## Improvements

* Add Travis CI for Mac OS X. Enable swig unittests in Travis CI. Skip Travis CI when only docs are changed.
* Add code coverage tools.
* Refine the convolution layer to speed it up and reduce GPU memory usage.
* Speed up PyDataProvider2.
* Add ubuntu deb package build scripts.
* Make Paddle use the git-flow branching model.
* PServer now supports running with no parameter blocks.

## Bug Fixes

* Add zlib link to py_paddle.
* Add input sparse data check for sparse layers at runtime.
* Fix a bug in sparse matrix multiplication.
* Fix the floating-point overflow problem of tanh.
* Fix some nvcc compile options.
* Fix a bug in yielding dictionaries in DataProvider.
* Fix SRL demo hanging on exit.

# Release v0.8.0beta.1
New features:

* Mac OS X is supported by building from source. #138
* Both GPU and CPU versions of PaddlePaddle are supported.

* Support CUDA 8.0

* Enhance `PyDataProvider2`
  * Add dictionary yield format. `PyDataProvider2` can yield a dictionary whose keys are the data_layer names and whose values are the features.
  * Add `min_pool_size` to control the memory pool of the provider.

* Add `deb` install package & docker image for no_avx machines.
  * Especially useful for cloud computing and virtual machines.

* Automatically disable `avx` instructions in cmake when the machine's CPU doesn't support them.

* Add Parallel NN api in trainer_config_helpers.

* Add Travis CI for GitHub.

Bug fixes:

* Fix several bugs in trainer_config_helpers and complete its unittests.
* Check whether PaddlePaddle is installed before running unittests.
* Fix bugs on GTX-series GPUs.
* Fix a bug in MultinomialSampler.

More documentation has also been written since the last release.

# Release v0.8.0beta.0

PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet; this is a pre-release version.
Binary file modified benchmark/figs/alexnet-4gpu.png
Binary file modified benchmark/figs/googlenet-4gpu.png
2 changes: 1 addition & 1 deletion demo/image_classification/train.sh
@@ -24,7 +24,7 @@ paddle train \
--test_all_data_in_one_period=1 \
--use_gpu=1 \
--trainer_count=1 \
--num_passes=200 \
--num_passes=300 \
--save_dir=$output \
2>&1 | tee $log

4 changes: 1 addition & 3 deletions demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -18,7 +18,5 @@ set -x
# download the dictionary and pretrained model
for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
do
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
done
4 changes: 1 addition & 3 deletions demo/model_zoo/resnet/get_model.sh
@@ -24,9 +24,7 @@ echo "Downloading ResNet models..."

for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz
do
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
tar -xvf $file
rm $file
done
9 changes: 9 additions & 0 deletions demo/quick_start/data/README.md
@@ -0,0 +1,9 @@
This dataset consists of electronics product reviews associated with
binary labels (positive/negative) for sentiment classification.

The preprocessed data can be downloaded with the script `get_data.sh`.
The data was derived from `reviews_Electronics_5.json.gz` at

http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz

If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
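A short usage sketch of the two paths described above, assuming both scripts are run from `demo/quick_start/data` (the directory layout is inferred from this README, not spelled out in the commit):

```bash
# Fast path: fetch the already-preprocessed dataset.
cd demo/quick_start/data
./get_data.sh

# Slow path: rebuild train/test sets and the dictionary from the raw reviews.
cd proc_from_raw_data
./get_data.sh
```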
15 changes: 6 additions & 9 deletions demo/quick_start/data/get_data.sh
@@ -17,14 +17,11 @@ set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR

echo "Downloading Amazon Electronics reviews data..."
# http://jmcauley.ucsd.edu/data/amazon/
wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
# Download the preprocessed data
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz

echo "Downloading mosesdecoder..."
#https://github.com/moses-smt/mosesdecoder
wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
# Extract package
tar zxvf preprocessed_data.tar.gz

unzip master.zip
rm master.zip
echo "Done."
# Remove compressed package
rm preprocessed_data.tar.gz
1 change: 0 additions & 1 deletion demo/quick_start/data/pred.list

This file was deleted.

2 changes: 0 additions & 2 deletions demo/quick_start/data/pred.txt

This file was deleted.

demo/quick_start/data/proc_from_raw_data/get_data.sh
@@ -16,10 +16,26 @@
# 1. size of pos : neg = 1:1.
# 2. size of testing set = min(25k, len(all_data) * 0.1), the rest is the training set.
# 3. distinct train set and test set.
# 4. build dict

set -e

DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR

# Download data
echo "Downloading Amazon Electronics reviews data..."
# http://jmcauley.ucsd.edu/data/amazon/
wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
echo "Downloading mosesdecoder..."
# https://github.com/moses-smt/mosesdecoder
wget https://github.com/moses-smt/mosesdecoder/archive/master.zip

unzip master.zip
rm master.zip

##################
# Preprocess data
echo "Preprocess data..."
export LC_ALL=C
UNAME_STR=`uname`

@@ -29,11 +45,11 @@ else
SHUF_PROG='gshuf'
fi

mkdir -p data/tmp
python preprocess.py -i data/reviews_Electronics_5.json.gz
mkdir -p tmp
python preprocess.py -i reviews_Electronics_5.json.gz
# uniq and shuffle
cd data/tmp
echo 'uniq and shuffle...'
cd tmp
echo 'Uniq and shuffle...'
cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed

@@ -53,11 +69,11 @@ cat train.pos train.neg | ${SHUF_PROG} >../train.txt
cat test.pos test.neg | ${SHUF_PROG} >../test.txt

cd -
echo 'data/train.txt' > data/train.list
echo 'data/test.txt' > data/test.list
echo 'train.txt' > train.list
echo 'test.txt' > test.list

# use 30k dict
rm -rf data/tmp
mv data/dict.txt data/dict_all.txt
cat data/dict_all.txt | head -n 30001 > data/dict.txt
echo 'preprocess finished'
rm -rf tmp
mv dict.txt dict_all.txt
cat dict_all.txt | head -n 30001 > dict.txt
echo 'Done.'
demo/quick_start/data/proc_from_raw_data/preprocess.py
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
1. (remove HTML before or not)tokensizing
1. Tokenize the words and punctuation
2. pos sample : rating score 5; neg sample: rating score 1-2.
Usage:
@@ -76,7 +76,11 @@ def tokenize(sentences):
sentences : a list of input sentences.
return: a list of processed text.
"""
dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
if not os.path.exists(dir):
sys.exit(
"The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
)
tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
assert isinstance(sentences, list)
text = "\n".join(sentences)
@@ -104,7 +108,7 @@ def tokenize_batch(id):
num_batch, instance, pre_fix = parse_queue.get()
if num_batch == -1: ### parse_queue finished
tokenize_queue.put((-1, None, None))
sys.stderr.write("tokenize theread %s finish\n" % (id))
sys.stderr.write("Thread %s finish\n" % (id))
break
tokenize_instance = tokenize(instance)
tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
8 changes: 4 additions & 4 deletions demo/semantic_role_labeling/data/get_data.sh
@@ -14,10 +14,10 @@
# limitations under the License.
set -e
wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt
wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt
wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt
wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb
tar -xzvf conll05st-tests.tar.gz
rm conll05st-tests.tar.gz
cp ./conll05st-release/test.wsj/words/test.wsj.words.gz .
9 changes: 5 additions & 4 deletions demo/semantic_role_labeling/dataprovider.py
@@ -25,12 +25,13 @@ def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
#all inputs are integral and sequential type
settings.slots = [
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(predicate_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)), integer_value_sequence(2),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(predicate_dict)),
integer_value_sequence(2),
integer_value_sequence(len(label_dict))
]

@@ -63,5 +64,5 @@ def process(settings, file_name):

label_list = label.split()
label_slot = [settings.label_dict.get(w) for w in label_list]
yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
yield word_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot
10 changes: 3 additions & 7 deletions demo/semantic_role_labeling/predict.py
@@ -55,18 +55,14 @@ def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_

slots = [
integer_value_sequence(len_dict),
integer_value_sequence(len_pred),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_pred),
integer_value_sequence(2)
]
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(2)
]
self.converter = DataProviderConverter(slots)

def load_dict_label(self, dict_file, label_file, predicate_dict_file):
@@ -104,8 +100,8 @@ def get_data(self, data_file):
marks = mark.split()
mark_slot = [int(w) for w in marks]

yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot
yield word_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot

def predict(self, data_file, output_file):
"""
2 changes: 1 addition & 1 deletion demo/semantic_role_labeling/predict.sh
@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
sort | head -n 1
sort -n | head -n 1
}

log=train.log
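The `sort` → `sort -n` change fixes `get_best_pass`: its input lines start with a floating-point cost, and lexicographic order ranks `10.01` before `9.50` because it compares characters, not values. A quick demonstration with made-up costs:

```bash
# Lexicographic sort: '1' < '9' as characters, so 10.01 comes first (wrong).
printf '9.50 pass-00001\n10.01 pass-00002\n' | sort | head -n 1
# -> 10.01 pass-00002

# Numeric sort compares the leading numbers as values (correct lowest cost).
printf '9.50 pass-00001\n10.01 pass-00002\n' | sort -n | head -n 1
# -> 9.50 pass-00001
```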
2 changes: 1 addition & 1 deletion demo/semantic_role_labeling/test.sh
@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
sort | head -n 1
sort -n | head -n 1
}

log=train.log
2 changes: 1 addition & 1 deletion demo/sentiment/test.sh
@@ -17,7 +17,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
sort | head -n 1
sort -n | head -n 1
}

log=train.log
4 changes: 1 addition & 3 deletions demo/seqToseq/data/paraphrase_data.sh
@@ -16,9 +16,7 @@ set -e
set -x

# download the in-house paraphrase dataset
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz

# untar the dataset
tar -zxvf paraphrase.tar.gz
4 changes: 1 addition & 3 deletions demo/seqToseq/data/wmt14_model.sh
@@ -16,9 +16,7 @@ set -e
set -x

# download the pretrained model
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz

# untar the model
tar -zxvf wmt14_model.tar.gz
(The remaining changed files are not shown.)
