Commit 68060aa

Merge branch 'develop' of https://github.com/baidu/Paddle into benchmark_cfg_doc

qingqing01 committed Nov 30, 2016
2 parents: b3945f9 + 767c4e8
Showing 188 changed files with 2,152 additions and 804 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -42,7 +42,7 @@ addons:
before_install:
- |
if [ ${JOB} == "BUILD_AND_TEST" ]; then
if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)'
if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
then
echo "Only markdown docs were updated, stopping build process."
exit
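A note on the widened filter above: the build is skipped only when every changed file matches one of the documentation or image extensions. A minimal sketch of the check, with a hypothetical file list standing in for `git diff --name-only $TRAVIS_COMMIT_RANGE`:

```bash
#!/bin/bash
# Hypothetical changed-file list; CI derives it from
# `git diff --name-only $TRAVIS_COMMIT_RANGE`.
changed_files='README.md docs/intro.rst benchmark/figs/alexnet-4gpu.png'

# grep -v keeps lines that do NOT match the doc/image extensions; -q exits 0
# if at least one such line exists. The leading ! therefore fires only when
# every changed file is a doc or image, and the build is skipped.
if ! printf '%s\n' $changed_files | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'; then
    echo "Only docs were updated, stopping build process."
fi
```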
8 changes: 6 additions & 2 deletions CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 9)
set(PADDLE_PATCH_VERSION 0a0)
set(PADDLE_PATCH_VERSION 0)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})

set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
@@ -36,6 +36,7 @@ option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
@@ -115,7 +116,6 @@ else()
endif(WITH_AVX)

if(WITH_DSO)
set(CUDA_LIBRARIES "")
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)

@@ -135,6 +135,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)

if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)

if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
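The new `WITH_PROFILER` option mirrors `WITH_TIMER`: when it is OFF, the `PADDLE_DISABLE_PROFILER` definition compiles the GPU profiler hooks out. A sketch of a configure step that turns both on (out-of-source build layout assumed, not taken from this commit):

```bash
# Hypothetical out-of-source build; directory layout and generator assumed.
mkdir -p build && cd build
cmake .. -DWITH_PROFILER=ON -DWITH_TIMER=ON
make -j"$(nproc)"
```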
69 changes: 69 additions & 0 deletions RELEASE.md
@@ -0,0 +1,69 @@
# Release v0.9.0

## New Features:

* New Layers
  * bilinear interpolation layer.
  * spatial pyramid-pool layer.
  * de-convolution layer.
  * maxout layer.
* Support rectangular padding, stride, window, and input for the pooling operation.
* Add `--job=time` in trainer, which can be used to print timing information without the compile option `-DWITH_TIMER=ON`; see the sketch after this list.
* Expose cost_weight/nce_layer in `trainer_config_helpers`.
* Add FAQ, concepts, h-rnn docs.
* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations
* Add usage track scripts.
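A minimal sketch of the timing job referenced above; only `--job=time` comes from this release note, while the config name and the remaining flags are illustrative assumptions:

```bash
# Hypothetical invocation: trainer_config.py and the auxiliary flags are
# assumptions for illustration, not taken from this commit.
paddle train \
    --config=trainer_config.py \
    --job=time \
    --use_gpu=1 \
    --trainer_count=1
```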

## Improvements

* Add Travis CI for Mac OS X. Enable swig unittests in Travis CI. Skip Travis CI when only docs are changed.
* Add code coverage tools.
* Refine the convolution layer to speed it up and reduce GPU memory usage.
* Speed up PyDataProvider2.
* Add ubuntu deb package build scripts.
* Make Paddle use the git-flow branching model.
* PServer now supports running with no parameter blocks.

## Bug Fixes

* Add zlib link to py_paddle.
* Add input sparse data check for sparse layers at runtime.
* Fix a bug in sparse matrix multiplication.
* Fix the floating-point overflow problem of tanh.
* Fix some nvcc compile options.
* Fix a bug in yielding dictionaries in DataProvider.
* Fix SRL demo hanging on exit.

# Release v0.8.0beta.1
New features:

* Mac OS X is supported by building from source. #138
* Both GPU and CPU versions of PaddlePaddle are supported.

* Support CUDA 8.0

* Enhance `PyDataProvider2`
  * Add dictionary yield format. `PyDataProvider2` can yield a dictionary whose keys are the data_layer names and whose values are the features.
  * Add `min_pool_size` to control the memory pool of the provider.

* Add `deb` install package & docker image for no_avx machines.
  * Especially useful for cloud computing and virtual machines.

* Automatically disable `avx` instructions in cmake when the machine's CPU doesn't support them.

* Add Parallel NN api in trainer_config_helpers.

* Add Travis CI for GitHub.

Bug fixes:

* Fix several bugs in trainer_config_helpers and complete its unittests.
* Check whether PaddlePaddle is installed before running unittests.
* Fix bugs on GTX-series GPUs.
* Fix a bug in MultinomialSampler.

More documentation has also been written since the last release.

# Release v0.8.0beta.0

PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet; this is a pre-release version.
Binary file modified benchmark/figs/alexnet-4gpu.png
Binary file modified benchmark/figs/googlenet-4gpu.png
2 changes: 1 addition & 1 deletion demo/image_classification/train.sh
@@ -24,7 +24,7 @@ paddle train \
--test_all_data_in_one_period=1 \
--use_gpu=1 \
--trainer_count=1 \
--num_passes=200 \
--num_passes=300 \
--save_dir=$output \
2>&1 | tee $log

4 changes: 1 addition & 3 deletions demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -18,7 +18,5 @@ set -x
# download the dictionary and pretrained model
for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
do
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/$file --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
done
4 changes: 1 addition & 3 deletions demo/model_zoo/resnet/get_model.sh
@@ -24,9 +24,7 @@ echo "Downloading ResNet models..."

for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz
do
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/imagenet/$file --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
tar -xvf $file
rm $file
done
9 changes: 9 additions & 0 deletions demo/quick_start/data/README.md
@@ -0,0 +1,9 @@
This dataset consists of electronics product reviews associated with
binary labels (positive/negative) for sentiment classification.

The preprocessed data can be downloaded with the script `get_data.sh`.
The data was derived from `reviews_Electronics_5.json.gz` at

http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz

If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
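A short usage sketch of the two paths described above, assuming both scripts are run from `demo/quick_start/data` (the directory layout is inferred from this README, not spelled out in the commit):

```bash
# Fast path: fetch the already-preprocessed dataset.
cd demo/quick_start/data
./get_data.sh

# Slow path: rebuild train/test sets and the dictionary from the raw reviews.
cd proc_from_raw_data
./get_data.sh
```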
15 changes: 6 additions & 9 deletions demo/quick_start/data/get_data.sh
@@ -17,14 +17,11 @@ set -e
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR

echo "Downloading Amazon Electronics reviews data..."
# http://jmcauley.ucsd.edu/data/amazon/
wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
# Download the preprocessed data
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz

echo "Downloading mosesdecoder..."
#https://github.com/moses-smt/mosesdecoder
wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
# Extract package
tar zxvf preprocessed_data.tar.gz

unzip master.zip
rm master.zip
echo "Done."
# Remove compressed package
rm preprocessed_data.tar.gz
1 change: 0 additions & 1 deletion demo/quick_start/data/pred.list

This file was deleted.

2 changes: 0 additions & 2 deletions demo/quick_start/data/pred.txt

This file was deleted.

demo/quick_start/data/proc_from_raw_data/get_data.sh
@@ -16,10 +16,26 @@
# 1. size of pos : neg = 1:1.
# 2. size of testing set = min(25k, len(all_data) * 0.1), the rest is the training set.
# 3. distinct train set and test set.
# 4. build dict

set -e

DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd $DIR

# Download data
echo "Downloading Amazon Electronics reviews data..."
# http://jmcauley.ucsd.edu/data/amazon/
wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
echo "Downloading mosesdecoder..."
# https://github.com/moses-smt/mosesdecoder
wget https://github.com/moses-smt/mosesdecoder/archive/master.zip

unzip master.zip
rm master.zip

##################
# Preprocess data
echo "Preprocess data..."
export LC_ALL=C
UNAME_STR=`uname`

@@ -29,11 +45,11 @@ else
SHUF_PROG='gshuf'
fi

mkdir -p data/tmp
python preprocess.py -i data/reviews_Electronics_5.json.gz
mkdir -p tmp
python preprocess.py -i reviews_Electronics_5.json.gz
# uniq and shuffle
cd data/tmp
echo 'uniq and shuffle...'
cd tmp
echo 'Uniq and shuffle...'
cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed

@@ -53,11 +69,11 @@ cat train.pos train.neg | ${SHUF_PROG} >../train.txt
cat test.pos test.neg | ${SHUF_PROG} >../test.txt

cd -
echo 'data/train.txt' > data/train.list
echo 'data/test.txt' > data/test.list
echo 'train.txt' > train.list
echo 'test.txt' > test.list

# use 30k dict
rm -rf data/tmp
mv data/dict.txt data/dict_all.txt
cat data/dict_all.txt | head -n 30001 > data/dict.txt
echo 'preprocess finished'
rm -rf tmp
mv dict.txt dict_all.txt
cat dict_all.txt | head -n 30001 > dict.txt
echo 'Done.'
demo/quick_start/data/proc_from_raw_data/preprocess.py
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
1. (remove HTML before or not)tokensizing
1. Tokenize the words and punctuation
2. pos sample : rating score 5; neg sample: rating score 1-2.
Usage:
@@ -76,7 +76,11 @@ def tokenize(sentences):
sentences : a list of input sentences.
return: a list of processed text.
"""
dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
if not os.path.exists(dir):
sys.exit(
"The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
)
tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
assert isinstance(sentences, list)
text = "\n".join(sentences)
@@ -104,7 +108,7 @@ def tokenize_batch(id):
num_batch, instance, pre_fix = parse_queue.get()
if num_batch == -1: ### parse_queue finished
tokenize_queue.put((-1, None, None))
sys.stderr.write("tokenize theread %s finish\n" % (id))
sys.stderr.write("Thread %s finish\n" % (id))
break
tokenize_instance = tokenize(instance)
tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
8 changes: 4 additions & 4 deletions demo/semantic_role_labeling/data/get_data.sh
@@ -14,10 +14,10 @@
# limitations under the License.
set -e
wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/verbDict.txt --no-check-certificate
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/targetDict.txt --no-check-certificate
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/wordDict.txt --no-check-certificate
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/semantic_role_labeling/emb --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt
wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt
wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt
wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb
tar -xzvf conll05st-tests.tar.gz
rm conll05st-tests.tar.gz
cp ./conll05st-release/test.wsj/words/test.wsj.words.gz .
9 changes: 5 additions & 4 deletions demo/semantic_role_labeling/dataprovider.py
@@ -25,12 +25,13 @@ def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
#all inputs are integral and sequential type
settings.slots = [
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(predicate_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(word_dict)), integer_value_sequence(2),
integer_value_sequence(len(word_dict)),
integer_value_sequence(len(predicate_dict)),
integer_value_sequence(2),
integer_value_sequence(len(label_dict))
]

@@ -63,5 +64,5 @@ def process(settings, file_name):

label_list = label.split()
label_slot = [settings.label_dict.get(w) for w in label_list]
yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
yield word_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot, label_slot
10 changes: 3 additions & 7 deletions demo/semantic_role_labeling/predict.py
@@ -55,18 +55,14 @@ def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_

slots = [
integer_value_sequence(len_dict),
integer_value_sequence(len_pred),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_pred),
integer_value_sequence(2)
]
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(len_dict),
integer_value_sequence(len_dict), integer_value_sequence(2)
]
self.converter = DataProviderConverter(slots)

def load_dict_label(self, dict_file, label_file, predicate_dict_file):
@@ -104,8 +100,8 @@ def get_data(self, data_file):
marks = mark.split()
mark_slot = [int(w) for w in marks]

yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot
yield word_slot, ctx_n2_slot, ctx_n1_slot, \
ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot

def predict(self, data_file, output_file):
"""
2 changes: 1 addition & 1 deletion demo/semantic_role_labeling/predict.sh
@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
sort | head -n 1
sort -n | head -n 1
}

log=train.log
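The `sort` → `sort -n` change fixes `get_best_pass`: its input lines start with a floating-point cost, and lexicographic order ranks `10.01` before `9.50` because it compares characters, not values. A quick demonstration with made-up costs:

```bash
# Lexicographic sort: '1' < '9' as characters, so 10.01 comes first (wrong).
printf '9.50 pass-00001\n10.01 pass-00002\n' | sort | head -n 1
# -> 10.01 pass-00002

# Numeric sort compares the leading numbers as values (correct lowest cost).
printf '9.50 pass-00001\n10.01 pass-00002\n' | sort -n | head -n 1
# -> 9.50 pass-00001
```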
2 changes: 1 addition & 1 deletion demo/semantic_role_labeling/test.sh
@@ -18,7 +18,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
sort | head -n 1
sort -n | head -n 1
}

log=train.log
2 changes: 1 addition & 1 deletion demo/sentiment/test.sh
@@ -17,7 +17,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
sort | head -n 1
sort -n | head -n 1
}

log=train.log
4 changes: 1 addition & 3 deletions demo/seqToseq/data/paraphrase_data.sh
@@ -16,9 +16,7 @@ set -e
set -x

# download the in-house paraphrase dataset
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/embedding/paraphrase.tar.gz --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/paraphrase.tar.gz

# untar the dataset
tar -zxvf paraphrase.tar.gz
4 changes: 1 addition & 3 deletions demo/seqToseq/data/wmt14_model.sh
@@ -16,9 +16,7 @@ set -e
set -x

# download the pretrained model
# following is the google drive address
# you can also directly download from https://pan.baidu.com/s/1o8q577s
wget https://www.googledrive.com/host/0B7Q8d52jqeI9ejh6Q1RpMTFQT1k/wmt14_model.tar.gz --no-check-certificate
wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz

# untar the model
tar -zxvf wmt14_model.tar.gz
(The remaining changed files are not shown.)
