Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[rabit harden] fix rabit tests #81

Merged
merged 20 commits into from
Mar 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ env:
- TASK=lint LINT_LANG=cpp
- TASK=lint LINT_LANG=python
- TASK=doc
- TASK=build CXX=g++
- TASK=test CXX=g++
- TASK=build
- TASK=test

# dependent apt packages
dist: xenial
addons:
apt:
packages:
Expand All @@ -21,6 +22,8 @@ addons:
- libcurl4-openssl-dev
- unzip
- python-numpy
- gcc-4.8
- g++-4.8

before_install:
- git clone https://github.com/dmlc/dmlc-core
Expand Down
41 changes: 27 additions & 14 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,24 +1,37 @@
OS := $(shell uname)

ifeq ($(OS), Linux)
ifndef CXX
export CXX = g++
endif
export MPICXX = mpicxx
export LDFLAGS= -Llib -lrt

endif
export LDFLAGS= -Llib

ifeq ($(OS), FreeBSD)
ifndef CXX
export CXX = g++6
endif
export MPICXX = /usr/local/mpi/bin/mpicxx
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
OS := $(shell uname)

ifeq ($(OS), Darwin)
ifndef CC
export CC = $(if $(shell which clang), clang, gcc)
endif
ifndef CXX
export CXX = $(if $(shell which clang++), clang++, g++)
endif
else
ifeq ($(OS), FreeBSD)
ifndef CXX
export CXX = g++6
endif
export MPICXX = /usr/local/mpi/bin/mpicxx
export LDFLAGS= -Llib -Wl,-rpath=/usr/local/lib/gcc6
else
# linux defaults
ifndef CC
export CC = gcc
endif
ifndef CXX
export CXX = g++
endif
LDFLAGS += -lrt
endif
endif

export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++0x
export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++11
export CFLAGS = -O3 $(WARNFLAGS)

#----------------------------
Expand Down
2 changes: 1 addition & 1 deletion include/dmlc/io.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <ostream>
#include <streambuf>

#include "dmlc/base.h"
#include "base.h"

// include uint64_t only to make io standalone
#ifdef _MSC_VER
Expand Down
2 changes: 1 addition & 1 deletion include/rabit/serializable.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <vector>
#include <string>
#include "./internal/utils.h"
#include "dmlc/io.h"
#include "../dmlc/io.h"

namespace rabit {
/*!
Expand Down
12 changes: 7 additions & 5 deletions scripts/travis_runtest.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/bin/bash
#make -f test.mk model_recover_10_10k || exit -1
#make -f test.mk model_recover_10_10k_die_same || exit -1
#make -f test.mk local_recover_10_10k || exit -1
#make -f test.mk lazy_recover_10_10k_die_hard || exit -1
#make -f test.mk lazy_recover_10_10k_die_same || exit -1

make -f test.mk model_recover_10_10k || exit -1
make -f test.mk model_recover_10_10k_die_same || exit -1
make -f test.mk model_recover_10_10k_die_hard || exit -1
make -f test.mk local_recover_10_10k || exit -1
make -f test.mk lazy_recover_10_10k_die_hard || exit -1
make -f test.mk lazy_recover_10_10k_die_same || exit -1
make -f test.mk ringallreduce_10_10k || exit -1
7 changes: 6 additions & 1 deletion src/allreduce_robust.cc
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,9 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model,
if (num_local_replica != 0) {
while (true) {
if (RecoverExec(NULL, 0, 0, ActionSummary::kLocalCheckPoint)) break;
// save model model to new version place
// save model to new version place
int new_version = !local_chkpt_version;

local_chkpt[new_version].clear();
utils::MemoryBufferStream fs(&local_chkpt[new_version]);
if (local_model != NULL) {
Expand All @@ -296,6 +297,7 @@ void AllreduceRobust::CheckPoint_(const Serializable *global_model,
if (lazy_checkpt) {
global_lazycheck = global_model;
} else {
printf("[%d] save global checkpoint #%d \n", this->rank, version_number);
global_checkpoint.resize(0);
utils::MemoryBufferStream fs(&global_checkpoint);
fs.Write(&version_number, sizeof(version_number));
Expand Down Expand Up @@ -737,6 +739,9 @@ AllreduceRobust::ReturnType AllreduceRobust::TryLoadCheckPoint(bool requester) {
succ = TryRecoverLocalState(&local_rptr[local_chkpt_version],
&local_chkpt[local_chkpt_version]);
if (succ != kSuccess) return succ;

printf("[%d] recovered from local checkpoint version %d \n", this->rank, local_chkpt_version);

int nlocal = std::max(static_cast<int>(local_rptr[local_chkpt_version].size()) - 1, 0);
// check if everyone is OK
unsigned state = 0;
Expand Down
25 changes: 21 additions & 4 deletions test/Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,25 @@
export CC = gcc
export CXX = g++
export MPICXX = mpicxx
export LDFLAGS= -L../lib -pthread -lm -lrt
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++0x
export LDFLAGS= -L../lib -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++11

OS := $(shell uname)

ifeq ($(OS), Darwin)
ifndef CC
export CC = $(if $(shell which clang), clang, gcc)
endif
ifndef CXX
export CXX = $(if $(shell which clang++), clang++, g++)
endif
else
ifndef CC
export CC = gcc
endif
ifndef CXX
export CXX = g++
endif
LDFLAGS += -lrt
endif

# specify tensor path
BIN = speed_test model_recover local_recover lazy_recover
Expand Down
13 changes: 6 additions & 7 deletions test/model_recover.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cstdio>
#include <cstdlib>
#include <cmath>

using namespace rabit;

// dummy model
Expand Down Expand Up @@ -77,10 +78,9 @@ inline void TestBcast(size_t n, int root, int ntrial, int iter) {
std::string res;
if (root == rank) {
res = s;
rabit::Broadcast(&res, root);
} else {
rabit::Broadcast(&res, root);
}
rabit::Broadcast(&res, root);

utils::Check(res == s, "[%d] TestBcast fail", rank);
}

Expand All @@ -104,10 +104,9 @@ int main(int argc, char *argv[]) {
int iter = rabit::LoadCheckPoint(&model);
if (iter == 0) {
model.InitModel(n);
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
} else {
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);
chenqin marked this conversation as resolved.
Show resolved Hide resolved
}
printf("[%d] reload-trail=%d, init iter=%d\n", rank, ntrial, iter);

for (int r = iter; r < 3; ++r) {
TestMax(&model, ntrial, r);
printf("[%d] !!!TestMax pass, iter=%d\n", rank, r);
Expand All @@ -119,7 +118,7 @@ int main(int argc, char *argv[]) {
TestSum(&model, ntrial, r);
printf("[%d] !!!TestSum pass, iter=%d\n", rank, r);
rabit::CheckPoint(&model);
printf("[%d] !!!CheckPont pass, iter=%d\n", rank, r);
printf("[%d] !!!Checkpoint pass, iter=%d\n", rank, r);
}
rabit::Finalize();
return 0;
Expand Down
15 changes: 7 additions & 8 deletions test/test.mk
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,26 @@
all: model_recover_10_10k model_recover_10_10k_die_same model_recover_10_10k_die_hard local_recover_10_10k

# this experiment tests recovery with actual process exit; use keepalive to keep the program alive
# TODO: enable those tests once we fix issue in rabit
model_recover_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0

model_recover_10_10k_die_same:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0

model_recover_10_10k_die_hard:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 model_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0

local_recover_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 local_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1

pylocal_recover_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 ./local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 local_recover.py 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=1,1,1,1

lazy_recover_10_10k_die_hard:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=1,1,1,1 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0 mock=8,1,2,0 mock=4,1,3,0

lazy_recover_10_10k_die_same:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 --local-num-attempt=10 lazy_recover 10000 mock=0,0,1,0 mock=1,1,1,0 mock=0,1,1,0 mock=4,1,1,0 mock=9,1,1,0

ringallreduce_10_10k:
../dmlc-core/tracker/dmlc-submit --cluster local --num-workers=10 model_recover 100 rabit_reduce_ring_mincount=10