docs & examples: increase training iterations

Author: eric-haibin-lin · Date: Jul 23, 2019
Commit: f6f7f96 · Parent: c59d988
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 9 deletions.
diff --git a/docs/step-by-step-tutorial.md b/docs/step-by-step-tutorial.md
@@ -26,7 +26,7 @@ export DMLC_PS_ROOT_PORT=1234
 export EVAL_TYPE=benchmark 
 python /usr/local/byteps/launcher/launch.py \
        /usr/local/byteps/example/tensorflow/run_tensorflow_byteps.sh \
-       --model ResNet50 --num-iters 1000        
+       --model ResNet50 --num-iters 1000000        
 ```
 
 ### PyTorch
@@ -51,7 +51,7 @@ export DMLC_PS_ROOT_PORT=1234
 export EVAL_TYPE=benchmark 
 python /usr/local/byteps/launcher/launch.py \
        /usr/local/byteps/example/pytorch/start_pytorch_byteps.sh \
-       --model resnet50 --num-iters 1000      
+       --model resnet50 --num-iters 1000000      
 ```
 
 ### MXNet
@@ -166,15 +166,15 @@ If your workers use TensorFlow, you need to change the image name to `bytepsimag
 ```
 python /usr/local/byteps/launcher/launch.py \
        /usr/local/byteps/example/tensorflow/run_tensorflow_byteps.sh \
-       --model ResNet50 --num-iters 1000     
+       --model ResNet50 --num-iters 1000000     
 ```
 
 If your workers use PyTorch, you need to change the image name to `bytepsimage/worker_pytorch`, and replace the python script with
 
 ```
 python /usr/local/byteps/launcher/launch.py \
        /usr/local/byteps/example/pytorch/start_pytorch_byteps.sh \
-       --model resnet50 --num-iters 1000   
+       --model resnet50 --num-iters 1000000   
 ```
 
 ## Distributed Training with RDMA
@@ -300,13 +300,13 @@ If your workers use TensorFlow, you need to change the image name to `bytepsimag
 ```
 python /usr/local/byteps/launcher/launch.py \
        /usr/local/byteps/example/tensorflow/run_tensorflow_byteps.sh \
-       --model ResNet50 --num-iters 1000     
+       --model ResNet50 --num-iters 1000000     
 ```
 
 If your workers use PyTorch, you need to change the image name to `bytepsimage/worker_pytorch_rdma`, and replace the python script with
 
 ```
 python /usr/local/byteps/launcher/launch.py \
        /usr/local/byteps/example/pytorch/start_pytorch_byteps.sh \
-       --model resnet50 --num-iters 1000   
+       --model resnet50 --num-iters 1000000   
 ```
diff --git a/example/pytorch/train_mnist_byteps.py b/example/pytorch/train_mnist_byteps.py
@@ -14,8 +14,8 @@
                     help='input batch size for training (default: 64)')
 parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                     help='input batch size for testing (default: 1000)')
-parser.add_argument('--epochs', type=int, default=10, metavar='N',
-                    help='number of epochs to train (default: 10)')
+parser.add_argument('--epochs', type=int, default=100, metavar='N',
+                    help='number of epochs to train (default: 100)')
 parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                     help='learning rate (default: 0.01)')
 parser.add_argument('--momentum', type=float, default=0.5, metavar='M',

diff --git a/example/tensorflow/tensorflow_mnist.py b/example/tensorflow/tensorflow_mnist.py
@@ -128,7 +128,7 @@ def main(_):
         bps.BroadcastGlobalVariablesHook(0),
 
         # BytePS: adjust number of steps based on number of GPUs.
-        tf.train.StopAtStepHook(last_step=20000 // bps.size()),
+        tf.train.StopAtStepHook(last_step=200000 // bps.size()),
 
         tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                    every_n_iter=10),