Add v2 dist benchmark vgg #7539

Merged
Commits
39 commits
373f8ba
add v2 dist benchmark vgg
typhoonzero Jan 15, 2018
27e31f6
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
typhoonzero Jan 16, 2018
bbff57e
update docker file
typhoonzero Jan 16, 2018
9ad149a
fix copyright check
typhoonzero Jan 16, 2018
311d159
add copyright for newly merged files
typhoonzero Jan 16, 2018
a0ac133
update job
typhoonzero Jan 16, 2018
b315a40
update
typhoonzero Jan 16, 2018
9f50195
update using cifar10
typhoonzero Jan 19, 2018
820ee78
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
typhoonzero Jan 19, 2018
541b42e
fix style
typhoonzero Jan 19, 2018
d3905fb
add fluid vgg16 dist test
typhoonzero Jan 19, 2018
cb34f6a
update fluid vgg16 and add readme
typhoonzero Jan 22, 2018
b38452d
fix styles
typhoonzero Jan 22, 2018
08b529a
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
typhoonzero Jan 22, 2018
900e911
fix style check
typhoonzero Jan 22, 2018
438d2ab
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
typhoonzero Jan 22, 2018
a28fd4e
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
typhoonzero Jan 23, 2018
da3b14b
Merge branch 'dist_train_benchmark_vgg16' of https://github.com/typho…
typhoonzero Jan 23, 2018
70142ae
update dist benchmark to one image
typhoonzero Jan 23, 2018
7aed1c1
Merge branch 'dist_train_benchmark_vgg16' of https://github.com/typho…
typhoonzero Jan 23, 2018
bd64719
update for today
typhoonzero Jan 29, 2018
419e4c4
modify some
gongweibao Jan 31, 2018
38b8b7f
add results
gongweibao Jan 31, 2018
cfbbb98
clean code
gongweibao Jan 31, 2018
f9db562
update results
typhoonzero Jan 31, 2018
8d9c3fc
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
typhoonzero Jan 31, 2018
d6edfd0
update points
typhoonzero Feb 1, 2018
355ecaf
fix style check
typhoonzero Feb 1, 2018
b7fbb91
follow comments
typhoonzero Feb 1, 2018
c98b40e
clean code
gongweibao Feb 1, 2018
5530212
add others
gongweibao Feb 1, 2018
ccef94a
add comments
gongweibao Feb 1, 2018
00b9aed
fix typo
gongweibao Feb 1, 2018
747df80
Merge pull request #3 from gongweibao/wuyi7539_3
typhoonzero Feb 1, 2018
7c2d32b
update dockerfile
typhoonzero Feb 1, 2018
978396e
Merge branch 'dist_train_benchmark_vgg16' of https://github.com/typho…
typhoonzero Feb 1, 2018
52df85f
fix style
typhoonzero Feb 1, 2018
0bbd7bc
follow comments
typhoonzero Feb 2, 2018
a5acad1
update docs
typhoonzero Feb 2, 2018
@@ -12,4 +12,4 @@ ENV LD_LIBRARY_PATH=/usr/local/lib
ADD reader.py /workspace/
RUN python /workspace/reader.py
Contributor:

This download basically never succeeds, so we need to add a hint telling users to use a proxy.


ADD vgg16.py /workspace/
ADD vgg16_fluid.py vgg16_v2.py /workspace/
58 changes: 58 additions & 0 deletions benchmark/cluster/vgg16/README.md
@@ -0,0 +1,58 @@
# Performance for distributed vgg16

## Test Result

### Single node single thread

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | 16.74 | - |
| PaddlePaddle v2 | - | - | 17.60 | - |
| TensorFlow | - | - | - | - |

### different batch size
Contributor:

different batch size
=>
Different Batch Size


- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | 247.40 | - | - |
Contributor (@helinwang, Jan 24, 2018):

It seems Fluid's throughput is 247.40 / 64 = 3.866 batches per second, while v2's is 256.14 / 128 = 2.001 batches per second.
The difference is huge; do you have an idea why? (Also, could you please check whether my math is correct?)

Contributor (author):

Sorry, wrong column. I'll update this PR with the full test results.
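The arithmetic in this exchange can be reproduced with a quick sketch; the throughput and batch-size figures are the ones quoted in the comment and the table above:

```python
# Quick check of the reviewer's arithmetic: convert throughput in
# samples/sec into mini-batches/sec by dividing by the batch size used.
def batches_per_sec(samples_per_sec: float, batch_size: int) -> float:
    return samples_per_sec / batch_size

fluid_bps = batches_per_sec(247.40, 64)   # Fluid column, batch size 64
v2_bps = batches_per_sec(256.14, 128)     # v2 column, batch size 128

print(f"Fluid: {fluid_bps:.3f} batches/sec")  # → Fluid: 3.866 batches/sec
print(f"v2: {v2_bps:.3f} batches/sec")        # → v2: 2.001 batches/sec
```

As the author's reply notes, the two figures were taken from different batch-size columns, so the comparison itself was not apples-to-apples.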

| PaddlePaddle v2 | - | - | 256.14 | - |
| TensorFlow | - | - | - | - |

### different pserver number
Contributor:

Different PServer Count


- Trainer Count: 100
- Batch Size: 64
- Metrics: mini-batch / sec

| PServer Count | 10 | 20 | 40 | 60 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |

### Speedup

| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |


## Steps to run the performance test

1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to the repository so kubernetes can find it.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.

Check the logs for the distributed training progress and analyze the performance.
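As a rough sketch of that analysis step, throughput can be estimated by parsing the per-batch `spent:` timings the trainer prints; the exact log-line format used here is an assumption for illustration, not necessarily what the scripts emit:

```python
import re

# Hypothetical log parser: pull the per-batch "spent: <seconds>"
# timings out of a trainer pod log and estimate samples/sec.
SPENT_RE = re.compile(r"spent:\s*([0-9.]+)")

def estimate_samples_per_sec(log_text: str, batch_size: int) -> float:
    times = [float(m) for m in SPENT_RE.findall(log_text)]
    if not times:
        raise ValueError("no per-batch timings found in log")
    avg_batch_time = sum(times) / len(times)
    return batch_size / avg_batch_time

log = ("Pass 0, Batch 0, Cost 2.31, spent: 0.52\n"
       "Pass 0, Batch 1, Cost 2.28, spent: 0.48\n")
print(round(estimate_samples_per_sec(log, 128), 1))  # → 256.0
```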

## Enable verbose logs

Edit `pserver.yaml` and `trainer.yaml` and add the environment variable `GLOG_v=3` to see what happened in detail.
Contributor:

I'm not sure whether we also need to add GLOG_logtostderr=1; if you have tested it, please ignore this comment.

15 changes: 0 additions & 15 deletions benchmark/cluster/vgg16/fluid/README.md

This file was deleted.

@@ -14,7 +14,7 @@ spec:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
@@ -33,7 +33,7 @@ spec:
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
@@ -53,7 +53,7 @@ spec:
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64"
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
@@ -15,7 +15,7 @@ spec:
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_fluid"]
env:
@@ -30,7 +30,7 @@ spec:
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
@@ -50,7 +50,7 @@ spec:
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64"
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
7 changes: 0 additions & 7 deletions benchmark/cluster/vgg16/v2/Dockerfile

This file was deleted.

70 changes: 0 additions & 70 deletions benchmark/cluster/vgg16/v2/reader.py

This file was deleted.

@@ -14,7 +14,7 @@ spec:
- name: job-registry-secret
containers:
- name: pserver
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
ports:
- name: jobport-30236
@@ -49,7 +49,7 @@ spec:
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64"
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
@@ -15,40 +15,42 @@ spec:
hostNetwork: true
containers:
- name: trainer
image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
imagePullPolicy: Always
command: ["paddle_k8s", "start_trainer", "v2"]
env:
- name: PADDLE_JOB_NAME
value: vgg16v2job
- name: BATCH_SIZE
value: "128"
- name: TRAINERS
value: "20"
- name: PSERVERS
value: "10"
- name: TOPOLOGY
value: ""
- name: ENTRY
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
Contributor:

Use `python -u` here to force unbuffered log output.

- name: TRAINER_PACKAGE
value: "/workspace"
- name: PADDLE_INIT_PORT
value: "30236"
- name: PADDLE_INIT_NICS
value: "xgbe0"
- name: PADDLE_INIT_TRAINER_COUNT
value: "1"
value: "2"
- name: PADDLE_INIT_PORTS_NUM
value: "1"
- name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
value: "1"
- name: PADDLE_INIT_NUM_GRADIENT_SERVERS
value: "20"
- name: PADDLE_INIT_NUM_PASSES
value: "1"
value: "2"
- name: PADDLE_INIT_USE_GPU
value: "0"
- name: LD_LIBRARY_PATH
value: "/usr/local/nvidia/lib64"
value: "/usr/local/lib:/usr/local/nvidia/lib64"
- name: NAMESPACE
valueFrom:
fieldRef:
@@ -16,12 +16,17 @@

import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import reader
import time
import os

DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = 128
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
BATCH_SIZE = int(BATCH_SIZE)
else:
BATCH_SIZE = 128
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0


@@ -77,14 +82,15 @@ def vgg19(input, class_dim):

def main():
global ts
paddle.init(use_gpu=False, trainer_count=1)
paddle.init(use_gpu=False)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))

extra_layers = None
learning_rate = 0.01
    # NOTE: v2 distributed training needs to average updates across trainers.
learning_rate = 1e-3 / NODE_COUNT
out = vgg16(image, class_dim=CLASS_DIM)
cost = paddle.layer.classification_cost(input=out, label=lbl)
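The learning-rate change in this hunk (dividing by `NODE_COUNT` so that summed updates behave like averaged ones) can be illustrated with a standalone sketch; the gradient values and trainer count below are made up for illustration:

```python
# Sketch of the NOTE above: if N trainers each push gradients that the
# parameter server sums, scaling each trainer's learning rate by 1/N
# makes the aggregate step equal to applying the base learning rate to
# the *averaged* gradient.
def summed_update(grads, base_lr, node_count):
    per_trainer_lr = base_lr / node_count
    return sum(per_trainer_lr * g for g in grads)

base_lr = 1e-3
grads = [0.5, 0.3]  # one gradient per trainer (illustrative values)
avg_grad_update = base_lr * sum(grads) / len(grads)
assert abs(summed_update(grads, base_lr, len(grads)) - avg_grad_update) < 1e-12
```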

Expand Down Expand Up @@ -123,7 +129,9 @@ def main():

# End batch and end pass event handler
def event_handler(event):
global ts
global ts, ts_pass
if isinstance(event, paddle.event.BeginPass):
ts_pass = time.time()
if isinstance(event, paddle.event.BeginIteration):
ts = time.time()
if isinstance(event, paddle.event.EndIteration):
@@ -132,9 +140,8 @@ def event_handler(event):
event.pass_id, event.batch_id, event.cost, event.metrics,
time.time() - ts)
if isinstance(event, paddle.event.EndPass):
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)

print "Pass %d end, spent: %f" % (event.pass_id,
time.time() - ts_pass)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
