From 6b4ede3d5bc66739eb142fa99435bfd7ef66ba49 Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Thu, 10 Jun 2021 07:11:25 +0000
Subject: [PATCH 1/2] update spawn doc for xpu, test=document_fix

---
 python/paddle/distributed/spawn.py | 65 +++++++++++++++---------------
 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py
index e21f142f10b36..5ca68eda88f9e 100644
--- a/python/paddle/distributed/spawn.py
+++ b/python/paddle/distributed/spawn.py
@@ -335,7 +335,7 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
     Start multiple processes with ``spawn`` method for parallel training.
 
     .. note::
-        ``spawn`` now only supports GPU collective mode.
+        ``spawn`` now only supports GPU or XPU collective mode.
 
     Args:
         func (function): The target function is called by spawned process.
             This function needs to be picklable, so it must be defined
@@ -343,28 +343,27 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
             at the top level of a module.
         args (list|tuple, optional): Arguments passed to ``func``.
         nprocs (int, optional): Number of processes to start. Default: -1.
-            when nprocs is -1, the available device will be obtained from
-            the environment variable when the model is executed: If use GPU,
-            the currently available device ID is obtained from the environment
-            variable CUDA_VISIBLE_DEVICES; If use CPU, the currently available
-            CPU number is obtained from the environment variable CPU_NUM.
-            For example, export CPU_NUM=4, if the environment variable is not set,
-            the spawn method will add default value to the environment variable
-            and set its value to 1.
+            When nprocs is -1, the available devices are obtained from the
+            environment variables when the model is executed: if GPU is used,
+            the currently available device IDs are obtained from the environment
+            variable CUDA_VISIBLE_DEVICES; if XPU is used, the currently available
+            device IDs are obtained from the environment variable XPU_VISIBLE_DEVICES.
         join (bool, optional): Perform a blocking join on all spawned processes.
             Default: True.
         daemon (bool, optional): The spawned processes' daemon flag. Default: False.
-        **options(dict, optional): Other initial parallel execution environment
-            configuration options. The following options are currently supported:
-            (1) start_method (string): the way to start a process.
-            The start method can be ``spawn`` , ``fork`` , ``forkserver`` .
-            Because the CUDA runtime does not support the ``fork`` start method,
-            when use CUDA in subprocesses, we should start process by ``spawn``
-            or ``forkserver`` method. Default: "spawn" ;
-            (2) gpus (string): The training process will run on the
-            selected gpus, such as "0,1,2,3". Default: None;
-            (3) ips (string): Paddle cluster nodes ips, such as
-            "192.168.0.16,192.168.0.17". Default: "127.0.0.1" .
+        **options(dict, optional): Other initial parallel execution environment
+            configuration options. The following options are currently supported:
+            (1) start_method (string): the way to start a process.
+            The start method can be ``spawn``, ``fork`` or ``forkserver``.
+            Because the CUDA runtime does not support the ``fork`` start method,
+            when CUDA is used in subprocesses, processes should be started by
+            the ``spawn`` or ``forkserver`` method. Default: "spawn";
+            (2) gpus (string): The training process will run on the
+            selected gpus, such as "0,1,2,3". Default: None;
+            (3) xpus (string): The training process will run on the
+            selected xpus, such as "0,1,2,3". Default: None;
+            (4) ips (string): Paddle cluster nodes ips, such as
+            "192.168.0.16,192.168.0.17". Default: "127.0.0.1".
 
     Returns:
         ``MultiprocessContext`` object, it holds the spawned processes.
@@ -384,11 +383,11 @@
                 def __init__(self):
                     super(LinearNet, self).__init__()
                     self._linear1 = nn.Linear(10, 10)
                     self._linear2 = nn.Linear(10, 1)
-                    
+
                 def forward(self, x):
                     return self._linear2(self._linear1(x))
 
-            def train(print_result=False): 
+            def train(print_result=False):
                 # 1. initialize parallel environment
                 dist.init_parallel_env()
@@ -405,43 +404,43 @@ def train(print_result=False):
                 outputs = dp_layer(inputs)
                 labels = paddle.randn([10, 1], 'float32')
                 loss = loss_fn(outputs, labels)
-                
+
                 if print_result is True:
                     print("loss:", loss.numpy())
-                
+
                 loss.backward()
 
                 adam.step()
                 adam.clear_grad()
 
-            # Usage 1: only pass function. 
-            # If your training method no need any argument, and 
-            # use all visible devices for parallel training. 
+            # Usage 1: only pass function.
+            # If your training method needs no arguments, and
+            # uses all visible devices for parallel training.
             if __name__ == '__main__':
                 dist.spawn(train)
 
             # Usage 2: pass function and arguments.
-            # If your training method need some arguments, and 
+            # If your training method needs some arguments, and
             # uses all visible devices for parallel training.
             if __name__ == '__main__':
                 dist.spawn(train, args=(True,))
 
             # Usage 3: pass function, arguments and nprocs.
-            # If your training method need some arguments, and 
+            # If your training method needs some arguments, and
            # only uses part of the visible devices for parallel training.
            # If your machine holds 8 cards {0,1,2,3,4,5,6,7},
-            # this case will use cards {0,1}; If you set 
+            # this case will use cards {0,1}; if you set
             # CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
             # cards {4,5}
             if __name__ == '__main__':
                 dist.spawn(train, args=(True,), nprocs=2)
 
             # Usage 4: pass function, arguments, nprocs and gpus.
-            # If your training method need some arguments, and 
+            # If your training method needs some arguments, and
             # only uses part of the visible devices for parallel training,
-            # but you can't set your machine's environment variable 
+            # but you can't set your machine's environment variable
             # CUDA_VISIBLE_DEVICES, such as when it is None or all cards
-            # {0,1,2,3,4,5,6,7}, you can pass `gpus` to 
+            # {0,1,2,3,4,5,6,7}, you can pass ``gpus`` to
             # select the GPU cards you want to use. For example,
             # this case will use cards {4,5} if your machine holds 8 cards.
             if __name__ == '__main__':

From 7e19c668dedd352e16555b544a5b6559321ebb4f Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Fri, 11 Jun 2021 03:32:37 +0000
Subject: [PATCH 2/2] add note for gpu and xpu, test=document_fix

---
 python/paddle/distributed/spawn.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py
index 5ca68eda88f9e..a60e4642e494d 100644
--- a/python/paddle/distributed/spawn.py
+++ b/python/paddle/distributed/spawn.py
@@ -335,7 +335,9 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options):
     Start multiple processes with ``spawn`` method for parallel training.
 
     .. note::
-        ``spawn`` now only supports GPU or XPU collective mode.
+        ``spawn`` now only supports GPU or XPU collective mode. The collective
+        modes of GPU and XPU cannot be started at the same time, so the options
+        ``gpus`` and ``xpus`` cannot be configured at the same time.
 
     Args:
         func (function): The target function is called by spawned process.
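To make the new option concrete, here is a short usage sketch that is not part of the patch itself: an XPU analogue of the docstring's Usage 4. The call shape follows the documented API; the reduced ``train`` function and the card numbers are illustrative assumptions (a Paddle build with XPU support whose visible XPU cards include indices 4 and 5).

```python
# Usage sketch (not from the patch): an XPU analogue of Usage 4 above,
# selecting XPU cards {4,5} via the new ``xpus`` option.
import paddle.distributed as dist

def train(print_result=False):
    # 1. initialize parallel environment, as in the docstring example
    dist.init_parallel_env()
    # (model setup and training steps omitted; see the docstring example)
    if print_result:
        print("rank:", dist.get_rank())

if __name__ == '__main__':
    dist.spawn(train, args=(True,), nprocs=2, xpus='4,5')
```

As in Usage 4, ``train`` must be picklable, so it is defined at the top level of the module rather than inside ``__main__``.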
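The note added in PATCH 2/2 also means a launcher that supports both device types must choose exactly one of ``gpus``/``xpus`` per call. A minimal guard sketch follows; it assumes ``paddle.is_compiled_with_cuda()`` and ``paddle.is_compiled_with_xpu()`` (both available in Paddle 2.x) accurately reflect which device the build supports, and the ``launch`` helper and its defaults are hypothetical.

```python
import paddle
import paddle.distributed as dist

def launch(train_fn, cards='0,1', nprocs=2):
    # Configure exactly one of ``gpus``/``xpus``: per the PATCH 2/2 note,
    # the two options cannot be set in the same spawn call.
    if paddle.is_compiled_with_xpu():
        dist.spawn(train_fn, args=(True,), nprocs=nprocs, xpus=cards)
    elif paddle.is_compiled_with_cuda():
        dist.spawn(train_fn, args=(True,), nprocs=nprocs, gpus=cards)
    else:
        raise RuntimeError("spawn collective mode needs a GPU or XPU build")
```

Keeping the device selection in one place like this makes the mutual exclusion easy to enforce, since no call site can ever pass both options.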