From 68b2c1042fabaf3c8a59639b59d3235664522e89 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Wed, 10 Nov 2021 06:28:20 +0000 Subject: [PATCH 01/30] [Auto Parallel] Add the unified cluster representation --- .../distributed/auto_parallel/cluster.py | 349 +++++++++++++++ .../unittests/test_auto_parallel_cluster.py | 413 ++++++++++++++++++ 2 files changed, 762 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/cluster.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/cluster.py new file mode 100644 index 0000000000000..c06557987e305 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/cluster.py @@ -0,0 +1,349 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +from enum import IntEnum +from enum import unique + + +@unique +class DeviceType(IntEnum): + UNKNOWN = 0 + CPU = 1 + GPU = 2 + XPU = 3 + NPU = 4 + DCU = 5 + NIC = 6 + + +@unique +class LinkType(IntEnum): + UNKNOWN = 0 + LOC = 1 + SYS = 2 + PHB = 3 + PIX = 4 + PIB = 5 + NVL = 6 + NVB = 7 + NET = 8 + + +class Device: + def __init__(self, id, machine): + self._id = id + self._machine = machine + self._type = None + # different device have different models, such as + # "Tesla V100-SXM2-32GB" and "A100-SXM4-40GB" etc. 
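+        # Note (descriptive comment, not in the original patch): the model string
+        # is taken verbatim from the cluster description file, so it may hold any
+        # vendor-specific name.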
+ self._model = None + self._dp_gflops = None + self._sp_gflops = None + # memory is stored by GB + self._memory = None + + @property + def id(self): + return self._id + + @id.setter + def id(self, value): + self._id = value + + @property + def machine(self): + return self._machine + + @machine.setter + def machine(self, value): + self._machine = value + + @property + def type(self): + return self._type + + @type.setter + def type(self, value): + self._type = value + + @property + def model(self): + return self._model + + @model.setter + def model(self, value): + self._model = value + + @property + def dp_gflops(self): + return self._dp_gflops + + @dp_gflops.setter + def dp_gflops(self, value): + self._dp_gflops = value + + @property + def sp_gflops(self): + return self._sp_gflops + + @sp_gflops.setter + def sp_gflops(self, value): + self._sp_gflops = value + + @property + def memory(self): + return self._memory + + @memory.setter + def memory(self, value): + self._memory = value + + def __str__(self): + str = "" + str += "device_id: {}, machine_id: {}, type: {}, model: {}, dp_flops: {}, sp_flops: {}, memory: {}".format( + self.id, self.machine.id, self.type.name, self.model, + self.dp_gflops, self.sp_gflops, self.memory) + return str + + def __repr__(self): + return self.__str__() + + +class Link: + def __init__(self, source, target): + self._src = source + self._tgt = target + self._type = None + # bandwidth is stored by GB/s + self._bandwidth = None + # latency is stored by millisecond + self._latency = None + + @property + def source(self): + return self._src + + @source.setter + def source(self, value): + self._source = value + + @property + def target(self): + return self._tgt + + @target.setter + def target(self, value): + self._target = value + + @property + def type(self): + return self._type + + @type.setter + def type(self, value): + self._type = value + + @property + def bandwidth(self): + return self._bandwidth + + @bandwidth.setter + def bandwidth(self, value): + self._bandwidth = value + + @property + def latency(self): + return self._latency + + @latency.setter + def latency(self, value): + self._latency = value + + def __str__(self): + str = "" + str += "source_id: {}, target_id: {}, type: {}, bandwidth: {}, latency: {}".format( + self.source.id, self.target.id, self.type, self.bandwidth, + self.latency) + return str + + def __repr__(self): + return self.__str__() + + +class Machine: + def __init__(self, id): + self._id = id + self._hostname = None + self._addr = None + self._port = None + self._devices = {} + self._links = {} + + @property + def id(self): + return self._id + + @id.setter + def id(self, value): + self._id = value + + @property + def hostname(self): + return self._hostname + + @hostname.setter + def hostname(self, value): + self._hostname = value + + @property + def addr(self): + return self._addr + + @addr.setter + def addr(self, value): + self._addr = value + + @property + def port(self): + return self._port + + @port.setter + def port(self, value): + self._port = value + + @property + def devices(self): + return self._devices + + @property + def links(self): + return self._links + + def add_device(self, device): + # Use the device id as the key + self._devices[device.id] = device + + def add_link(self, link): + # Use the source device id and target device id as the key + self._links[(link.source.id, link.target.id)] = link + + def __str__(self): + str = "" + for device in self.devices.values(): + str += ", device: {}".format(device) + for link in 
self.links.values(): + str += ", link: {}".format(link) + return str + + def __repr__(self): + return self.__str__() + + +class Cluster: + """ + The cluster is an abstract of the hardware resource for training, which contains the cluster topology and + related hardware information. It will serve the task mapping, cost model and auto searching. + """ + + def __init__(self): + # Used to compute machine id + self._num_machines = 0 + # Store all machines within the cluster + self._machines = {} + # Cluster graph topology + self._topology = None + + @property + def machines(self): + return self._machines + + def add_machine(self, machine): + assert isinstance(machine, Machine) + self._machines[machine.id] = machine + + def add_device(self, device): + assert isinstance(device, Device) + device.machine.add_device(device) + + def add_link(self, link): + assert isinstance(link, Link) + # Only add the link to the source machine + link.source.machine.add_link(link) + + def get_device(self, device_id): + device = None + for machine in self.machines.values(): + if device_id in machine.devices.keys(): + device = machine.devices[device_id] + return device + + def build_from_file(self, json_file_path): + with open(json_file_path) as json_file: + cluster_info = json.load(json_file) + machines_info = cluster_info["machines"] + for machine_info in machines_info: + machine_id = self._generate_machine_id() + machine = Machine(machine_id) + machine.hostname = machine_info.get("hostname") + machine.addr = machine_info.get("addr") + machine.port = machine_info.get("port") + devices_info = machine_info.get("devices", []) + for device_info in devices_info: + device_id = device_info.get("id") + device = Device(device_id, machine) + device_type = device_info.get("type", None) + if device_type is not None: + device_type = DeviceType[device_type] + else: + device_type = DeviceType.UNKNOWN + device.type = device_type + device.model = device_info.get("model", None) + device.dp_gflops = float(device_info.get("dp_gflops", 0)) + device.sp_gflops = float(device_info.get("sp_gflops", 0)) + device.memory = float(device_info.get("memory", 0)) + self.add_device(device) + self.add_machine(machine) + for machine_info in machines_info: + links_info = machine_info.get("links", []) + for link_info in links_info: + source_id = link_info.get("source_id") + target_id = link_info.get("target_id") + source = self.get_device(source_id) + target = self.get_device(target_id) + link = Link(source, target) + link_type = link_info.get("type", None) + if link_type is not None: + link_type = LinkType[link_type] + else: + link_type = LinkType.UNKNOWN + link.type = link_type + link.bandwidth = float(link_info.get("bandwidth", 0)) + link.latency = float(link_info.get("latency", 0)) + self.add_link(link) + + def _generate_machine_id(self): + cur_machine_id = self._num_machines + self._num_machines += 1 + return cur_machine_id + + def __str__(self): + str = "" + for machine in self.machines.values(): + str += "machine: {}\n".format(machine) + return str + + def __repr__(self): + return self.__str__() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py new file mode 100644 index 0000000000000..e217e40967753 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py @@ -0,0 +1,413 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import os +import json +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cluster import DeviceType +from paddle.distributed.auto_parallel.cluster import LinkType + +cluster_json = """ +{ + "machines": [ + { + "hostname": "machine0", + "addr": "0.0.0.1", + "port": "768", + "devices": [ + { + "id": 0, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + }, + { + "id": 1, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + }, + { + "id": 2, + "type": "CPU", + "model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH", + "arch": "x86_64", + "vendor": "GenuineIntel", + "sp_gflops": 150, + "dp_gflops": 75, + "memory": 1510 + }, + { + "id": 3, + "type": "NIC" + } + ], + "links": [ + { + "source_id": 0, + "target_id": 1, + "type": "NVL", + "bandwidth": 252 + }, + { + "source_id": 0, + "target_id": 2, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_id": 1, + "target_id": 2, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_id": 0, + "target_id": 3, + "type": "NET", + "bandwidth": 1 + }, + { + "source_id": 1, + "target_id": 3, + "type": "NET", + "bandwidth": 1 + }, + { + "source_id": 2, + "target_id": 3, + "type": "NET", + "bandwidth": 1 + }, + { + "source_id": 3, + "target_id": 7, + "type": "NET", + "bandwidth": 1 + } + ] + }, + { + "hostname": "machine1", + "addr": "0.0.0.2", + "port": "768", + "devices": [ + { + "id": 4, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "id": 5, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "id": 6, + "type": "CPU", + "model": "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G", + "arch": "x86_64", + "vendor": "GenuineIntel", + "sp_gflops": 150, + "dp_gflops": 75, + "memory": "503" + }, + { + "id": 7, + "type": "NIC" + } + ], + "links": [ + { + "source_id": 4, + "target_id": 5, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 4, + "target_id": 6, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_id": 5, + "target_id": 6, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_id": 4, + "target_id": 7, + "type": "NET", + "bandwidth": 1 + }, + { + "source_id": 5, + "target_id": 7, + "type": "NET", + "bandwidth": 1 + }, + { + "source_id": 6, + "target_id": 7, + "type": "NET", + "bandwidth": 1 + }, + { + "source_id": 7, + "target_id": 3, + "type": "NET", + "bandwidth": 1 + } + ] + } + ] +} +""" + + +class TestAutoParallelCluster(unittest.TestCase): + def test_cluster(self): + cluster_json_file = "" + cluster_json_object = json.loads(cluster_json) + with open("./auto_parallel_cluster.json", "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + + cluster = Cluster() + cluster.build_from_file("./auto_parallel_cluster.json") + 
os.remove("./auto_parallel_cluster.json") + self.assertEqual(len(cluster.machines), 2) + + # machine0 + machine0 = cluster.machines[0] + self.assertEqual(machine0.hostname, "machine0") + self.assertEqual(machine0.addr, "0.0.0.1") + self.assertEqual(machine0.port, "768") + self.assertEqual(len(machine0.devices), 4) + self.assertEqual(len(machine0.links), 7) + + # device0 + device0_machine0 = machine0.devices[0] + self.assertEqual(device0_machine0.id, 0) + self.assertEqual(device0_machine0.type, DeviceType.GPU) + self.assertEqual(device0_machine0.model, "A100-SXM4-40GB") + self.assertAlmostEqual(device0_machine0.sp_gflops, 19500) + self.assertAlmostEqual(device0_machine0.dp_gflops, 9700) + self.assertAlmostEqual(device0_machine0.memory, 40) + + # device0, link0 + link0_machine0 = machine0.links[(0, 1)] + self.assertEqual(link0_machine0.source.id, 0) + self.assertEqual(link0_machine0.target.id, 1) + self.assertEqual(link0_machine0.type, LinkType.NVL) + self.assertAlmostEqual(link0_machine0.bandwidth, 252) + self.assertAlmostEqual(link0_machine0.latency, 0) + + # device 0, link 1 + link1_machine0 = machine0.links[(0, 2)] + self.assertEqual(link1_machine0.source.id, 0) + self.assertEqual(link1_machine0.target.id, 2) + self.assertEqual(link1_machine0.type, LinkType.PHB) + self.assertAlmostEqual(link1_machine0.bandwidth, 12) + self.assertAlmostEqual(link1_machine0.latency, 0) + + # device0, link2 + link2_machine0 = machine0.links[(0, 3)] + self.assertEqual(link2_machine0.source.id, 0) + self.assertEqual(link2_machine0.target.id, 3) + self.assertEqual(link2_machine0.type, LinkType.NET) + self.assertAlmostEqual(link2_machine0.bandwidth, 1) + self.assertAlmostEqual(link2_machine0.latency, 0) + + # device1 + device1_machine0 = machine0.devices[1] + self.assertEqual(device1_machine0.id, 1) + self.assertEqual(device1_machine0.type, DeviceType.GPU) + self.assertEqual(device1_machine0.model, "A100-SXM4-40GB") + self.assertAlmostEqual(device1_machine0.sp_gflops, 19500) + self.assertAlmostEqual(device1_machine0.dp_gflops, 9700) + self.assertAlmostEqual(device1_machine0.memory, 40) + + # device1, link0 + link0_machine0 = machine0.links[(1, 2)] + self.assertEqual(link0_machine0.source.id, 1) + self.assertEqual(link0_machine0.target.id, 2) + self.assertEqual(link0_machine0.type, LinkType.PHB) + self.assertAlmostEqual(link0_machine0.bandwidth, 12) + self.assertAlmostEqual(link0_machine0.latency, 0) + + # device1, link1 + link1_machine0 = machine0.links[(1, 3)] + self.assertEqual(link1_machine0.source.id, 1) + self.assertEqual(link1_machine0.target.id, 3) + self.assertEqual(link1_machine0.type, LinkType.NET) + self.assertAlmostEqual(link1_machine0.bandwidth, 1) + self.assertAlmostEqual(link1_machine0.latency, 0) + + # device2 + device2_machine0 = machine0.devices[2] + self.assertEqual(device2_machine0.id, 2) + self.assertEqual(device2_machine0.type, DeviceType.CPU) + self.assertEqual(device2_machine0.model, + "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH") + self.assertAlmostEqual(device2_machine0.sp_gflops, 150) + self.assertAlmostEqual(device2_machine0.dp_gflops, 75) + self.assertAlmostEqual(device2_machine0.memory, 1510) + + # device2, link0 + link0_machine0 = machine0.links[(2, 3)] + self.assertEqual(link0_machine0.source.id, 2) + self.assertEqual(link0_machine0.target.id, 3) + self.assertEqual(link0_machine0.type, LinkType.NET) + self.assertAlmostEqual(link0_machine0.bandwidth, 1) + self.assertAlmostEqual(link0_machine0.latency, 0) + + # device3 + device3_machine0 = machine0.devices[3] + 
self.assertEqual(device3_machine0.id, 3) + self.assertEqual(device3_machine0.type, DeviceType.NIC) + self.assertAlmostEqual(device3_machine0.model, None) + self.assertAlmostEqual(device3_machine0.sp_gflops, 0) + self.assertAlmostEqual(device3_machine0.dp_gflops, 0) + self.assertAlmostEqual(device3_machine0.memory, 0) + + link0_machine0 = machine0.links[(3, 7)] + # device3, link0 + self.assertEqual(link0_machine0.source.id, 3) + self.assertEqual(link0_machine0.target.id, 7) + self.assertEqual(link0_machine0.type, LinkType.NET) + self.assertAlmostEqual(link0_machine0.bandwidth, 1) + self.assertAlmostEqual(link0_machine0.latency, 0) + + # machine1 + machine1 = cluster.machines[1] + self.assertEqual(len(machine1.devices), 4) + self.assertEqual(machine1.hostname, "machine1") + self.assertEqual(machine1.addr, "0.0.0.2") + self.assertEqual(machine1.port, "768") + self.assertEqual(len(machine0.devices), 4) + self.assertEqual(len(machine0.links), 7) + + # device4 + device4_machine1 = machine1.devices[4] + self.assertEqual(device4_machine1.id, 4) + self.assertEqual(device4_machine1.type, DeviceType.GPU) + self.assertEqual(device4_machine1.model, "Tesla V100-SXM2-32GB") + self.assertAlmostEqual(device4_machine1.sp_gflops, 15700) + self.assertAlmostEqual(device4_machine1.dp_gflops, 7800) + self.assertAlmostEqual(device4_machine1.memory, 32) + + # device4, link0 + link0_machine1 = machine1.links[(4, 5)] + self.assertEqual(link0_machine1.source.id, 4) + self.assertEqual(link0_machine1.target.id, 5) + self.assertEqual(link0_machine1.type, LinkType.NVL) + self.assertAlmostEqual(link0_machine1.bandwidth, 42) + self.assertAlmostEqual(link0_machine1.latency, 0) + + # device 4, link 1 + link1_machine1 = machine1.links[(4, 6)] + self.assertEqual(link1_machine1.source.id, 4) + self.assertEqual(link1_machine1.target.id, 6) + self.assertEqual(link1_machine1.type, LinkType.PHB) + self.assertAlmostEqual(link1_machine1.bandwidth, 12) + self.assertAlmostEqual(link1_machine1.latency, 0) + + # device4, link2 + link2_machine1 = machine1.links[(4, 7)] + self.assertEqual(link2_machine1.source.id, 4) + self.assertEqual(link2_machine1.target.id, 7) + self.assertEqual(link2_machine1.type, LinkType.NET) + self.assertAlmostEqual(link2_machine1.bandwidth, 1) + self.assertAlmostEqual(link2_machine1.latency, 0) + + # device5 + device5_machine1 = machine1.devices[5] + self.assertEqual(device5_machine1.id, 5) + self.assertEqual(device5_machine1.type, DeviceType.GPU) + self.assertEqual(device4_machine1.model, "Tesla V100-SXM2-32GB") + self.assertAlmostEqual(device4_machine1.sp_gflops, 15700) + self.assertAlmostEqual(device4_machine1.dp_gflops, 7800) + self.assertAlmostEqual(device4_machine1.memory, 32) + + # device5, link0 + link0_machine1 = machine1.links[(5, 6)] + self.assertEqual(link0_machine1.source.id, 5) + self.assertEqual(link0_machine1.target.id, 6) + self.assertEqual(link0_machine1.type, LinkType.PHB) + self.assertAlmostEqual(link0_machine1.bandwidth, 12) + self.assertAlmostEqual(link0_machine1.latency, 0) + + # device5, link1 + link1_machine1 = machine1.links[(5, 7)] + self.assertEqual(link1_machine1.source.id, 5) + self.assertEqual(link1_machine1.target.id, 7) + self.assertEqual(link1_machine1.type, LinkType.NET) + self.assertAlmostEqual(link1_machine1.bandwidth, 1) + self.assertAlmostEqual(link1_machine1.latency, 0) + + # device6 + device6_machine1 = machine1.devices[6] + self.assertEqual(device6_machine1.id, 6) + self.assertEqual(device6_machine1.type, DeviceType.CPU) + self.assertEqual(device6_machine1.model, + 
"Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G") + self.assertAlmostEqual(device6_machine1.sp_gflops, 150) + self.assertAlmostEqual(device6_machine1.dp_gflops, 75) + self.assertAlmostEqual(device6_machine1.memory, 503) + + # device6, link0 + link0_machine1 = machine1.links[(6, 7)] + self.assertEqual(link0_machine1.source.id, 6) + self.assertEqual(link0_machine1.target.id, 7) + self.assertEqual(link0_machine1.type, LinkType.NET) + self.assertAlmostEqual(link0_machine1.bandwidth, 1) + self.assertAlmostEqual(link0_machine1.latency, 0) + + # device7 + device7_machine1 = machine1.devices[7] + self.assertEqual(device7_machine1.id, 7) + self.assertEqual(device7_machine1.type, DeviceType.NIC) + self.assertAlmostEqual(device7_machine1.model, None) + self.assertAlmostEqual(device7_machine1.sp_gflops, 0) + self.assertAlmostEqual(device7_machine1.dp_gflops, 0) + self.assertAlmostEqual(device7_machine1.memory, 0) + + # device3, link0 + link0_machine1 = machine1.links[(7, 3)] + self.assertEqual(link0_machine1.source.id, 7) + self.assertEqual(link0_machine1.target.id, 3) + self.assertEqual(link0_machine1.type, LinkType.NET) + self.assertAlmostEqual(link0_machine1.bandwidth, 1) + self.assertAlmostEqual(link0_machine1.latency, 0) + + str = "cluster: {}".format(cluster) + + +if __name__ == '__main__': + unittest.main() From 70e188ad1d05bcc17a5b7f207290d6f0420726e5 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Wed, 10 Nov 2021 06:30:18 +0000 Subject: [PATCH 02/30] [Auto Parallel] Add the graph class for physical mapping --- .../paddle/distributed/auto_parallel/graph.py | 180 ++++++++++++++++++ .../unittests/test_auto_parallel_graph.py | 79 ++++++++ 2 files changed, 259 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/graph.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py diff --git a/python/paddle/distributed/auto_parallel/graph.py b/python/paddle/distributed/auto_parallel/graph.py new file mode 100644 index 0000000000000..c28b8bfdd5320 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/graph.py @@ -0,0 +1,180 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + + +class Node: + def __init__(self, id, **attrs): + # Each node must has a unique id + self._id = id + # Attributes for Node + self._attrs = {} + self._attrs.update(attrs) + + @property + def id(self): + return self._id + + @property + def attrs(self): + return self._attrs + + def __getitem__(self, attr_name): + return self._attrs[attr_name] + + def __setitem__(self, attr_name, attr_value): + self._attrs[attr_name] = attr_value + + def __contains__(self, attr_name): + try: + return attr_name in self._attrs + except TypeError: + return False + + def __str__(self): + str = "(id: {}, attrs: {})".format(self.id, self.attrs) + return str + + +class Edge: + def __init__(self, src_id, tgt_id, **attrs): + # The id of source node in an Edge + self._src_id = src_id + # The id of target node in an Edge + self._tgt_id = tgt_id + # Attributes for Edge + self._attrs = {} + self._attrs.update(attrs) + + @property + def src_id(self): + return self._src_id + + @property + def tgt_id(self): + return self._tgt_id + + @property + def attrs(self): + return self._attrs + + def __getitem__(self, attr_name): + return self._attrs[attr_name] + + def __setitem__(self, attr_name, attr_value): + self._attrs[attr_name] = attr_value + + def __contains__(self, attr_name): + try: + return attr_name in self._attrs + except TypeError: + return False + + def __str__(self): + str = "" + str += "(src_id: {}, tgt_id: {}, attrs: {})".format( + self.src_id, self.tgt_id, self._attrs) + return str + + +class Graph: + def __init__(self, **attrs): + # _nodes is dict for storing the nodes of the graph. + # The key of this dict is the node id. + self._nodes = {} + # _adjs is a dict of dict for storing the adjacency of the graph. + # The key of the outer dict is the node id of the source node and + # the key of the inner dict is the node id of the target node. 
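+        # For example, after add_edge(1, 2) the adjacency looks like
+        # {1: {2: Edge(1, 2)}, 2: {}} (both endpoints get node entries).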
+ self._adjs = {} + # Attributes for Graph + self._attrs = {} + self._attrs.update(attrs) + + @property + def nodes(self): + return self._nodes + + @property + def attrs(self): + return self._attrs + + @property + def adjs(self): + return self._adjs + + def add_node(self, node_id, **attrs): + if node_id is None: + raise ValueError("None cannot be a node") + if node_id not in self._nodes: + node = Node(node_id, **attrs) + self._nodes[node_id] = node + self._adjs[node_id] = {} + else: + self._nodes[node_id].attrs.update(attrs) + + def add_edge(self, src_id, tgt_id, **attrs): + # add nodes + if src_id is None: + raise ValueError("None cannot be a node") + if tgt_id is None: + raise ValueError("None cannot be a node") + if src_id not in self._nodes: + src_node = Node(src_id) + self._nodes[src_id] = src_node + self._adjs[src_id] = {} + if tgt_id not in self._nodes: + tgt_node = Node(tgt_id) + self._nodes[tgt_id] = tgt_node + self._adjs[tgt_id] = {} + # add the edge + edge = Edge(src_id, tgt_id, **attrs) + self._adjs[src_id][tgt_id] = edge + + def __len__(self): + return len(self._nodes) + + def __iter__(self): + return iter(self._nodes.values()) + + def __getitem__(self, n): + # Return the adjacency of a node + if isinstance(n, Node): + node_id = n.id + else: + node_id = n + return self._adjs[node_id] + + def __contains__(self, n): + # Check whether a node in the graph + if isinstance(n, Node): + node_id = n.id + else: + node_id = n + try: + return node_id in self._nodes + except TypeError: + return False + + def __str__(self): + str = "" + str += "**************Nodes**************\n" + for node_id in self.nodes: + str += "{}\n".format(self.nodes[node_id]) + + str += "**************Edges**************\n" + for src_id in self.adjs: + str += "--------------{}--------------\n".format(src_id) + for idx, tgt_id in enumerate(self.adjs[src_id]): + str += "{}\n".format(self.adjs[src_id][tgt_id]) + + return str diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py new file mode 100644 index 0000000000000..eee1ad3ffb991 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
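+
+# A minimal sketch of the Graph API exercised by this test (names are
+# illustrative): nodes and edges carry free-form attributes, and the adjacency
+# of a node is reached by indexing the graph itself, e.g.
+#     g = Graph(name="demo")
+#     g.add_edge(1, 2, weight=0.5)   # implicitly creates nodes 1 and 2
+#     assert 2 in g and g[1][2]["weight"] == 0.5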
+ +from __future__ import print_function + +import unittest +import os +import json +from paddle.distributed.auto_parallel.graph import Node +from paddle.distributed.auto_parallel.graph import Edge +from paddle.distributed.auto_parallel.graph import Graph + + +class TestAutoParallelGraph(unittest.TestCase): + def test_graph(self): + graph = Graph(name="foo") + self.assertEqual(graph.attrs["name"], "foo") + + graph.add_node(1, weight=1) + graph.add_node(2, weight=2) + graph.add_node(3, weight=3) + + node = graph.nodes[1] + node["info"] = "is a node" + self.assertTrue(node.id, 1) + self.assertTrue("weight" in node) + self.assertTrue("info" in node) + for node_attr in node.attrs: + self.assertTrue(node_attr in ["weight", "info"]) + + self.assertTrue(1 in graph) + self.assertTrue(2 in graph) + self.assertTrue(3 in graph) + self.assertEqual(len(graph), 3) + self.assertEqual(graph.nodes[1].id, 1) + self.assertEqual(graph.nodes[2].id, 2) + self.assertEqual(graph.nodes[3].id, 3) + for node in graph: + if node.id == 1: + self.assertEqual(node["weight"], 1) + if node.id == 2: + self.assertEqual(node["weight"], 2) + if node.id == 3: + self.assertEqual(node["weight"], 3) + + graph.add_edge(1, 2, weight=0.1) + graph.add_edge(1, 3, weight=0.2) + graph.add_edge(2, 3, weight=0.3) + + edge = graph[1][2] + edge["info"] = "is a edge" + self.assertTrue(edge.src_id, 1) + self.assertTrue(edge.tgt_id, 2) + self.assertTrue("weight" in edge) + self.assertTrue("info" in edge) + for edge_attr in edge.attrs: + self.assertTrue(edge_attr in ["weight", "info"]) + + self.assertEqual(graph[1][2]["weight"], 0.1) + self.assertEqual(graph[1][3]["weight"], 0.2) + self.assertEqual(graph[2][3]["weight"], 0.3) + + str = "{}".format(graph) + self.assertIsNotNone(str) + + +if __name__ == '__main__': + unittest.main() From 1a44c06a1f0c600051d6ba8b071715ed8d4ab932 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Wed, 10 Nov 2021 06:32:12 +0000 Subject: [PATCH 03/30] [Auto Parallel] Add the simple physical mapper --- .../distributed/auto_parallel/mapper.py | 287 ++++ .../auto_parallel/process_group.py | 69 +- .../distributed/auto_parallel/process_mesh.py | 5 + .../distributed/auto_parallel/reshard.py | 6 +- .../unittests/test_auto_parallel_mapper.py | 1260 +++++++++++++++++ 5 files changed, 1602 insertions(+), 25 deletions(-) create mode 100644 python/paddle/distributed/auto_parallel/mapper.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/mapper.py new file mode 100644 index 0000000000000..ccf288410b39b --- /dev/null +++ b/python/paddle/distributed/auto_parallel/mapper.py @@ -0,0 +1,287 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +import operator +import functools +import json +import paddle +from collections import deque +from .graph import Node +from .graph import Edge +from .graph import Graph +from .cluster import DeviceType +from .process_group import get_process_group + + +def is_collective_comm_op(op): + comm_list = [ + "c_allreduce_sum", "c_allreduce_min", "c_allreduce_max", + "c_allreduce_prod", "c_reduce_sum", "c_reduce_min", "c_reduce_max", + "c_reduce_prod", "c_broadcast", "c_allgather", "send_v2", "recv_v2" + ] + if op.type in comm_list: + return True + else: + return False + + +def is_p2p_comm_op(op): + comm_list = ["send_v2", "recv_v2"] + if op.type in comm_list: + return True + else: + return False + + +def get_dtype_bytes(dtype): + num_bytes = 0 + if dtype == paddle.float64: + num_bytes = 8 + elif dtype == paddle.float32: + num_bytes = 4 + elif dtype == paddle.float16: + num_bytes = 2 + elif dtype == paddle.bfloat16: + num_bytes = 2 + elif dtype == paddle.int64: + num_bytes = 8 + elif dtype == paddle.int32: + num_bytes = 4 + elif dtype == paddle.int16: + num_bytes = 2 + elif dtype == paddle.int8: + num_bytes = 1 + elif dtype == paddle.uint8: + num_bytes = 1 + else: + raise ValueError("Unrecognized dtype {}.".format(dtype)) + return num_bytes + + +def get_comm_volume(comm_op, src_rank, tgt_rank): + comm_volume = None + if src_rank == tgt_rank: + return comm_volume + comm_op_type = comm_op.type + if comm_op_type != "recv_v2": + tensor_name = comm_op.input_arg_names[0] + else: + tensor_name = comm_op.output_arg_names[0] + tensor = comm_op.block._find_var_recursive(tensor_name) + assert tensor is not None + tensor_shape = tensor.shape + # Skip the batch dim + new_tensor_shape = [] + for val in tensor_shape: + if val == -1: + print("Warning: -1 in the tensor shape.") + new_tensor_shape.append(1) + new_tensor_shape.append(val) + tensor_size = functools.reduce(operator.mul, new_tensor_shape, 1) + tensor_bytes = tensor_size * get_dtype_bytes(tensor.dtype) + if "c_allreduce" in comm_op_type: + comm_volume = 2 * tensor_bytes + elif "c_allgather" in comm_op_type: + comm_volume = tensor_bytes + elif "c_broadcast" in comm_op_type: + if comm_op.attr("root_id") == src_rank: + comm_volume = tensor_bytes + else: + comm_volume = None + elif "c_reduce" in comm_op_type: + if comm_op.attr("root_id") == src_rank: + comm_volume = None + else: + comm_volume = tensor_bytes + elif "send_v2" in comm_op_type: + if comm_op.attr("peer") == tgt_rank: + comm_volume = tensor_bytes + else: + comm_volume = None + elif "recv_v2" in comm_op_type: + comm_volume = None + else: + raise ValueError("Unrecognized communication operator.") + return comm_volume + + +def analyze_comm_requirements_from_op(op, rank): + comm_requirements_to_ranks = {} + if is_collective_comm_op(op): + process_group_id = op.attr("ring_id") + process_group = get_process_group(process_group_id) + if rank not in process_group.ranks: + return comm_requirements_to_ranks + for tgt_rank in process_group.ranks: + comm_volume = get_comm_volume(op, rank, tgt_rank) + if comm_volume is not None: + comm_requirements_to_ranks[tgt_rank] = {} + comm_requirements_to_ranks[tgt_rank][ + "comm_volume"] = comm_volume + elif is_p2p_comm_op(op): + tgt_rank = op.attr("peer") + comm_volume = get_comm_volume(op, rank, tgt_rank) + if comm_volume is not None: + comm_requirements_to_ranks[tgt_rank] = {} + comm_requirements_to_ranks[tgt_rank]["comm_volume"] = comm_volume + else: + 
comm_requirements_to_ranks = {} + return comm_requirements_to_ranks + + +def analyze_requirements_for_program(program, rank): + resource_requirements = {} + comm_requirements_to_ranks = {} + # only support device_type and only support GPU for now + resource_requirements["device_type"] = DeviceType.GPU + for block in program.blocks: + for op in block.ops: + cur_comm_requirements_to_ranks = analyze_comm_requirements_from_op( + op, rank) + for tgt_rank, link_info in cur_comm_requirements_to_ranks.items(): + if tgt_rank in comm_requirements_to_ranks: + comm_requirements_to_ranks[tgt_rank][ + "comm_volume"] += link_info["comm_volume"] + else: + comm_requirements_to_ranks[tgt_rank] = {} + comm_requirements_to_ranks[tgt_rank][ + "comm_volume"] = link_info["comm_volume"] + return resource_requirements, comm_requirements_to_ranks + + +def build_process_graph(distributed_program): + graph = Graph() + for src_rank, src_program in distributed_program.items(): + resource_requirements, comm_requirements_to_ranks = analyze_requirements_for_program( + src_program, src_rank) + graph.add_node(src_rank, resource_requirements=resource_requirements) + for tgt_rank, comm_requirements in comm_requirements_to_ranks.items(): + graph.add_edge( + src_rank, tgt_rank, comm_requirements=comm_requirements) + return graph + + +def build_cluster_graph(cluster): + graph = Graph() + for machine in cluster.machines.values(): + for device in machine.devices.values(): + graph.add_node(device.id, device=device) + for link in machine.links.values(): + graph.add_edge(link.source.id, link.target.id, link=link) + return graph + + +def mapping(distributed_program, cluster): + # A very simple mapping algorithm only for GPUs. + # Here we assume one process will be mapped to one GPU. + # In the future, more mapping configurations and algorithms will be supported. + process_graph = build_process_graph(distributed_program) + + cluster_graph = build_cluster_graph(cluster) + + for cur_rank_node in process_graph: + cur_rank_node["visited"] = False + + for cur_device_node in cluster_graph: + cur_device_node["occupied"] = False + + def sort_by_comm_volume(rank_edge): + return rank_edge["comm_requirements"]["comm_volume"] + + def sort_by_comm_bandwidth(device_edge): + return device_edge["link"].bandwidth + + def select_unvisited_rank_node(rank_node_list): + selected_rank_node = None + for rank_node in rank_node_list: + if rank_node["visited"] is False: + selected_rank_node = rank_node + return selected_rank_node + + queue = deque() + root_rank_node = select_unvisited_rank_node( + list(process_graph.nodes.values())) + while root_rank_node is not None: + queue.append(root_rank_node) + while queue: + cur_rank_node = queue.popleft() + if cur_rank_node["visited"]: + continue + device_type = cur_rank_node["resource_requirements"]["device_type"] + cur_device_node = None + for device_node in cluster_graph.nodes.values(): + if (device_node["device"].type == device_type) and ( + not device_node["occupied"]): + device_node["occupied"] = True + cur_rank_node["visited"] = True + cur_rank_node["device"] = device_node["device"] + cur_device_node = device_node + break + assert cur_device_node, "Cannot find a device to satisfy the requirement." 
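+            # The block below greedily extends the current assignment: neighbouring
+            # ranks are visited in ascending communication volume and matched to
+            # free devices of the required type in ascending link bandwidth, so the
+            # heaviest-communicating neighbours tend to end up on the fastest links.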
+ + nbr_rank_edges = [] + for nbr_rank_node_id, nbr_rank_edge in process_graph.adjs[ + cur_rank_node.id].items(): + assert nbr_rank_edge.src_id == cur_rank_node.id and nbr_rank_edge.tgt_id == nbr_rank_node_id + queue.append(process_graph.nodes[nbr_rank_node_id]) + nbr_rank_edges.append(nbr_rank_edge) + nbr_rank_edges.sort(key=sort_by_comm_volume) + + nbr_device_edges = [] + for nbr_device_edge in cluster_graph.adjs[ + cur_device_node.id].values(): + nbr_device_edges.append(nbr_device_edge) + nbr_device_edges.sort(key=sort_by_comm_bandwidth) + + for nbr_rank_edge in nbr_rank_edges: + src_rank_node = process_graph.nodes[nbr_rank_edge.src_id][ + "visited"] + if src_rank_node: + continue + device_type = src_rank_node["resource_requirements"][ + "device_type"] + nbr_rank_node = process_graph.nodes[nbr_rank_edge.tgt_id] + for nbr_device_edge in nbr_device_edges: + nbr_device_node = cluster_graph.nodes[ + nbr_device_edge.tgt_id] + if (nbr_device_node["device"].type == device_type) and ( + not nbr_device_node["occupied"]): + nbr_device_node["occupied"] = True + nbr_rank_node["visited"] = True + nbr_rank_node["device"] = nbr_device_node["device"] + break + root_rank_node = select_unvisited_rank_node( + list(process_graph.nodes.values())) + + rank_mapping = {} + for rank, rank_node in process_graph.nodes.items(): + device = rank_node["device"] + machine = device.machine + if machine.id in rank_mapping: + rank_mapping[machine.id]["hostname"] = machine.hostname + rank_mapping[machine.id]["addr"] = machine.addr + rank_mapping[machine.id]["port"] = machine.port + rank_mapping[machine.id]["ranks"].append(rank) + else: + rank_mapping[machine.id] = {} + rank_mapping[machine.id]["ranks"] = [] + rank_mapping[machine.id]["hostname"] = machine.hostname + rank_mapping[machine.id]["addr"] = machine.addr + rank_mapping[machine.id]["port"] = machine.port + if rank not in rank_mapping[machine.id]["ranks"]: + rank_mapping[machine.id]["ranks"].append(rank) + for mapping in rank_mapping.values(): + mapping["ranks"].sort() + + return rank_mapping diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 8bbe6f69155a4..1edbab0f51aaa 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -19,6 +19,8 @@ from ...fluid.framework import in_dygraph_mode from ...fluid.layers.tensor import fill_constant +# Note that Process group 0 is reserved for representing all ranks. +# At the begining, group 0 is empty and new ranks will be added automatically. 
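+# For example, after new_process_group([0, 1]) and new_process_group([2, 3]),
+# group 0 holds the ranks [0, 1, 2, 3].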
_g_process_group_map = {} @@ -27,25 +29,27 @@ def get_all_process_groups(): return _g_process_group_map.values() +def get_process_group(group_id): + global _g_process_group_map + return _g_process_group_map.get(group_id, None) + + def new_process_group(ranks): global _g_process_group_map - if not _g_process_group_map: - genv = _get_global_env() - _g_process_group_map["global_group"] = ProcessGroup( - 0, list(range(genv.world_size))) - # A key constructed from ranks is used in the global process group map - key = ''.join(map(str, sorted(ranks))) - if key not in _g_process_group_map: - num_groups = len(_g_process_group_map) - # Note: our process group may interfere with the original implementation - # so the created group id should start from the original _new_ring_id() - group_id = _new_ring_id() + num_groups + 1 - pg = ProcessGroup(group_id, ranks) - _g_process_group_map[key] = pg - return pg - else: - pg = _g_process_group_map[key] - return pg + # A key constructed from ranks is used for avoiding duplication + new_key = ''.join(map(str, sorted(ranks))) + for pg_id, pg in _g_process_group_map.items(): + cur_key = ''.join(map(str, sorted(pg.ranks))) + if pg_id != 0 and new_key == cur_key: + return pg + # If not matching the existing one, construt a new process group + num_groups = len(_g_process_group_map) + # Note: our process group may interfere with the original implementation + # so the created group id should start from the original _new_ring_id() + group_id = _new_ring_id() + num_groups + 1 + new_pg = ProcessGroup(group_id, ranks) + _g_process_group_map[group_id] = new_pg + return new_pg # This implementation refers to lots of Paddle/python/paddle/distributed/collective.py, @@ -56,8 +60,14 @@ def new_process_group(ranks): # handle the communication implementation choice. class ProcessGroup: def __init__(self, group_id, ranks): + if group_id == 0 and get_process_group(0) is not None: + assert group_id != 0, "Process group id 0 is reserved for all ranks." 
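+            # Only the module-level bootstrap at the bottom of this file may
+            # create group 0; any later attempt trips the assertion above.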
self._group_id = group_id self._ranks = sorted(ranks) + # Add the current ranks into group 0 + if group_id != 0: + global _g_process_group_map + _g_process_group_map[0].add_ranks(ranks) self._nranks = len(self._ranks) self._is_instantiate = False @@ -65,9 +75,15 @@ def __init__(self, group_id, ranks): def id(self): return self._group_id - # @property - # def key(self): - # return ''.join(map(str, sorted(self._ranks))) + @property + def ranks(self): + return self._ranks + + def add_ranks(self, new_ranks): + assert self.is_instantiate() == False, \ + "Cannot add new ranks after instantiating the process group" + self._ranks.extend(new_ranks) + self._ranks = sorted(list(set(self._ranks))) def local_rank(self, global_rank): if global_rank in self._ranks: @@ -113,7 +129,20 @@ def instantiate(self): self._is_instantiate = True + def __eq__(self, other): + if not isinstance(other, ProcessGroup): + return False + if self.id != other.id: + return False + return True + + def __ne__(self, other): + return not self.__eq__(other) + def __str__(self): string = "id: {}, nranks: {}, ranks: {}.".format( self.id, self._nranks, ", ".join(map(str, self._ranks))) return string + + +_g_process_group_map[0] = ProcessGroup(0, []) diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py index ecdd77f7ea754..f95951a3bad73 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh.py +++ b/python/paddle/distributed/auto_parallel/process_mesh.py @@ -93,9 +93,14 @@ def __init__(self, mesh): self._topology = _get_nested_list_shape(mesh) self._processes = processes + # Store all process meshes from .dist_context import get_default_distributed_context default_dist_cxt = get_default_distributed_context() default_dist_cxt.add_process_mesh(self) + # Add new processes to process group 0 + from .process_group import get_process_group + pg0 = get_process_group(0) + pg0.add_ranks(self.processes) @property def topology(self): diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index fb130e9deefe8..62e71844da49a 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -666,11 +666,7 @@ def _concat_partitions_with_op(partition_tensor_list, tensor, partition_index, def _init_comm_for_send_recv(): - if not _g_process_group_map: - genv = _get_global_env() - _g_process_group_map["global_group"] = ProcessGroup( - 0, list(range(genv.world_size))) - _g_process_group_map["global_group"].instantiate() + pass HAS_SENT = {} diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py new file mode 100644 index 0000000000000..adb629f83c83c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -0,0 +1,1260 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
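+
+# A rough sketch of the flow exercised below (variable names and the file path
+# are illustrative):
+#     cluster = Cluster()
+#     cluster.build_from_file("auto_parallel_cluster.json")
+#     rank_mapping = mapping(dist_programs, cluster)
+# where dist_programs maps each rank to its distributed program and rank_mapping
+# groups the assigned ranks by machine id along with hostname, addr and port.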
+ +from __future__ import print_function + +import unittest +import os +import json +import collections +import math +import paddle + +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.tensor as tensor +import paddle.utils as utils +import paddle.static as static +from paddle.fluid import layers +from paddle.fluid.framework import in_dygraph_mode +from paddle.nn.layer.transformer import _convert_param_attr_to_list +from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer +from paddle.distributed import fleet + +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.process_group import get_all_process_groups +from paddle.distributed.auto_parallel.process_group import new_process_group +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cluster import DeviceType +from paddle.distributed.auto_parallel.cluster import LinkType +from paddle.distributed.auto_parallel.mapper import build_process_graph +from paddle.distributed.auto_parallel.mapper import build_cluster_graph +from paddle.distributed.auto_parallel.mapper import mapping +from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr +from paddle.distributed.auto_parallel.utils import _get_comm_group + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +_global_num_stages = None + +cluster_json = """ +{ + "machines": [ + { + "hostname": "machine0", + "addr": "0.0.0.1", + "port": "768", + "devices": [ + { + "id": 0, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + }, + { + "id": 1, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + }, + { + "id": 2, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + }, + { + "id": 3, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + } + ], + "links": [ + { + "source_id": 0, + "target_id": 1, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 0, + "target_id": 2, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 0, + "target_id": 3, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 1, + "target_id": 0, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 1, + "target_id": 2, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 1, + "target_id": 3, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 2, + "target_id": 0, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 2, + "target_id": 1, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 2, + "target_id": 3, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 3, + "target_id": 0, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 3, + "target_id": 1, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 3, + "target_id": 2, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 3, + "target_id": 7, + "type": "NVL", + "bandwidth": 10 + } + ] + }, + { + "hostname": "machine1", + "addr": "0.0.0.2", + "port": "768", + "devices": [ + { + "id": 4, + "type": "GPU", + "model": "Tesla 
V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "id": 5, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "id": 6, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "id": 7, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + } + ], + "links": [ + { + "source_id": 4, + "target_id": 5, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 4, + "target_id": 6, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 4, + "target_id": 7, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 5, + "target_id": 4, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 5, + "target_id": 6, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 5, + "target_id": 7, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 6, + "target_id": 4, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 6, + "target_id": 5, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 6, + "target_id": 7, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 7, + "target_id": 4, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 7, + "target_id": 5, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 7, + "target_id": 6, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_id": 7, + "target_id": 3, + "type": "NVL", + "bandwidth": 10 + } + ] + } + ] +} +""" + + +class MultiHeadAttention(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. 
+ """ + + Cache = collections.namedtuple("Cache", ["k", "v"]) + StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + kdim=None, + vdim=None, + need_weights=False, + weight_attr=None, + bias_attr=None, + topo=None, + fuse=False, + stage=None): + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.need_weights = need_weights + self.fuse = fuse + + self.stage = stage + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if topo is None or topo.mp_info.size == 1: + if self.fuse: + assert self.kdim == embed_dim + assert self.vdim == embed_dim + self.qkv_proj = nn.Linear( + embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr) + else: + self.q_proj = nn.Linear( + embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) + self.k_proj = nn.Linear( + self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) + self.v_proj = nn.Linear( + self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) + self.out_proj = nn.Linear( + embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) + + def _fuse_prepare_qkv(self, query): + mix_layer = self.qkv_proj(query) + mix_layer = paddle.reshape_(mix_layer, + [0, 0, self.num_heads, 3 * self.head_dim]) + mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) + q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) + return q, k, v + + def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): + r""" + Prapares linear projected queries, keys and values for usage of subsequnt + multiple parallel attention. If `cache` is not None, using cached results + to reduce redundant calculations. + """ + q = self.q_proj(query) + + if _global_parallel_strategy == "mp": + auto.shard_tensor( + self.q_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [-1, 0] + }) + elif _global_parallel_strategy == "dp_mp": + auto.shard_tensor( + self.q_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [-1, 1] + }) + elif _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + self.q_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh[self.stage], + "dims_mapping": [-1, 1] + }) + + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) + + if isinstance(cache, self.StaticCache): + # for encoder-decoder attention in inference and has cached + k, v = cache.k, cache.v + else: + k, v = self.compute_kv(key, value) + + if isinstance(cache, self.Cache): + # for decoder self-attention in inference + k = tensor.concat([cache.k, k], axis=2) + v = tensor.concat([cache.v, v], axis=2) + if use_cache is True: + cache = self.Cache(k, v) + + return (q, k, v) if use_cache is False else (q, k, v, cache) + + def compute_kv(self, key, value): + r""" + Applies linear projection on input keys and values, then splits heads + (reshape and transpose) to get keys and values from different representation + subspaces. The results are used as key-values pairs for subsequent multiple + parallel attention. + It is part of calculations in multi-head attention, and is provided as + a method to pre-compute and prefetch these results, thus we can use them + to construct cache for inference. 
+ """ + k = self.k_proj(key) + + if _global_parallel_strategy == "mp": + auto.shard_tensor( + self.k_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [-1, 0] + }) + elif _global_parallel_strategy == "dp_mp": + auto.shard_tensor( + self.k_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [-1, 1] + }) + elif _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + self.k_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh[self.stage], + "dims_mapping": [-1, 1] + }) + + v = self.v_proj(value) + + if _global_parallel_strategy == "mp": + auto.shard_tensor( + self.v_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [-1, 0] + }) + elif _global_parallel_strategy == "dp_mp": + auto.shard_tensor( + self.v_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [-1, 1] + }) + elif _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + self.v_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh[self.stage], + "dims_mapping": [-1, 1] + }) + + k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) + return k, v + + def gen_cache(self, key, value=None, type=Cache): + """ + Generates cache for `forward` usage in inference accroding to arguments. + The generated cache is an instance of `MultiHeadAttention.Cache` or an + instance of `MultiHeadAttention.StaticCache`. + """ + if type == MultiHeadAttention.StaticCache: # static_kv + k, v = self.compute_kv(key, value) + return self.StaticCache(k, v) + elif value is None: # incremental_state + k = layers.fill_constant_batch_size_like( + input=key, + shape=[-1, self.num_heads, 0, self.head_dim], + dtype=key.dtype, + value=0) + v = layers.fill_constant_batch_size_like( + input=key, + shape=[-1, self.num_heads, 0, self.head_dim], + dtype=key.dtype, + value=0) + return self.Cache(k, v) + else: + # incremental_state with initial value, mainly for usage like UniLM + return self.Cache(key, value) + + def forward(self, + query, + key, + value, + attn_mask=None, + use_cache=False, + cache=None): + r""" + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. 
+ """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + if use_cache is False: + if self.fuse: + q, k, v = self._fuse_prepare_qkv(query) + else: + q, k, v = self._prepare_qkv(query, key, value, use_cache, cache) + else: + q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, + cache) + # scale dot product attention + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) + + if attn_mask is not None: + product = product + attn_mask + + weights = F.softmax(product) + if self.dropout: + weights = F.dropout( + weights, + self.dropout, + training=self.training, + mode="upscale_in_train") + + out = tensor.matmul(weights, v) + + # combine heads + out = tensor.transpose(out, perm=[0, 2, 1, 3]) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + if _global_parallel_strategy == "mp": + auto.shard_tensor( + self.out_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [0, -1] + }) + elif _global_parallel_strategy == "dp_mp": + auto.shard_tensor( + self.out_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [1, -1] + }) + elif _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + self.out_proj.weight, + dist_attr={ + "process_mesh": _global_process_mesh[self.stage], + "dims_mapping": [1, -1] + }) + + outs = [out] + if self.need_weights: + outs.append(weights) + if use_cache: + outs.append(cache) + return out if len(outs) == 1 else tuple(outs) + + +class TransformerDecoder(nn.Layer): + """ + TransformerDecoder is a stack of N decoder layers. + """ + + def __init__(self, + decoder_layers, + num_layers, + norm=None, + hidden_size=None, + topo=None): + super(TransformerDecoder, self).__init__() + + self.topo = topo + self.num_layers = num_layers + self.layers = decoder_layers + self.norm = norm + if norm is "LayerNorm": + self.norm = nn.LayerNorm(hidden_size) + elif norm is not None: + raise ValueError("Only support LayerNorm") + self.checkpoints = [] + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + use_cache=False, + cache=None): + r""" + Applies a stack of N Transformer decoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last decoder + layer. 
+ """ + output = tgt + new_caches = [] + self.checkpoints = [] + assert cache is None and use_cache == False + if _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + output, + dist_attr={ + "process_mesh": _global_process_mesh[0], + "dims_mapping": + [0] + [-1 for i in range(len(output.shape) - 1)] + }) + for i, mod in enumerate(self.layers): + if cache is None: + if use_cache: + output, new_cache = mod(output, + memory, + tgt_mask=tgt_mask, + use_cache=use_cache, + cache=cache) + new_caches.append(new_cache) + else: + if _global_parallel_strategy == "dp_mp_pp": + output = auto.shard_op( + mod, + dist_attr={ + "process_mesh": _global_process_mesh[mod.stage] + })(output, memory, tgt_mask, use_cache, cache)[0] + + auto.shard_tensor( + output, + dist_attr={ + "process_mesh": _global_process_mesh[mod.stage], + "dims_mapping": [0] + + [-1 for i in range(len(output.shape) - 1)] + }) + else: + output = mod(output, + memory, + tgt_mask=tgt_mask, + use_cache=use_cache, + cache=cache) + + else: + output, new_cache = mod(output, + memory, + tgt_mask=tgt_mask, + use_cache=use_cache, + cache=cache[i]) + new_caches.append(new_cache) + self.checkpoints.append(output.name) + + if self.norm is not None: + output = self.norm(output) + return output if use_cache is False else (output, new_caches) + + def gen_cache(self, memory, do_zip=False): + r""" + Generates cache for `forward` usage. The generated cache is a list, and + each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) + produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` + for more details. If `do_zip` is True, apply `zip` on these tuples to get + a list with two elements. + """ + cache = [layer.gen_cache(memory) for layer in self.layers] + if do_zip: + cache = list(zip(*cache)) + return cache + + +class TransformerDecoderLayer(nn.Layer): + """ + The transformer decoder layer. + It contains multiheadattention and some linear layers. 
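+
+    With the default `normalize_before=True`, the layer computes roughly
+    (a simplified sketch, dropouts omitted):
+
+        # tgt = tgt + self_attn(norm1(tgt), ...)
+        # tgt = tgt + linear2(gelu(linear1(norm2(tgt))))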
+ """ + + def __init__(self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="gelu", + attn_dropout=None, + act_dropout=None, + normalize_before=True, + weight_attr=None, + bias_attr=None, + topo=None, + stage=None): + self._config = locals() + self._config.pop("self") + self._config.pop("__class__", None) # py3 + + self.stage = stage + + super(TransformerDecoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + weight_attrs = _convert_param_attr_to_list(weight_attr, 3) + bias_attrs = _convert_param_attr_to_list(bias_attr, 3) + + self.self_attn = MultiHeadAttention( + d_model, + nhead, + dropout=attn_dropout, + weight_attr=weight_attrs[0], + bias_attr=bias_attrs[0], + topo=topo, + stage=self.stage) + if topo is None or topo.mp_info.size == 1: + self.linear1 = nn.Linear( + d_model, + dim_feedforward, + weight_attrs[2], + bias_attr=bias_attrs[2]) + self.linear2 = nn.Linear( + dim_feedforward, + d_model, + weight_attrs[2], + bias_attr=bias_attrs[2]) + + self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) + self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") + self.activation = getattr(F, activation) + + def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): + residual = tgt + + if self.normalize_before: + tgt = self.norm1(tgt) + + if use_cache is False: + tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) + else: + tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, + use_cache, cache) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + + if _global_parallel_strategy == "mp": + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [-1, 0] + }) + elif _global_parallel_strategy == "dp_mp": + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [-1, 1] + }) + elif _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _global_process_mesh[self.stage], + "dims_mapping": [-1, 1] + }) + + if _global_parallel_strategy == "mp": + auto.shard_tensor( + self.linear2.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [0, -1] + }) + elif _global_parallel_strategy == "dp_mp": + auto.shard_tensor( + self.linear2.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [1, -1] + }) + elif _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + self.linear2.weight, + dist_attr={ + "process_mesh": _global_process_mesh[self.stage], + "dims_mapping": [1, -1] + }) + + # tgt = self.dropout2( + # self.linear2(F.gelu( + # self.linear1(tgt), approximate=True))) + tgt = self.linear1(tgt) + tgt = F.gelu(tgt, approximate=True) + tgt = self.dropout2(self.linear2(tgt)) + tgt = residual + tgt + + if not self.normalize_before: + tgt = self.norm2(tgt) + + return tgt if use_cache is False else (tgt, incremental_cache) + + def gen_cache(self, memory): + incremental_cache = self.self_attn.gen_cache( + memory, type=self.self_attn.Cache) + return incremental_cache + + +class GPTEmbeddings(nn.Layer): + """ + Include embeddings from word, 
position and token_type embeddings + """ + + def __init__(self, + vocab_size, + hidden_size=768, + hidden_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + topo=None, + stage=None): + super(GPTEmbeddings, self).__init__() + if topo is None or topo.mp_info.size == 1: + self.word_embeddings = nn.Embedding( + vocab_size, + hidden_size, + weight_attr=paddle.ParamAttr( + name="word_embeddings", + initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range))) + self.position_embeddings = nn.Embedding( + max_position_embeddings, + hidden_size, + weight_attr=paddle.ParamAttr( + name="pos_embeddings", + initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range))) + + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, input_ids, position_ids=None): + if position_ids is None: + ones = paddle.ones_like(input_ids, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + position_ids = seq_length - ones + + input_embedings = self.word_embeddings(input_ids) + + if _global_parallel_strategy == "mp": + auto.shard_tensor( + self.word_embeddings.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [0, -1] + }) + elif _global_parallel_strategy == "dp_mp": + auto.shard_tensor( + self.word_embeddings.weight, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [1, -1] + }) + elif _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + self.word_embeddings.weight, + dist_attr={ + "process_mesh": _global_process_mesh[0], + "dims_mapping": [1, -1] + }) + + position_embeddings = self.position_embeddings(position_ids) + embeddings = input_embedings + position_embeddings + embeddings = self.dropout(embeddings) + return embeddings + + +class GPTModel(nn.Layer): + """ + The base model of gpt. 
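+
+    The decoder layers are assigned to pipeline stages evenly, e.g.
+    (illustratively) with 4 hidden layers and 2 stages:
+
+        # layer_per_stage = num_hidden_layers // _global_num_stages  # 4 // 2 = 2
+        # stage of layer i = i // layer_per_stage  # layers 0,1 -> stage 0; 2,3 -> stage 1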
+ """ + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=4, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + pad_token_id=0, + topo=None): + super(GPTModel, self).__init__() + + self.pad_token_id = pad_token_id + self.initializer_range = initializer_range + self.topo = topo + self.hidden_size = hidden_size + self.vocab_size = vocab_size + + self.pipline_mode = topo is not None and topo.pp_info.size > 1 + if self.pipline_mode: + self.layer_per_stage = num_hidden_layers // self.topo.pp_info.size + + self.embeddings = GPTEmbeddings( + vocab_size, hidden_size, hidden_dropout_prob, + max_position_embeddings, type_vocab_size, self.initializer_range, + topo) + + layer_per_stage = num_hidden_layers // _global_num_stages + decoder_layers = nn.LayerList() + for i in range(num_hidden_layers): + stage = i // layer_per_stage + DecoderLayer = TransformerDecoderLayer + decoder_layers.append( + DecoderLayer( + d_model=hidden_size, + nhead=num_attention_heads, + dim_feedforward=intermediate_size, + dropout=hidden_dropout_prob, + activation=hidden_act, + attn_dropout=attention_probs_dropout_prob, + act_dropout=hidden_dropout_prob, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal( + mean=0.0, std=self.initializer_range)), + bias_attr=None, + topo=topo, + stage=stage)) + + Decoder = TransformerDecoder + + self.decoder = Decoder( + decoder_layers, + num_hidden_layers, + norm="LayerNorm", + hidden_size=hidden_size, + topo=topo) + + self.checkpoints = [] + + def forward(self, + input_ids, + position_ids=None, + attention_mask=None, + use_cache=False, + cache=None): + self.checkpoints = [] + if attention_mask is None: + length = paddle.shape(input_ids)[1] + # Use bool mask + attention_mask = paddle.tensor.tril( + paddle.ones( + (length, length), + dtype=self.embeddings.word_embeddings.weight.dtype)) + if position_ids is None: + past_length = 0 + if cache is not None: + past_length = paddle.shape(cache[0].k)[-2] + position_ids = paddle.arange( + past_length, + paddle.shape(input_ids)[-1] + past_length, + dtype='int64') + position_ids = position_ids.unsqueeze(0) + # .expand_as(input_ids) + position_ids = paddle.fluid.layers.expand_as(position_ids, + input_ids) + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids) + + # TODO, use registered buffer + causal_mask = paddle.tensor.triu( + paddle.ones((paddle.shape(input_ids)[-1], + paddle.shape(input_ids)[-1])) * -1e9, + diagonal=1) + + if attention_mask is not None: + attention_mask = attention_mask + causal_mask + else: + attention_mask = causal_mask + + # The tensor returned by triu not in static graph. + attention_mask.stop_gradient = True + + encoder_outputs = self.decoder( + embedding_output, + memory=None, + tgt_mask=attention_mask, + use_cache=use_cache, + cache=cache) + self.checkpoints.extend(self.decoder.checkpoints) + return encoder_outputs + + +class GPTForPretraining(nn.Layer): + """ + The pretraining model of GPT. + It returns some logits and cached_kvs. 
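+
+    The logits are computed against the word-embedding weight (or a parameter
+    of the same shape), roughly (shapes are illustrative):
+
+        # logits = matmul(encoder_outputs, weight, transpose_y=True)
+        # [batch_size, seq_len, hidden_size] x [vocab_size, hidden_size]^T
+        #     -> [batch_size, seq_len, vocab_size]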
+ """ + + def __init__(self, gpt): + super(GPTForPretraining, self).__init__() + self.gpt = gpt + self.share_param = False + self.weight = self.gpt.embeddings.word_embeddings.weight + if not self.share_param: + self.weight = self.create_parameter(shape=self.weight.shape) + + def parallel_matmul(self, lm_output, logit_weights, parallel_output, topo): + if topo is not None and topo.mp_info.size > 1: + input_parallel = paddle.distributed.collective._c_identity( + lm_output, group=None) + + logits = paddle.matmul( + input_parallel, logit_weights, transpose_y=True) + + if parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=None) + else: + logits = paddle.matmul(lm_output, logit_weights, transpose_y=True) + return logits + + def forward(self, + input_ids, + position_ids=None, + attention_mask=None, + masked_positions=None, + use_cache=False, + cache=None): + outputs = self.gpt(input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + use_cache=use_cache, + cache=cache) + if use_cache: + encoder_outputs, cached_kvs = outputs[:2] + else: + encoder_outputs = outputs + logits = self.parallel_matmul(encoder_outputs, self.weight, True, + self.gpt.topo) + + if use_cache: + return logits, cached_kvs + else: + return logits + + +class GPTPretrainingCriterion(nn.Layer): + """ + Criterion for GPT. + It calculates the final loss. + """ + + def __init__(self, topo=None): + super(GPTPretrainingCriterion, self).__init__() + if topo is None or topo.mp_info.size == 1: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") + else: + self.loss_func = paddle.distributed.collective._c_softmax_with_cross_entropy + + def forward(self, prediction_scores, masked_lm_labels, loss_mask): + masked_lm_loss = self.loss_func(prediction_scores, + masked_lm_labels.unsqueeze(2)) + + loss_mask = loss_mask.reshape([-1]) + masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) + loss = masked_lm_loss / loss_mask.sum() + return loss + + +def gpt_pretrain_forward(train_program, startup_program): + with static.program_guard(train_program, + startup_program), utils.unique_name.guard(): + batch_size = 16 + sequence_len = 512 + input_ids = static.data( + name="input_ids", shape=[batch_size, sequence_len], dtype='int64') + position_ids = static.data( + name="position_ids", + shape=[batch_size, sequence_len], + dtype='int64') + attention_mask = static.data( + name="attention_mask", + shape=[batch_size, 1, sequence_len, sequence_len], + dtype='float64') + labels = static.data( + name="labels", shape=[batch_size, sequence_len], dtype='int64') + loss_mask = static.data( + name="loss_mask", shape=[batch_size, sequence_len], dtype='float64') + + if _global_parallel_strategy == "dp": + auto.shard_tensor( + input_ids, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [0, -1] + }) + elif _global_parallel_strategy == "dp_mp": + auto.shard_tensor( + input_ids, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mapping": [0, -1] + }) + elif _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + input_ids, + dist_attr={ + "process_mesh": _global_process_mesh[0], + "dims_mapping": [0, -1] + }) + + gpt = GPTModel( + vocab_size=32768, + hidden_size=768, + num_hidden_layers=2, + num_attention_heads=12, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=1024, + type_vocab_size=16, + initializer_range=0.02, + pad_token_id=0, + topo=None) + + model 
= GPTForPretraining(gpt) + + preds = model(input_ids, position_ids, attention_mask) + + criterion = GPTPretrainingCriterion() + + loss = criterion(preds, labels, loss_mask) + + return train_program, startup_program, loss + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + train_program, startup_program, loss = gpt_pretrain_forward(train_program, + startup_program) + + dist_strategy = fleet.DistributedStrategy() + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + dist_main_prog, dist_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, dist_main_prog, + dist_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + dist_main_prog, dist_startup_prog) + + return dist_main_prog, dist_startup_prog + + +def check_rank_mapping(rank_mapping): + for machine_id, mapping in rank_mapping.items(): + print("machine_id: ", rank, "mapping: ", mapping) + + +class TestAutoParallelMapper(unittest.TestCase): + def test_mapper_dp_mp(self): + cluster_json_file = "" + cluster_json_object = json.loads(cluster_json) + with open("./auto_parallel_cluster.json", "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file("./auto_parallel_cluster.json") + os.remove("./auto_parallel_cluster.json") + + global _global_parallel_strategy + _global_parallel_strategy = "dp_mp" + global _global_num_stages + _global_num_stages = 1 + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh( + mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + + dist_programs = {} + for rank_id in _global_process_mesh.processes: + train_program = static.Program() + startup_program = static.Program() + dist_context = DistributedContext() + dist_train_program, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + reshard(dist_train_program, dist_startup_prog, rank_id, + dist_context) + dist_programs[rank_id] = dist_train_program + + process_graph = build_process_graph(dist_programs) + + rank_mapping = mapping(dist_programs, cluster) + + ranks = [] + for machine_id, cur_mapping in rank_mapping.items(): + ranks.extend(cur_mapping["ranks"]) + self.assertEqual(sorted(_global_process_mesh.processes), sorted(ranks)) + + def test_mapper_dp_mp_pp(self): + cluster_json_file = "" + cluster_json_object = json.loads(cluster_json) + with open("./auto_parallel_cluster.json", "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file("./auto_parallel_cluster.json") + os.remove("./auto_parallel_cluster.json") + + global _global_parallel_strategy + _global_parallel_strategy = "dp_mp_pp" + global _global_num_stages + _global_num_stages = 2 + global _global_process_mesh + _global_process_mesh = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]] + processes = [0, 1, 2, 3, 4, 5, 6, 7] + + dist_programs = {} + for rank_id in processes: + train_program = static.Program() + startup_program = static.Program() + dist_context = DistributedContext() + dist_train_program, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + reshard(dist_train_program, dist_startup_prog, rank_id, + dist_context) + 
dist_programs[rank_id] = dist_train_program + + rank_mapping = mapping(dist_programs, cluster) + + ranks = [] + for machine_id, cur_mapping in rank_mapping.items(): + ranks.extend(cur_mapping["ranks"]) + self.assertEqual(sorted(processes), sorted(ranks)) + + +if __name__ == '__main__': + unittest.main() From b00f5fb5b40bab25bf8a28c9d7cf295c7f046d7a Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Thu, 11 Nov 2021 01:07:53 +0000 Subject: [PATCH 04/30] Set the timeout of the mapper --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 4edc675acc730..bb5974c8a6382 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1017,6 +1017,7 @@ set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120) set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) set_tests_properties(test_solve_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_auto_parallel_mapper PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) From 76498be179a2d4317047b962e33c6d096e2162bd Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Thu, 11 Nov 2021 01:14:47 +0000 Subject: [PATCH 05/30] Merge the upstream develop unittests cmake files --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index bb5974c8a6382..358de9ce0a00f 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -38,6 +38,7 @@ list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) list(APPEND DIST_TEST_OPS test_parallel_class_center_sample) list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy) list(APPEND DIST_TEST_OPS test_auto_parallel_data_unshard) +list(APPEND DIST_TEST_OPS test_auto_parallel_save_load) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. 
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -139,6 +140,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_raw_program_optimizer) LIST(REMOVE_ITEM TEST_OPS test_fleet_gradient_scale) LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler) + LIST(REMOVE_ITEM TEST_OPS test_fleet_executor) endif() # Temporally disable test_deprecated_decorator @@ -253,6 +255,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_cost_model) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -1016,7 +1019,6 @@ set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120) set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120) set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_solve_op PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_mapper PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) @@ -1033,6 +1035,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_class_center_sample PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120) + set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) From 23154726d12a01569af4ffc6738c88eac50518f5 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Thu, 11 Nov 2021 11:56:35 +0000 Subject: [PATCH 06/30] Fix a bug of the process group --- .../distributed/auto_parallel/parallelizer.py | 2 +- .../auto_parallel/process_group.py | 26 ++++++++++++------- .../distributed/auto_parallel/reshard.py | 10 ++----- .../test_auto_parallel_reshard_serial.py | 2 -- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 7a0cbd7da31c5..b44c01045597e 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -78,7 +78,7 @@ def parallelize(self, # instantiate communication by process_mapping. 
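+        # Note: a rank only needs to build communicators for the groups it
+        # belongs to; e.g. a group over ranks [0, 1] is instantiated only on
+        # ranks 0 and 1 (the ranks here are illustrative).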
all_process_groups = get_all_process_groups() for process_group in all_process_groups: - if rank not in process_group._ranks: + if rank not in process_group.ranks: continue process_group.instantiate() diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 1edbab0f51aaa..29bb3edc94c85 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -68,7 +68,6 @@ def __init__(self, group_id, ranks): if group_id != 0: global _g_process_group_map _g_process_group_map[0].add_ranks(ranks) - self._nranks = len(self._ranks) self._is_instantiate = False @property @@ -79,15 +78,22 @@ def id(self): def ranks(self): return self._ranks + @property + def nranks(self): + return len(self._ranks) + def add_ranks(self, new_ranks): - assert self.is_instantiate() == False, \ - "Cannot add new ranks after instantiating the process group" + if set(new_ranks) <= set(self.ranks): + return + else: + assert self.is_instantiate() == False, \ + "Cannot add new ranks after instantiating the process group" self._ranks.extend(new_ranks) - self._ranks = sorted(list(set(self._ranks))) + self._ranks = sorted(list(set(self.ranks))) def local_rank(self, global_rank): - if global_rank in self._ranks: - return self._ranks.index(global_rank) + if global_rank in self.ranks: + return self.ranks.index(global_rank) else: assert False, \ "Rank {} doesn't belong to this group".format(global_rank) @@ -102,12 +108,12 @@ def instantiate(self): genv = _get_global_env() global_rank = genv.rank - if self._nranks >= 2: + if self.nranks >= 2: strategy = core.ParallelStrategy() - strategy.nranks = self._nranks + strategy.nranks = self.nranks strategy.local_rank = self.local_rank(global_rank) strategy.trainer_endpoints = [ - genv.trainer_endpoints[i] for i in self._ranks + genv.trainer_endpoints[i] for i in self.ranks ] strategy.current_endpoint = genv.current_endpoint strategy.nrings = 1 @@ -141,7 +147,7 @@ def __ne__(self, other): def __str__(self): string = "id: {}, nranks: {}, ranks: {}.".format( - self.id, self._nranks, ", ".join(map(str, self._ranks))) + self.id, self.nranks, ", ".join(map(str, self.ranks))) return string diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index 62e71844da49a..b0249356eddb1 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -627,13 +627,13 @@ def _insert_fill_constant_op(block, idx): attrs={ 'ring_id': group.id, 'use_calc_stream': True, - 'nranks': group._nranks + 'nranks': group.nranks }) idx_offset += 1 # insert split op split_out = _insert_split_op(block, idx + idx_offset, allgather_out, - group._nranks) + group.nranks) idx_offset += 1 tensor_list.extend(split_out) return tensor_list, idx_offset @@ -665,10 +665,6 @@ def _concat_partitions_with_op(partition_tensor_list, tensor, partition_index, partition_tensor_list.append((tensor, partition_index)) -def _init_comm_for_send_recv(): - pass - - HAS_SENT = {} HAS_RECV = {} HAS_ALLGATHER = {} @@ -722,7 +718,6 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op, assert tensor_list, "The result of parsing allgather op should not be None." 
elif isinstance(op_desc, SendOpDesc): - _init_comm_for_send_recv() if var_name not in HAS_SENT.keys(): HAS_SENT[var_name] = [] if op_desc.dst not in HAS_SENT[var_name]: @@ -731,7 +726,6 @@ def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op, HAS_SENT[var_name].append(op_desc.dst) elif isinstance(op_desc, RecvOpDesc): - _init_comm_for_send_recv() if var_name not in HAS_RECV.keys(): HAS_RECV[var_name] = {} if op_desc.src not in HAS_RECV[var_name].keys(): diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py index 90dd0111dff3d..b4b7e50a3a206 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py @@ -166,8 +166,6 @@ def get_dist_prog_with_parallelizer(train_program, startup_program, grad_clip=None) optimizer = fleet.distributed_optimizer(optimizer) - # fake a comm group - pg = new_process_group([3, 4]) _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( loss, startup_program) From 8f3b236b82c99ce44872a63d1692b707c2cccc34 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Thu, 11 Nov 2021 13:17:48 +0000 Subject: [PATCH 07/30] Remove mapper unittest from platforms which is not GPU --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 358de9ce0a00f..7d235c69d8fab 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -141,6 +141,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_fleet_gradient_scale) LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper) endif() # Temporally disable test_deprecated_decorator @@ -1019,7 +1020,6 @@ set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120) set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120) set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_auto_parallel_mapper PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) @@ -1036,6 +1036,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120) + set_tests_properties(test_auto_parallel_mapper PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) From 95d6d3ae0c8548c9dc1c12b9ebb4ede44bbbac4b Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Fri, 12 Nov 2021 11:31:15 +0000 Subject: [PATCH 08/30] Move the instantiation of process group after resharding --- .../distributed/auto_parallel/parallelizer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git 
a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index b44c01045597e..2c0a4e581c330 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -74,14 +74,6 @@ def parallelize(self, self._optimizer, dist_params_grads, partitioned_main_prog, partitioned_startup_prog) - # Traverse different rank programs and traverse each op of them, - # instantiate communication by process_mapping. - all_process_groups = get_all_process_groups() - for process_group in all_process_groups: - if rank not in process_group.ranks: - continue - process_group.instantiate() - # The last step: remove all distributed attributes to be compatiable # with inference. self._remove_distributed_attrs(partitioned_main_prog) @@ -91,6 +83,14 @@ def parallelize(self, reshard(partitioned_main_prog, partitioned_startup_prog, rank, self._dist_context) + # Traverse different rank programs and traverse each op of them, + # instantiate communication by process_mapping. + all_process_groups = get_all_process_groups() + for process_group in all_process_groups: + if rank not in process_group.ranks: + continue + process_group.instantiate() + # Copy distributed info to the default context set_default_distributed_context(self._dist_context) From e50494f50f42dc54e10d38127b305363c56891aa Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Sun, 14 Nov 2021 09:07:56 +0000 Subject: [PATCH 09/30] Add the local id for devices --- .../distributed/auto_parallel/cluster.py | 62 ++++--- .../unittests/test_auto_parallel_cluster.py | 167 ++++++++++-------- 2 files changed, 128 insertions(+), 101 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/cluster.py index c06557987e305..23ea90071a56d 100644 --- a/python/paddle/distributed/auto_parallel/cluster.py +++ b/python/paddle/distributed/auto_parallel/cluster.py @@ -43,8 +43,9 @@ class LinkType(IntEnum): class Device: - def __init__(self, id, machine): - self._id = id + def __init__(self, global_id, local_id, machine): + self._global_id = global_id + self._local_id = local_id self._machine = machine self._type = None # different device have different models, such as @@ -56,12 +57,20 @@ def __init__(self, id, machine): self._memory = None @property - def id(self): - return self._id + def global_id(self): + return self._global_id - @id.setter - def id(self, value): - self._id = value + @global_id.setter + def global_id(self, value): + self._global_id = value + + @property + def local_id(self): + return self._local_id + + @local_id.setter + def local_id(self, value): + self._local_id = value @property def machine(self): @@ -113,9 +122,9 @@ def memory(self, value): def __str__(self): str = "" - str += "device_id: {}, machine_id: {}, type: {}, model: {}, dp_flops: {}, sp_flops: {}, memory: {}".format( - self.id, self.machine.id, self.type.name, self.model, - self.dp_gflops, self.sp_gflops, self.memory) + str += "global_id: {}, local_id: {}, machine_id: {}, type: {}, model: {}, dp_flops: {}, sp_flops: {}, memory: {}".format( + self.global_id, self.local_id, self.machine.id, self.type.name, + self.model, self.dp_gflops, self.sp_gflops, self.memory) return str def __repr__(self): @@ -174,9 +183,9 @@ def latency(self, value): def __str__(self): str = "" - str += "source_id: {}, target_id: {}, type: {}, bandwidth: {}, latency: {}".format( - self.source.id, self.target.id, self.type, self.bandwidth, - self.latency) + 
str += "source_global_id: {}, target_global_id: {}, type: {}, bandwidth: {}, latency: {}".format( + self.source.global_id, self.target.global_id, self.type, + self.bandwidth, self.latency) return str def __repr__(self): @@ -233,12 +242,12 @@ def links(self): return self._links def add_device(self, device): - # Use the device id as the key - self._devices[device.id] = device + # Use the device global_id as the key + self._devices[device.global_id] = device def add_link(self, link): - # Use the source device id and target device id as the key - self._links[(link.source.id, link.target.id)] = link + # Use the source device global_id and target device global_id as the key + self._links[(link.source.global_id, link.target.global_id)] = link def __str__(self): str = "" @@ -283,11 +292,11 @@ def add_link(self, link): # Only add the link to the source machine link.source.machine.add_link(link) - def get_device(self, device_id): + def get_device(self, device_global_id): device = None for machine in self.machines.values(): - if device_id in machine.devices.keys(): - device = machine.devices[device_id] + if device_global_id in machine.devices.keys(): + device = machine.devices[device_global_id] return device def build_from_file(self, json_file_path): @@ -302,8 +311,9 @@ def build_from_file(self, json_file_path): machine.port = machine_info.get("port") devices_info = machine_info.get("devices", []) for device_info in devices_info: - device_id = device_info.get("id") - device = Device(device_id, machine) + device_global_id = device_info.get("global_id") + device_local_id = device_info.get("local_id") + device = Device(device_global_id, device_local_id, machine) device_type = device_info.get("type", None) if device_type is not None: device_type = DeviceType[device_type] @@ -319,10 +329,10 @@ def build_from_file(self, json_file_path): for machine_info in machines_info: links_info = machine_info.get("links", []) for link_info in links_info: - source_id = link_info.get("source_id") - target_id = link_info.get("target_id") - source = self.get_device(source_id) - target = self.get_device(target_id) + source_global_id = link_info.get("source_global_id") + target_global_id = link_info.get("target_global_id") + source = self.get_device(source_global_id) + target = self.get_device(target_global_id) link = Link(source, target) link_type = link_info.get("type", None) if link_type is not None: diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py index e217e40967753..d3942716f56c2 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py @@ -30,7 +30,8 @@ "port": "768", "devices": [ { - "id": 0, + "global_id": 0, + "local_id": 0, "type": "GPU", "model": "A100-SXM4-40GB", "sp_gflops": 19500, @@ -38,7 +39,8 @@ "memory": 40 }, { - "id": 1, + "global_id": 1, + "local_id": 1, "type": "GPU", "model": "A100-SXM4-40GB", "sp_gflops": 19500, @@ -46,7 +48,8 @@ "memory": 40 }, { - "id": 2, + "global_id": 2, + "local_id": 0, "type": "CPU", "model": "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH", "arch": "x86_64", @@ -56,50 +59,51 @@ "memory": 1510 }, { - "id": 3, + "global_id": 3, + "local_id": 0, "type": "NIC" } ], "links": [ { - "source_id": 0, - "target_id": 1, + "source_global_id": 0, + "target_global_id": 1, "type": "NVL", "bandwidth": 252 }, { - "source_id": 0, - "target_id": 2, + "source_global_id": 0, + "target_global_id": 2, "type": "PHB", 
"bandwidth": 12 }, { - "source_id": 1, - "target_id": 2, + "source_global_id": 1, + "target_global_id": 2, "type": "PHB", "bandwidth": 12 }, { - "source_id": 0, - "target_id": 3, + "source_global_id": 0, + "target_global_id": 3, "type": "NET", "bandwidth": 1 }, { - "source_id": 1, - "target_id": 3, + "source_global_id": 1, + "target_global_id": 3, "type": "NET", "bandwidth": 1 }, { - "source_id": 2, - "target_id": 3, + "source_global_id": 2, + "target_global_id": 3, "type": "NET", "bandwidth": 1 }, { - "source_id": 3, - "target_id": 7, + "source_global_id": 3, + "target_global_id": 7, "type": "NET", "bandwidth": 1 } @@ -111,7 +115,8 @@ "port": "768", "devices": [ { - "id": 4, + "global_id": 4, + "local_id": 0, "type": "GPU", "model": "Tesla V100-SXM2-32GB", "sp_gflops": 15700, @@ -119,7 +124,8 @@ "memory": 32 }, { - "id": 5, + "global_id": 5, + "local_id": 1, "type": "GPU", "model": "Tesla V100-SXM2-32GB", "sp_gflops": 15700, @@ -127,7 +133,8 @@ "memory": 32 }, { - "id": 6, + "global_id": 6, + "local_id": 0, "type": "CPU", "model": "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G", "arch": "x86_64", @@ -137,50 +144,51 @@ "memory": "503" }, { - "id": 7, + "global_id": 7, + "local_id": 0, "type": "NIC" } ], "links": [ { - "source_id": 4, - "target_id": 5, + "source_global_id": 4, + "target_global_id": 5, "type": "NVL", "bandwidth": 42 }, { - "source_id": 4, - "target_id": 6, + "source_global_id": 4, + "target_global_id": 6, "type": "PHB", "bandwidth": 12 }, { - "source_id": 5, - "target_id": 6, + "source_global_id": 5, + "target_global_id": 6, "type": "PHB", "bandwidth": 12 }, { - "source_id": 4, - "target_id": 7, + "source_global_id": 4, + "target_global_id": 7, "type": "NET", "bandwidth": 1 }, { - "source_id": 5, - "target_id": 7, + "source_global_id": 5, + "target_global_id": 7, "type": "NET", "bandwidth": 1 }, { - "source_id": 6, - "target_id": 7, + "source_global_id": 6, + "target_global_id": 7, "type": "NET", "bandwidth": 1 }, { - "source_id": 7, - "target_id": 3, + "source_global_id": 7, + "target_global_id": 3, "type": "NET", "bandwidth": 1 } @@ -205,6 +213,7 @@ def test_cluster(self): # machine0 machine0 = cluster.machines[0] + self.assertEqual(machine0.id, 0) self.assertEqual(machine0.hostname, "machine0") self.assertEqual(machine0.addr, "0.0.0.1") self.assertEqual(machine0.port, "768") @@ -213,7 +222,8 @@ def test_cluster(self): # device0 device0_machine0 = machine0.devices[0] - self.assertEqual(device0_machine0.id, 0) + self.assertEqual(device0_machine0.global_id, 0) + self.assertEqual(device0_machine0.local_id, 0) self.assertEqual(device0_machine0.type, DeviceType.GPU) self.assertEqual(device0_machine0.model, "A100-SXM4-40GB") self.assertAlmostEqual(device0_machine0.sp_gflops, 19500) @@ -222,31 +232,32 @@ def test_cluster(self): # device0, link0 link0_machine0 = machine0.links[(0, 1)] - self.assertEqual(link0_machine0.source.id, 0) - self.assertEqual(link0_machine0.target.id, 1) + self.assertEqual(link0_machine0.source.global_id, 0) + self.assertEqual(link0_machine0.target.global_id, 1) self.assertEqual(link0_machine0.type, LinkType.NVL) self.assertAlmostEqual(link0_machine0.bandwidth, 252) self.assertAlmostEqual(link0_machine0.latency, 0) # device 0, link 1 link1_machine0 = machine0.links[(0, 2)] - self.assertEqual(link1_machine0.source.id, 0) - self.assertEqual(link1_machine0.target.id, 2) + self.assertEqual(link1_machine0.source.global_id, 0) + self.assertEqual(link1_machine0.target.global_id, 2) self.assertEqual(link1_machine0.type, LinkType.PHB) 
self.assertAlmostEqual(link1_machine0.bandwidth, 12) self.assertAlmostEqual(link1_machine0.latency, 0) # device0, link2 link2_machine0 = machine0.links[(0, 3)] - self.assertEqual(link2_machine0.source.id, 0) - self.assertEqual(link2_machine0.target.id, 3) + self.assertEqual(link2_machine0.source.global_id, 0) + self.assertEqual(link2_machine0.target.global_id, 3) self.assertEqual(link2_machine0.type, LinkType.NET) self.assertAlmostEqual(link2_machine0.bandwidth, 1) self.assertAlmostEqual(link2_machine0.latency, 0) # device1 device1_machine0 = machine0.devices[1] - self.assertEqual(device1_machine0.id, 1) + self.assertEqual(device1_machine0.global_id, 1) + self.assertEqual(device1_machine0.local_id, 1) self.assertEqual(device1_machine0.type, DeviceType.GPU) self.assertEqual(device1_machine0.model, "A100-SXM4-40GB") self.assertAlmostEqual(device1_machine0.sp_gflops, 19500) @@ -255,23 +266,24 @@ def test_cluster(self): # device1, link0 link0_machine0 = machine0.links[(1, 2)] - self.assertEqual(link0_machine0.source.id, 1) - self.assertEqual(link0_machine0.target.id, 2) + self.assertEqual(link0_machine0.source.global_id, 1) + self.assertEqual(link0_machine0.target.global_id, 2) self.assertEqual(link0_machine0.type, LinkType.PHB) self.assertAlmostEqual(link0_machine0.bandwidth, 12) self.assertAlmostEqual(link0_machine0.latency, 0) # device1, link1 link1_machine0 = machine0.links[(1, 3)] - self.assertEqual(link1_machine0.source.id, 1) - self.assertEqual(link1_machine0.target.id, 3) + self.assertEqual(link1_machine0.source.global_id, 1) + self.assertEqual(link1_machine0.target.global_id, 3) self.assertEqual(link1_machine0.type, LinkType.NET) self.assertAlmostEqual(link1_machine0.bandwidth, 1) self.assertAlmostEqual(link1_machine0.latency, 0) # device2 device2_machine0 = machine0.devices[2] - self.assertEqual(device2_machine0.id, 2) + self.assertEqual(device2_machine0.global_id, 2) + self.assertEqual(device2_machine0.local_id, 0) self.assertEqual(device2_machine0.type, DeviceType.CPU) self.assertEqual(device2_machine0.model, "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GH") @@ -281,15 +293,16 @@ def test_cluster(self): # device2, link0 link0_machine0 = machine0.links[(2, 3)] - self.assertEqual(link0_machine0.source.id, 2) - self.assertEqual(link0_machine0.target.id, 3) + self.assertEqual(link0_machine0.source.global_id, 2) + self.assertEqual(link0_machine0.target.global_id, 3) self.assertEqual(link0_machine0.type, LinkType.NET) self.assertAlmostEqual(link0_machine0.bandwidth, 1) self.assertAlmostEqual(link0_machine0.latency, 0) # device3 device3_machine0 = machine0.devices[3] - self.assertEqual(device3_machine0.id, 3) + self.assertEqual(device3_machine0.global_id, 3) + self.assertEqual(device3_machine0.local_id, 0) self.assertEqual(device3_machine0.type, DeviceType.NIC) self.assertAlmostEqual(device3_machine0.model, None) self.assertAlmostEqual(device3_machine0.sp_gflops, 0) @@ -298,24 +311,25 @@ def test_cluster(self): link0_machine0 = machine0.links[(3, 7)] # device3, link0 - self.assertEqual(link0_machine0.source.id, 3) - self.assertEqual(link0_machine0.target.id, 7) + self.assertEqual(link0_machine0.source.global_id, 3) + self.assertEqual(link0_machine0.target.global_id, 7) self.assertEqual(link0_machine0.type, LinkType.NET) self.assertAlmostEqual(link0_machine0.bandwidth, 1) self.assertAlmostEqual(link0_machine0.latency, 0) # machine1 machine1 = cluster.machines[1] - self.assertEqual(len(machine1.devices), 4) + self.assertEqual(machine1.id, 1) self.assertEqual(machine1.hostname, "machine1") 
self.assertEqual(machine1.addr, "0.0.0.2") self.assertEqual(machine1.port, "768") - self.assertEqual(len(machine0.devices), 4) - self.assertEqual(len(machine0.links), 7) + self.assertEqual(len(machine1.devices), 4) + self.assertEqual(len(machine1.links), 7) # device4 device4_machine1 = machine1.devices[4] - self.assertEqual(device4_machine1.id, 4) + self.assertEqual(device4_machine1.global_id, 4) + self.assertEqual(device4_machine1.local_id, 0) self.assertEqual(device4_machine1.type, DeviceType.GPU) self.assertEqual(device4_machine1.model, "Tesla V100-SXM2-32GB") self.assertAlmostEqual(device4_machine1.sp_gflops, 15700) @@ -324,31 +338,32 @@ def test_cluster(self): # device4, link0 link0_machine1 = machine1.links[(4, 5)] - self.assertEqual(link0_machine1.source.id, 4) - self.assertEqual(link0_machine1.target.id, 5) + self.assertEqual(link0_machine1.source.global_id, 4) + self.assertEqual(link0_machine1.target.global_id, 5) self.assertEqual(link0_machine1.type, LinkType.NVL) self.assertAlmostEqual(link0_machine1.bandwidth, 42) self.assertAlmostEqual(link0_machine1.latency, 0) # device 4, link 1 link1_machine1 = machine1.links[(4, 6)] - self.assertEqual(link1_machine1.source.id, 4) - self.assertEqual(link1_machine1.target.id, 6) + self.assertEqual(link1_machine1.source.global_id, 4) + self.assertEqual(link1_machine1.target.global_id, 6) self.assertEqual(link1_machine1.type, LinkType.PHB) self.assertAlmostEqual(link1_machine1.bandwidth, 12) self.assertAlmostEqual(link1_machine1.latency, 0) # device4, link2 link2_machine1 = machine1.links[(4, 7)] - self.assertEqual(link2_machine1.source.id, 4) - self.assertEqual(link2_machine1.target.id, 7) + self.assertEqual(link2_machine1.source.global_id, 4) + self.assertEqual(link2_machine1.target.global_id, 7) self.assertEqual(link2_machine1.type, LinkType.NET) self.assertAlmostEqual(link2_machine1.bandwidth, 1) self.assertAlmostEqual(link2_machine1.latency, 0) # device5 device5_machine1 = machine1.devices[5] - self.assertEqual(device5_machine1.id, 5) + self.assertEqual(device5_machine1.global_id, 5) + self.assertEqual(device5_machine1.local_id, 1) self.assertEqual(device5_machine1.type, DeviceType.GPU) self.assertEqual(device4_machine1.model, "Tesla V100-SXM2-32GB") self.assertAlmostEqual(device4_machine1.sp_gflops, 15700) @@ -357,23 +372,24 @@ def test_cluster(self): # device5, link0 link0_machine1 = machine1.links[(5, 6)] - self.assertEqual(link0_machine1.source.id, 5) - self.assertEqual(link0_machine1.target.id, 6) + self.assertEqual(link0_machine1.source.global_id, 5) + self.assertEqual(link0_machine1.target.global_id, 6) self.assertEqual(link0_machine1.type, LinkType.PHB) self.assertAlmostEqual(link0_machine1.bandwidth, 12) self.assertAlmostEqual(link0_machine1.latency, 0) # device5, link1 link1_machine1 = machine1.links[(5, 7)] - self.assertEqual(link1_machine1.source.id, 5) - self.assertEqual(link1_machine1.target.id, 7) + self.assertEqual(link1_machine1.source.global_id, 5) + self.assertEqual(link1_machine1.target.global_id, 7) self.assertEqual(link1_machine1.type, LinkType.NET) self.assertAlmostEqual(link1_machine1.bandwidth, 1) self.assertAlmostEqual(link1_machine1.latency, 0) # device6 device6_machine1 = machine1.devices[6] - self.assertEqual(device6_machine1.id, 6) + self.assertEqual(device6_machine1.global_id, 6) + self.assertEqual(device6_machine1.local_id, 0) self.assertEqual(device6_machine1.type, DeviceType.CPU) self.assertEqual(device6_machine1.model, "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G") @@ -383,15 +399,16 @@ def 
test_cluster(self): # device6, link0 link0_machine1 = machine1.links[(6, 7)] - self.assertEqual(link0_machine1.source.id, 6) - self.assertEqual(link0_machine1.target.id, 7) + self.assertEqual(link0_machine1.source.global_id, 6) + self.assertEqual(link0_machine1.target.global_id, 7) self.assertEqual(link0_machine1.type, LinkType.NET) self.assertAlmostEqual(link0_machine1.bandwidth, 1) self.assertAlmostEqual(link0_machine1.latency, 0) # device7 device7_machine1 = machine1.devices[7] - self.assertEqual(device7_machine1.id, 7) + self.assertEqual(device7_machine1.global_id, 7) + self.assertEqual(device7_machine1.local_id, 0) self.assertEqual(device7_machine1.type, DeviceType.NIC) self.assertAlmostEqual(device7_machine1.model, None) self.assertAlmostEqual(device7_machine1.sp_gflops, 0) @@ -400,8 +417,8 @@ def test_cluster(self): # device3, link0 link0_machine1 = machine1.links[(7, 3)] - self.assertEqual(link0_machine1.source.id, 7) - self.assertEqual(link0_machine1.target.id, 3) + self.assertEqual(link0_machine1.source.global_id, 7) + self.assertEqual(link0_machine1.target.global_id, 3) self.assertEqual(link0_machine1.type, LinkType.NET) self.assertAlmostEqual(link0_machine1.bandwidth, 1) self.assertAlmostEqual(link0_machine1.latency, 0) From 0ccb24294868abc960d451287ff29cfc434d581d Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Sun, 14 Nov 2021 12:18:20 +0000 Subject: [PATCH 10/30] Update the rank mapping format --- .../distributed/auto_parallel/cluster.py | 2 +- .../distributed/auto_parallel/mapper.py | 22 +- .../unittests/test_auto_parallel_mapper.py | 248 ++++++++++++------ 3 files changed, 188 insertions(+), 84 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/cluster.py index 23ea90071a56d..75ab8540e2d42 100644 --- a/python/paddle/distributed/auto_parallel/cluster.py +++ b/python/paddle/distributed/auto_parallel/cluster.py @@ -35,8 +35,8 @@ class LinkType(IntEnum): LOC = 1 SYS = 2 PHB = 3 + PXB = 5 PIX = 4 - PIB = 5 NVL = 6 NVB = 7 NET = 8 diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/mapper.py index ccf288410b39b..4c514b2f1d070 100644 --- a/python/paddle/distributed/auto_parallel/mapper.py +++ b/python/paddle/distributed/auto_parallel/mapper.py @@ -176,9 +176,10 @@ def build_cluster_graph(cluster): graph = Graph() for machine in cluster.machines.values(): for device in machine.devices.values(): - graph.add_node(device.id, device=device) + graph.add_node(device.global_id, device=device) for link in machine.links.values(): - graph.add_edge(link.source.id, link.target.id, link=link) + graph.add_edge( + link.source.global_id, link.target.global_id, link=link) return graph @@ -272,16 +273,21 @@ def select_unvisited_rank_node(rank_node_list): rank_mapping[machine.id]["hostname"] = machine.hostname rank_mapping[machine.id]["addr"] = machine.addr rank_mapping[machine.id]["port"] = machine.port - rank_mapping[machine.id]["ranks"].append(rank) + if rank not in rank_mapping[machine.id]["ranks"]: + rank_mapping[machine.id]["ranks"][rank] = [] + rank_mapping[machine.id]["ranks"][rank].append(device.local_id) + else: + rank_mapping[machine.id]["ranks"][rank].append(device.local_id) else: rank_mapping[machine.id] = {} - rank_mapping[machine.id]["ranks"] = [] rank_mapping[machine.id]["hostname"] = machine.hostname rank_mapping[machine.id]["addr"] = machine.addr rank_mapping[machine.id]["port"] = machine.port - if rank not in rank_mapping[machine.id]["ranks"]: - 
rank_mapping[machine.id]["ranks"].append(rank) - for mapping in rank_mapping.values(): - mapping["ranks"].sort() + rank_mapping[machine.id]["ranks"] = {} + rank_mapping[machine.id]["ranks"][rank] = [] + rank_mapping[machine.id]["ranks"][rank].append(device.local_id) + for machine_mapping in rank_mapping.values(): + for rank_devices in machine_mapping["ranks"].values(): + rank_devices.sort() return rank_mapping diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index adb629f83c83c..2bd05f1ba221a 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -62,7 +62,8 @@ "port": "768", "devices": [ { - "id": 0, + "global_id": 0, + "local_id": 0, "type": "GPU", "model": "A100-SXM4-40GB", "sp_gflops": 19500, @@ -70,7 +71,8 @@ "memory": 40 }, { - "id": 1, + "global_id": 1, + "local_id": 1, "type": "GPU", "model": "A100-SXM4-40GB", "sp_gflops": 19500, @@ -78,7 +80,8 @@ "memory": 40 }, { - "id": 2, + "global_id": 2, + "local_id": 2, "type": "GPU", "model": "A100-SXM4-40GB", "sp_gflops": 19500, @@ -86,92 +89,122 @@ "memory": 40 }, { - "id": 3, + "global_id": 3, + "local_id": 3, "type": "GPU", "model": "A100-SXM4-40GB", "sp_gflops": 19500, "dp_gflops": 9700, "memory": 40 + }, + { + "global_id": 4, + "local_id": 0, + "type": "NIC" } ], "links": [ { - "source_id": 0, - "target_id": 1, + "source_global_id": 0, + "target_global_id": 1, "type": "NVL", "bandwidth": 42 }, { - "source_id": 0, - "target_id": 2, + "source_global_id": 0, + "target_global_id": 2, "type": "NVL", "bandwidth": 42 }, { - "source_id": 0, - "target_id": 3, + "source_global_id": 0, + "target_global_id": 3, "type": "NVL", "bandwidth": 42 }, { - "source_id": 1, - "target_id": 0, + "source_global_id": 0, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 1, + "target_global_id": 0, "type": "NVL", "bandwidth": 42 }, { - "source_id": 1, - "target_id": 2, + "source_global_id": 1, + "target_global_id": 2, "type": "NVL", "bandwidth": 42 }, { - "source_id": 1, - "target_id": 3, + "source_global_id": 1, + "target_global_id": 3, "type": "NVL", "bandwidth": 42 }, { - "source_id": 2, - "target_id": 0, + "source_global_id": 1, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 2, + "target_global_id": 0, "type": "NVL", "bandwidth": 42 }, { - "source_id": 2, - "target_id": 1, + "source_global_id": 2, + "target_global_id": 1, "type": "NVL", "bandwidth": 42 }, { - "source_id": 2, - "target_id": 3, + "source_global_id": 2, + "target_global_id": 3, "type": "NVL", "bandwidth": 42 }, { - "source_id": 3, - "target_id": 0, + "source_global_id": 2, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 3, + "target_global_id": 0, "type": "NVL", "bandwidth": 42 }, { - "source_id": 3, - "target_id": 1, + "source_global_id": 3, + "target_global_id": 1, "type": "NVL", "bandwidth": 42 }, { - "source_id": 3, - "target_id": 2, + "source_global_id": 3, + "target_global_id": 2, "type": "NVL", "bandwidth": 42 }, { - "source_id": 3, - "target_id": 7, - "type": "NVL", - "bandwidth": 10 + "source_global_id": 3, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 4, + "target_global_id": 9, + "type": "NET", + "bandwidth": 1 } ] }, @@ -181,7 +214,8 @@ "port": "768", "devices": [ { - "id": 4, + "global_id": 5, + "local_id": 0, "type": 
"GPU", "model": "Tesla V100-SXM2-32GB", "sp_gflops": 15700, @@ -189,7 +223,8 @@ "memory": 32 }, { - "id": 5, + "global_id": 6, + "local_id": 1, "type": "GPU", "model": "Tesla V100-SXM2-32GB", "sp_gflops": 15700, @@ -197,7 +232,8 @@ "memory": 32 }, { - "id": 6, + "global_id": 7, + "local_id": 2, "type": "GPU", "model": "Tesla V100-SXM2-32GB", "sp_gflops": 15700, @@ -205,92 +241,122 @@ "memory": 32 }, { - "id": 7, + "global_id": 8, + "local_id": 3, "type": "GPU", "model": "Tesla V100-SXM2-32GB", "sp_gflops": 15700, "dp_gflops": 7800, "memory": 32 + }, + { + "global_id": 9, + "local_id": 0, + "type": "NIC" } ], "links": [ { - "source_id": 4, - "target_id": 5, + "source_global_id": 5, + "target_global_id": 6, "type": "NVL", "bandwidth": 42 }, { - "source_id": 4, - "target_id": 6, + "source_global_id": 5, + "target_global_id": 7, "type": "NVL", "bandwidth": 42 }, { - "source_id": 4, - "target_id": 7, + "source_global_id": 5, + "target_global_id": 8, "type": "NVL", "bandwidth": 42 }, { - "source_id": 5, - "target_id": 4, + "source_global_id": 5, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 6, + "target_global_id": 5, "type": "NVL", "bandwidth": 42 }, { - "source_id": 5, - "target_id": 6, + "source_global_id": 6, + "target_global_id": 7, "type": "NVL", "bandwidth": 42 }, { - "source_id": 5, - "target_id": 7, + "source_global_id": 6, + "target_global_id": 8, "type": "NVL", "bandwidth": 42 }, { - "source_id": 6, - "target_id": 4, + "source_global_id": 6, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 7, + "target_global_id": 5, "type": "NVL", "bandwidth": 42 }, { - "source_id": 6, - "target_id": 5, + "source_global_id": 7, + "target_global_id": 6, "type": "NVL", "bandwidth": 42 }, { - "source_id": 6, - "target_id": 7, + "source_global_id": 7, + "target_global_id": 8, "type": "NVL", "bandwidth": 42 }, { - "source_id": 7, - "target_id": 4, + "source_global_id": 7, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 8, + "target_global_id": 5, "type": "NVL", "bandwidth": 42 }, { - "source_id": 7, - "target_id": 5, + "source_global_id": 8, + "target_global_id": 6, "type": "NVL", "bandwidth": 42 }, { - "source_id": 7, - "target_id": 6, + "source_global_id": 8, + "target_global_id": 7, "type": "NVL", "bandwidth": 42 }, { - "source_id": 7, - "target_id": 3, - "type": "NVL", - "bandwidth": 10 + "source_global_id": 8, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 9, + "target_global_id": 4, + "type": "NET", + "bandwidth": 1 } ] } @@ -1177,9 +1243,18 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): return dist_main_prog, dist_startup_prog -def check_rank_mapping(rank_mapping): - for machine_id, mapping in rank_mapping.items(): - print("machine_id: ", rank, "mapping: ", mapping) +def is_in_machine(device_local_id, machine): + for device in machine.devices.values(): + if device_local_id == device.local_id: + return True + return False + + +def get_device_local_ids(machine): + local_ids = [] + for device in machine.devices.values(): + local_ids.append[device.local_id] + return local_ids class TestAutoParallelMapper(unittest.TestCase): @@ -1215,10 +1290,22 @@ def test_mapper_dp_mp(self): rank_mapping = mapping(dist_programs, cluster) - ranks = [] - for machine_id, cur_mapping in rank_mapping.items(): - ranks.extend(cur_mapping["ranks"]) - self.assertEqual(sorted(_global_process_mesh.processes), sorted(ranks)) + 
all_mapped_ranks = set() + for machine_id, machine_mapping in rank_mapping.items(): + machine = cluster.machines[machine_id] + machine_mapped_ranks = set() + machine_mapped_device_local_ids = set() + for rank, device_ids in machine_mapping["ranks"].items(): + # Only allow one process to one device mapping + self.assertEqual(len(device_ids), 1) + self.assertTrue(is_in_machine(device_ids[0], machine)) + machine_mapped_ranks.add(rank) + machine_mapped_device_local_ids.add(device_ids[0]) + self.assertEqual( + len(machine_mapped_ranks), len(machine_mapped_device_local_ids)) + all_mapped_ranks.update(machine_mapped_ranks) + print(_global_process_mesh.processes, all_mapped_ranks) + self.assertEqual(set(_global_process_mesh.processes), all_mapped_ranks) def test_mapper_dp_mp_pp(self): cluster_json_file = "" @@ -1250,10 +1337,21 @@ def test_mapper_dp_mp_pp(self): rank_mapping = mapping(dist_programs, cluster) - ranks = [] - for machine_id, cur_mapping in rank_mapping.items(): - ranks.extend(cur_mapping["ranks"]) - self.assertEqual(sorted(processes), sorted(ranks)) + all_mapped_ranks = set() + for machine_id, machine_mapping in rank_mapping.items(): + machine = cluster.machines[machine_id] + machine_mapped_ranks = set() + machine_mapped_device_local_ids = set() + for rank, device_ids in machine_mapping["ranks"].items(): + # Only allow one process to one device mapping + self.assertEqual(len(device_ids), 1) + self.assertTrue(is_in_machine(device_ids[0], machine)) + machine_mapped_ranks.add(rank) + machine_mapped_device_local_ids.add(device_ids[0]) + self.assertEqual( + len(machine_mapped_ranks), len(machine_mapped_device_local_ids)) + all_mapped_ranks.update(machine_mapped_ranks) + self.assertEqual(set(processes), all_mapped_ranks) if __name__ == '__main__': From 40608568ac9b9a323443ca4fb9e76d9183268165 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Thu, 18 Nov 2021 07:13:21 +0000 Subject: [PATCH 11/30] [Auto Parallel] Relaunch with the rank mapping file --- .../distributed/auto_parallel/mapper.py | 14 +- .../distributed/auto_parallel/parallelizer.py | 142 +++++++++---- .../auto_parallel/process_group.py | 12 +- python/paddle/distributed/fleet/launch.py | 56 +++-- .../paddle/distributed/fleet/launch_utils.py | 192 +++++++++++++++--- .../fluid/tests/unittests/CMakeLists.txt | 2 + .../unittests/auto_parallel/CMakeLists.txt | 12 ++ .../auto_parallel_launch_demo.py | 164 +++++++++++++++ .../auto_parallel_rank_mapping.json | 1 + .../tests/unittests/auto_parallel/launch.py | 22 ++ .../test_auto_parallel_launch.py | 109 ++++++++++ .../unittests/test_auto_parallel_mapper.py | 19 +- 12 files changed, 643 insertions(+), 102 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_launch_demo.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_rank_mapping.json create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/launch.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_launch.py diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/mapper.py index 4c514b2f1d070..cb3ab2e524ac9 100644 --- a/python/paddle/distributed/auto_parallel/mapper.py +++ b/python/paddle/distributed/auto_parallel/mapper.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License +import os import operator import 
functools import json @@ -174,9 +175,19 @@ def build_process_graph(distributed_program): def build_cluster_graph(cluster): graph = Graph() + cuda_visible_devices_env = os.getenv("CUDA_VISIBLE_DEVICES") + cuda_visible_devices = [] + if cuda_visible_devices_env is not None and cuda_visible_devices_env != "": + cuda_visible_devices = [ + int(d.strip()) for d in cuda_visible_devices_env.split(",") + ] for machine in cluster.machines.values(): for device in machine.devices.values(): graph.add_node(device.global_id, device=device) + if cuda_visible_devices and device.local_id not in cuda_visible_devices: + graph.nodes[device.global_id]["occupied"] = True + else: + graph.nodes[device.global_id]["occupied"] = False for link in machine.links.values(): graph.add_edge( link.source.global_id, link.target.global_id, link=link) @@ -194,9 +205,6 @@ def mapping(distributed_program, cluster): for cur_rank_node in process_graph: cur_rank_node["visited"] = False - for cur_device_node in cluster_graph: - cur_device_node["occupied"] = False - def sort_by_comm_volume(rank_edge): return rank_edge["comm_requirements"]["comm_volume"] diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 2c0a4e581c330..7e6863c532c90 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -12,6 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import sys +import json +import shlex +import copy +import pathlib +import subprocess import paddle from paddle.distributed.fleet import cloud_utils import paddle.fluid.core as core @@ -21,8 +28,11 @@ from .completion import complete_annotation, complete_backward_annotation from .partitioner import Partitioner from .process_group import get_all_process_groups +from .process_group import get_world_process_groups from .utils import make_data_unshard from .reshard import reshard +from .cluster import Cluster +from .mapper import mapping class AutoParallelizer: @@ -40,6 +50,18 @@ def __init__(self, fleet): self._optimizer = self._fleet.user_defined_optimizer self._dist_strategy = self._fleet._user_defined_strategy self._dist_context = DistributedContext() + self._cluster = None + self._cluster_topo_path = os.getenv("PADDLE_CLUSTER_TOPO_PATH", None) + if self._cluster_topo_path is not None: + self._cluster = Cluster() + self._cluster.build_from_file(self._cluster_topo_path) + # Prepare information for auto mapping + self._rank_mapping_path = os.getenv("PADDLE_RANK_MAPPING_PATH", None) + enable_auto_mapping_env = os.getenv("PADDLE_ENABLE_AUTO_MAPPING", None) + if enable_auto_mapping_env is None: + self._enable_auto_mapping = False + else: + self._enable_auto_mapping = True def _remove_distributed_attrs(self, main_program): suffix = core.kAutoParallelSuffix() @@ -51,47 +73,93 @@ def _remove_distributed_attrs(self, main_program): if suffix in attr_name: op._remove_attr(attr_name) + def _get_dist_program(self, dist_context, rank): + # Annotation completion + completed_main_program = complete_annotation(self._main_program, + dist_context) + # Logical partition + partitioner = Partitioner(self._dist_strategy, dist_context, rank) + dist_main_prog, dist_startup_prog = partitioner.transpile_forward( + completed_main_program, self._startup_program) + dist_params_grads = partitioner.apply_backward( + self._loss, completed_main_program, self._startup_program, + dist_main_prog, 
dist_startup_prog) + dist_optimize_ops = partitioner.apply_optimize( + copy.deepcopy(self._optimizer), dist_params_grads, dist_main_prog, + dist_startup_prog) + + make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) + + reshard(dist_main_prog, dist_startup_prog, rank, dist_context) + + return dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog + def parallelize(self, loss, startup_program, parameter_list=None, no_grad_set=None): assert startup_program is not None - main_program = loss.block.program + self._loss = loss + self._startup_program = startup_program + self._main_program = loss.block.program + self._parameter_list = parameter_list + self._no_grad_set = no_grad_set - # Annotation completion - completed_main_program = complete_annotation(main_program, - self._dist_context) - # Logical partition - rank = paddle.distributed.get_rank() - partitioner = Partitioner(self._dist_strategy, self._dist_context, rank) - partitioned_main_prog, partitioned_startup_prog = partitioner.transpile_forward( - completed_main_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, completed_main_program, startup_program, - partitioned_main_prog, partitioned_startup_prog) - dist_optimize_ops = partitioner.apply_optimize( - self._optimizer, dist_params_grads, partitioned_main_prog, - partitioned_startup_prog) - - # The last step: remove all distributed attributes to be compatiable - # with inference. - self._remove_distributed_attrs(partitioned_main_prog) - make_data_unshard(partitioned_main_prog, partitioned_startup_prog, - self._dist_context) - - reshard(partitioned_main_prog, partitioned_startup_prog, rank, - self._dist_context) - - # Traverse different rank programs and traverse each op of them, - # instantiate communication by process_mapping. - all_process_groups = get_all_process_groups() - for process_group in all_process_groups: - if rank not in process_group.ranks: - continue - process_group.instantiate() - - # Copy distributed info to the default context - set_default_distributed_context(self._dist_context) - - return dist_optimize_ops, dist_params_grads, partitioned_startup_prog, partitioned_main_prog + if self._enable_auto_mapping and self._rank_mapping_path is None: + # Do the mapping pass before parallelization + assert self._cluster is not None, \ + "The cluster must not be none when using auto mapping." 
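For reference, the auto-mapping branch below finishes by serializing the mapping result and re-invoking the launcher. A rough sketch of the rank mapping file it writes and of the second-stage command it spawns, with an illustrative single-machine mapping (hostname, port, and paths are placeholders rather than values produced by this code):

import json
import sys

# Illustrative mapping result: one entry per machine, mapping each process
# rank to the local ids of the devices assigned to it (one GPU per rank).
rank_mapping = [
    {"hostname": "machine1", "addr": "127.0.0.1", "port": "768",
     "ranks": {"0": [1], "1": [0]}},
]
with open("auto_parallel_rank_mapping.json", "w") as f:
    json.dump(rank_mapping, f)

# The relaunch prepends --rank_mapping_path to the original launcher
# arguments preserved in PADDLE_ORIGINAL_CMD_ARGS.
original_cmd_args = "--cluster_topo_path cluster.json --enable_auto_mapping True train.py"
new_cmd = [sys.executable, "-u", "-m", "paddle.distributed.fleet.launch",
           "--rank_mapping_path", "auto_parallel_rank_mapping.json"] + original_cmd_args.split()
print(new_cmd)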
+ dist_programs = {} + world_process_group = get_world_process_groups() + for rank in world_process_group.ranks: + dist_context = DistributedContext() + dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog = self._get_dist_program( + dist_context, rank) + dist_programs[rank] = dist_main_prog + + # Do the mapping between the distributed program graph and the cluster graph + rank_mapping_dict = mapping(dist_programs, self._cluster) + rank_mapping = list(rank_mapping_dict.values()) + + # Relaunch the training by using the rank mapping file + cwd = pathlib.Path().resolve() + rank_mapping_path = os.path.join(cwd, + "auto_parallel_rank_mapping.json") + with open(rank_mapping_path, "w") as rank_mapping_file: + json.dump(rank_mapping, rank_mapping_file) + original_cmd_args = os.getenv("PADDLE_ORIGINAL_CMD_ARGS") + rank_mapping_args = " ".join( + ["--rank_mapping_path", rank_mapping_path]) + new_cmd_args = "-u -m paddle.distributed.fleet.launch" + " " + rank_mapping_args + " " + original_cmd_args + new_cmd = [sys.executable] + shlex.split(new_cmd_args) + print(new_cmd) + new_process = subprocess.Popen(new_cmd) + new_process.wait() + assert new_process.returncode == 0, \ + "Launch failed with rank mapping" + print("Successfully do the second launch for auto mapping!") + sys.exit(0) + else: + # Parallelization after the mapping pass + rank = paddle.distributed.get_rank() + + dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog = self._get_dist_program( + self._dist_context, rank) + + # Traverse different rank programs and traverse each op of them, + # instantiate communication by process_mapping. + all_process_groups = get_all_process_groups() + for process_group in all_process_groups: + if rank not in process_group.ranks: + continue + process_group.instantiate() + + # Copy distributed info to the default context + set_default_distributed_context(self._dist_context) + + # The last step: remove all distributed attributes to be compatible + # with inference. + self._remove_distributed_attrs(dist_main_prog) + + return dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 29bb3edc94c85..4f3ed385cbf8e 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -19,10 +19,6 @@ from ...fluid.framework import in_dygraph_mode from ...fluid.layers.tensor import fill_constant -# Note that Process group 0 is reserved for representing all ranks. -# At the begining, group 0 is empty and new ranks will be added automatically. -_g_process_group_map = {} - def get_all_process_groups(): global _g_process_group_map @@ -34,6 +30,11 @@ def get_process_group(group_id): return _g_process_group_map.get(group_id, None) +def get_world_process_groups(): + global _g_process_group_map + return _g_process_group_map[0] + + def new_process_group(ranks): global _g_process_group_map # A key constructed from ranks is used for avoiding duplication @@ -151,4 +152,7 @@ def __str__(self): return string +# Note that Process group 0 is reserved for representing all ranks. +# At the begining, group 0 is empty and new ranks will be added automatically. 
+_g_process_group_map = {} _g_process_group_map[0] = ProcessGroup(0, []) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 87a43a8536d6b..04f5c828ded5e 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -65,6 +65,7 @@ import time import six import copy +import shlex import argparse from argparse import ArgumentParser, REMAINDER import paddle @@ -164,25 +165,17 @@ def _parse_args(): default="127.0.0.1", help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..") collective_group.add_argument( - "--rank_mapping_file", - type=argparse.FileType('r'), - default=sys.stdin, - help="This rank mapping information in json format is used specifically " - "for lazy launch for auto parallel. Some of the ranks in each node " - "may not be used, and the indices of rank should be kept the same " - "as the indices of sub-task splited by auto parallel. " - " { " - " \"ip_ranks\": [ " - " { " - " \"ip\": \"127.0.0.1\", " - " \"ranks\": [0,1] " - " }, " - " { " - " \"ip\": \"127.0.0.2\", " - " \"ranks\": [2,3,4] " - " } " - " ] " - " } ") + "--cluster_topo_path", + type=str, + default=None, + help="A json format file will be stored in this path which is used" + "to represent the cluster topology information for auto parallel.") + collective_group.add_argument( + "--rank_mapping_path", + type=str, + default=None, + help="A json format file will be stored in this path which is used" + "to map processes to machines for auto parallel.") collective_group.add_argument( "--enable_auto_mapping", type=bool, @@ -286,15 +279,34 @@ def launch_collective(args): logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}".format( trainers_num, device_mode, devices_per_proc)) + cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") + cluster = None pod = None start_port = 6170 if os.environ.get('FLAGS_START_PORT') is not None: start_port = os.environ.get('FLAGS_START_PORT') - # lazy launch for auto-parallel + # auto mapping between processes and devices for auto-parallel if args.enable_auto_mapping == True: - cluster, pod = get_mapped_cluster_from_args(args, device_mode) + assert args.cluster_topo_path is not None, \ + "The cluster topology must be provied when enabling auto mapping." + if args.rank_mapping_path is None: + # original_args = [shlex.quote(c) for c in sys.argv[1:]] + original_args = sys.argv[1:] + os.environ["PADDLE_ORIGINAL_CMD_ARGS"] = " ".join(original_args) + os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(args.cluster_topo_path) + os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = str( + args.enable_auto_mapping) + cluster, pod = launch_utils.get_mapped_cluster_from_args_without_rank_mapping( + args, device_mode) + else: + os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(args.cluster_topo_path) + os.environ["PADDLE_RANK_MAPPING_PATH"] = str(args.rank_mapping_path) + os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = str( + args.enable_auto_mapping) + cluster, pod = launch_utils.get_mapped_cluster_from_args_with_rank_mapping( + args, device_mode) else: # for ascend if device_mode == DeviceMode.ASCEND_NPU: @@ -442,7 +454,7 @@ def which_distributed_mode(args): if args.servers: logger.warning( "Not found distinct arguments and not compiled with cuda or xpu. 
\ -But found args.servers not empty, default use ps mode") + But found args.servers not empty, default use ps mode") return DistributeMode.PS else: return DistributeMode.COLLECTIVE diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 8439e61ac108d..8a9ba5165a00e 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -465,6 +465,18 @@ def __init__(self): self.cmd = None +_run_with_coverage = False + + +def run_with_coverage(*args): + global _run_with_coverage + assert len(args) <= 1, "len(args) {} should <= 1".format(len(args)) + if len(args) == 1: + assert isinstance(args[0], bool) + _run_with_coverage = args[0] + return _run_with_coverage + + def start_local_trainers(cluster, pod, training_script, @@ -499,6 +511,17 @@ def start_local_trainers(cluster, "PADDLE_WORLD_DEVICE_IDS": ",".join(res), } + # The following three environnement variables are used for auto mapping + if current_env.get("PADDLE_CLUSTER_TOPO_PATH", None) is not None: + proc_env["PADDLE_CLUSTER_TOPO_PATH"] = current_env[ + "PADDLE_CLUSTER_TOPO_PATH"] + if current_env.get("PADDLE_RANK_MAPPING_PATH", None) is not None: + proc_env["PADDLE_RANK_MAPPING_PATH"] = current_env[ + "PADDLE_RANK_MAPPING_PATH"] + if current_env.get("PADDLE_ENABLE_AUTO_MAPPING", None) is not None: + proc_env["PADDLE_ENABLE_AUTO_MAPPING"] = current_env[ + "PADDLE_ENABLE_AUTO_MAPPING"] + if len(t.accelerators) > 0 and pod.device_mode == DeviceMode.GPU: proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( [str(g) for g in t.accelerators]) @@ -518,7 +541,11 @@ def start_local_trainers(cluster, current_env.update(proc_env) - cmd = [sys.executable, "-u", training_script] + training_script_args + coverage_args = [] + if run_with_coverage(): + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + cmd = [sys.executable, "-u"] + coverage_args + [training_script + ] + training_script_args logger.debug("start trainer proc{} env:{}".format(cmd, current_env)) @@ -541,7 +568,11 @@ def start_local_trainers(cluster, with open("%s/endpoints.log" % log_dir, "w") as f: f.write("PADDLE_TRAINER_ENDPOINTS: \n") f.write("\n".join(cluster.trainers_endpoints())) - fn = open("%s/workerlog.%d" % (log_dir, idx), "a") + if current_env.get("PADDLE_ENABLE_AUTO_MAPPING") is not None \ + and current_env.get("PADDLE_RANK_MAPPING_PATH", None) is None: + fn = open("%s/prelaunchlog.%d" % (log_dir, idx), "a") + else: + fn = open("%s/workerlog.%d" % (log_dir, idx), "a") proc = subprocess.Popen( cmd, env=current_env, stdout=fn, stderr=fn, preexec_fn=pre_fn) else: @@ -697,6 +728,9 @@ def get_device_mode(backend): def get_device_proc_info(args): + if args.enable_auto_mapping: + return (DeviceMode.GPU, []) + # device_mode device_mode = get_device_mode(args.backend) @@ -808,8 +842,8 @@ def cloud_ps_heter_env_set(args): pretty_print_envs(environs))) -def get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode, - node_mapping_ranks): +def get_mapped_cluster_without_rank_mapping( + node_ips, node_ip, trainer_endpoints, device_mode, node_ranks): assert type(trainer_endpoints) is list, "trainer_endpoints must be list" assert device_mode == DeviceMode.GPU, \ "Only support get mapped cluster for gpu now." @@ -822,17 +856,114 @@ def get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode, cur_node_endpoints = trainer_endpoints[node_rank] # choose rank from global mapped ranks and set it to the trainer. 
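The length-one assertion just below holds because of how the pre-launch stage derives its rank lists: during the first (auto-mapping) launch every machine in the cluster topology gets exactly one process, whose rank equals the machine index. A minimal sketch of that derivation with an illustrative two-machine topology:

# Stand-in for the file passed via --cluster_topo_path.
cluster_topo = {
    "machines": [
        {"hostname": "machine0", "addr": "127.0.0.1"},
        {"hostname": "machine1", "addr": "127.0.0.2"},
    ]
}
node_ips = []
node_ranks = []
for idx, machine in enumerate(cluster_topo["machines"]):
    node_ips.append(machine["addr"])
    node_ranks.append([idx])
assert node_ips == ["127.0.0.1", "127.0.0.2"]
assert node_ranks == [[0], [1]]  # exactly one pre-launch rank per machine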
- ranks_per_node = node_mapping_ranks[node_rank] + ranks_per_node = node_ranks[node_rank] + assert len(ranks_per_node) == 1 for i in range(len(ranks_per_node)): trainer = Trainer() - # change global rank(mapped) to local rank within each node. - # e.g. mapped ranks of node: 3,4,7 -> 0,1,2 - local_rank = ranks_per_node.index(ranks_per_node[i]) - trainer.accelerators.append(local_rank) trainer.endpoint = "%s" % (cur_node_endpoints[i]) - # global mapped ranks trainer.rank = ranks_per_node[i] + pod.trainers.append(trainer) + cluster.pods.append(pod) + + pod_rank = node_ips.index(node_ip) + return cluster, cluster.pods[pod_rank] + + +def get_mapped_cluster_from_args_without_rank_mapping(args, device_mode): + assert device_mode == DeviceMode.GPU, \ + "Only support get mapped cluster for gpu now." + gpus_num = fluid.core.get_cuda_device_count() + # parse ip-ranks json file + cluster_topo = None + with open(args.cluster_topo_path, "r") as json_file: + cluster_topo = json.load(json_file) + + node_ips = [] + node_ranks = [] + for idx, cur_cluster_topo in enumerate(cluster_topo["machines"]): + node_ips.append(cur_cluster_topo['addr']) + node_ranks.append([idx]) + + if len(node_ips) == 1: + node_ip = node_ips[0] + else: + if args.host: + node_ip = args.host + else: + _, node_ip = get_host_name_ip() + + assert node_ip in node_ips, \ + "Can't find your local ip {%s} in node_ips: {%s}" % (node_ip, node_ips) + node_rank = node_ips.index(node_ip) + + assert len(node_ranks) == len(node_ips), \ + "ranks length should be equal to ips length." + + logger.debug("parsed from args: node_ips:{} node_ip:{} " + "node_rank:{} node_ranks:{}".format( + node_ips, node_ip, node_rank, node_ranks[node_rank])) + + # NOTE: there are different number of global mapped ranks on each node. + free_ports = [] + trainer_endpoints = [] + for ip in node_ips: + node_rank = node_ips.index(ip) + if os.environ.get('FLAGS_START_PORT') is not None: + start_port = int(os.environ.get('FLAGS_START_PORT')) + free_ports = [ + x + for x in range(start_port, start_port + len(node_ranks[ + node_rank])) + ] + else: + free_ports = find_free_ports(len(node_ranks[node_rank])) + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + + return get_mapped_cluster_without_rank_mapping( + node_ips, node_ip, trainer_endpoints, device_mode, node_ranks) + + +def get_mapped_cluster_with_rank_mapping(node_ips, node_ip, trainer_endpoints, + device_mode, node_ranks, + node_rank_mappings): + assert type(trainer_endpoints) is list, "trainer_endpoints must be list" + assert device_mode == DeviceMode.GPU, \ + "Only support get mapped cluster for gpu now." + + def get_relative_gpu_id(gpu_id): + cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_visible_devices is None or cuda_visible_devices == "": + return gpu_id + else: + cuda_visible_devices_list = cuda_visible_devices.split(',') + relative_id = cuda_visible_devices_list.index(str(gpu_id)) + logger.info( + "Change gpu id from {} to {} based on CUDA_VISIBLE_DEVICES {}". + format(gpu_id, relative_id, cuda_visible_devices_list)) + return relative_id + + cluster = Cluster(hdfs=None) + for node_rank, ip in enumerate(node_ips): + pod = Pod() + pod.rank = node_rank + pod.addr = ip + pod.device_mode = device_mode + cur_node_endpoints = trainer_endpoints[node_rank] + + # choose rank from global mapped ranks and set it to the trainer. 
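For reference, the get_relative_gpu_id helper defined above maps an absolute GPU id to its index inside CUDA_VISIBLE_DEVICES, so the selected accelerator stays valid when only a subset of devices is visible. A self-contained sketch of the same translation (the environment value is illustrative):

def relative_gpu_id(gpu_id, cuda_visible_devices):
    # Same idea as get_relative_gpu_id above: fall back to the absolute id
    # when CUDA_VISIBLE_DEVICES is unset, otherwise return its position.
    if cuda_visible_devices is None or cuda_visible_devices == "":
        return gpu_id
    return cuda_visible_devices.split(",").index(str(gpu_id))

assert relative_gpu_id(3, None) == 3
assert relative_gpu_id(3, "2,3") == 1  # GPU 3 is the second visible device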
+ ranks_per_node = node_ranks[node_rank] + cur_node_rank_mapping = node_rank_mappings[node_rank] + for i in range(len(ranks_per_node)): + trainer = Trainer() + local_device_ids = cur_node_rank_mapping["ranks"][str( + ranks_per_node[i])] + assert len(local_device_ids) == 1, \ + "Only support one process to one device mapping" + trainer.accelerators.append( + get_relative_gpu_id(local_device_ids[0])) + trainer.endpoint = "%s" % (cur_node_endpoints[i]) + trainer.rank = ranks_per_node[i] pod.trainers.append(trainer) cluster.pods.append(pod) @@ -840,22 +971,27 @@ def get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode, return cluster, cluster.pods[pod_rank] -def get_mapped_cluster_from_args(args, device_mode): +def get_mapped_cluster_from_args_with_rank_mapping(args, device_mode): assert device_mode == DeviceMode.GPU, \ "Only support get mapped cluster for gpu now." gpus_num = fluid.core.get_cuda_device_count() # parse ip-ranks json file - json_data = None - with args.rank_mapping_file as json_file: - json_data = json.load(json_file) + rank_mapping = None + with open(args.rank_mapping_path, "r") as json_file: + rank_mapping = json.load(json_file) node_ips = [] - node_ranks_mapping = [] - ip_ranks_list = json_data['ip_ranks'] - for ip_ranks in ip_ranks_list: - node_ips.append(ip_ranks['ip']) - node_ranks_mapping.append(ip_ranks['ranks']) + node_ranks = [] + node_rank_mappings = [] + for cur_rank_mapping in rank_mapping: + node_ips.append(cur_rank_mapping['addr']) + cur_node_rank_list = [ + int(i) for i in list(cur_rank_mapping['ranks'].keys()) + ] + cur_node_rank_list.sort() + node_ranks.append(cur_node_rank_list) + node_rank_mappings.append(cur_rank_mapping) if len(node_ips) == 1: node_ip = node_ips[0] @@ -869,15 +1005,14 @@ def get_mapped_cluster_from_args(args, device_mode): "Can't find your local ip {%s} in node_ips: {%s}" % (node_ip, node_ips) node_rank = node_ips.index(node_ip) - assert len(node_ranks_mapping[node_rank]) <= gpus_num, \ + assert len(node_ranks[node_rank]) <= gpus_num, \ "number of ranks mapped to one node should not exceed the avaiable ones." - assert len(node_ranks_mapping) == len(node_ips), \ + assert len(node_ranks) == len(node_ips), \ "ranks length should be equal to ips length." logger.debug("parsed from args: node_ips:{} node_ip:{} " - "node_rank:{} node_ranks_mapping:{}".format( - node_ips, node_ip, node_rank, node_ranks_mapping[ - node_rank])) + "node_rank:{} node_ranks:{}".format( + node_ips, node_ip, node_rank, node_ranks[node_rank])) # NOTE: there are different number of global mapped ranks on each node. 
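To make the rank mapping parsing above concrete: each entry of the file contributes one machine, its sorted integer rank keys become that node's rank list, and each rank's device list later becomes the trainer's accelerator. A small worked example using the same shape as the file written by the parallelizer (values are illustrative):

rank_mapping = [
    {"hostname": "machine1", "addr": "127.0.0.1", "port": "768",
     "ranks": {"0": [1], "1": [0]}},
]
node_ips, node_ranks, node_rank_mappings = [], [], []
for cur in rank_mapping:
    node_ips.append(cur["addr"])
    node_ranks.append(sorted(int(r) for r in cur["ranks"].keys()))
    node_rank_mappings.append(cur)
assert node_ips == ["127.0.0.1"]
assert node_ranks == [[0, 1]]
# Rank 0 runs on local device 1 and rank 1 on local device 0 of this machine.
assert node_rank_mappings[0]["ranks"]["0"] == [1]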
free_ports = [] @@ -888,15 +1023,16 @@ def get_mapped_cluster_from_args(args, device_mode): start_port = int(os.environ.get('FLAGS_START_PORT')) free_ports = [ x - for x in range(start_port, start_port + len(node_ranks_mapping[ + for x in range(start_port, start_port + len(node_ranks[ node_rank])) ] else: - free_ports = find_free_ports(len(node_ranks_mapping[node_rank])) + free_ports = find_free_ports(len(node_ranks[node_rank])) trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) - return get_mapped_cluster(node_ips, node_ip, trainer_endpoints, device_mode, - node_ranks_mapping) + return get_mapped_cluster_with_rank_mapping(node_ips, node_ip, + trainer_endpoints, device_mode, + node_ranks, node_rank_mappings) class ParameterServerLauncher(object): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 30633050cc196..32c8054501a15 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -581,6 +581,8 @@ set_tests_properties(test_conv_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") if(WITH_DISTRIBUTE) + add_subdirectory(auto_parallel) + # FIXME(typhoonzero): add these tests back list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler") diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt new file mode 100644 index 0000000000000..f666e4c261076 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -0,0 +1,12 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +if(NOT WITH_NCCL) + list(REMOVE_ITEM TEST_OPS test_auto_parallel_launch) + list(APPEND DIST_TEST_OPS test_auto_parallel_launch) +endif() + +# foreach(TEST_OP ${TEST_OPS}) +# py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +# set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 120) +# endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_launch_demo.py b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_launch_demo.py new file mode 100644 index 0000000000000..77d5704579f93 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_launch_demo.py @@ -0,0 +1,164 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
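The demo below is meant to be started through fleet.launch with --cluster_topo_path and --enable_auto_mapping, so that the AutoParallelizer inside the trainer process can pick the topology up again from its environment. A rough sketch of that environment handshake, with illustrative paths (the real values are set by the launcher, not by user code):

import os

# Set for the first launch, before any rank mapping exists.
os.environ["PADDLE_CLUSTER_TOPO_PATH"] = "/tmp/auto_parallel_cluster.json"
os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = "True"
# Set only for the second launch, after the mapping pass wrote the file:
# os.environ["PADDLE_RANK_MAPPING_PATH"] = "/tmp/auto_parallel_rank_mapping.json"

enable_auto_mapping = os.getenv("PADDLE_ENABLE_AUTO_MAPPING") is not None
has_rank_mapping = os.getenv("PADDLE_RANK_MAPPING_PATH") is not None
if enable_auto_mapping and not has_rank_mapping:
    print("first launch: compute the mapping, dump it, relaunch")
else:
    print("second launch: parallelize for the local rank as usual")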
+ +import unittest +import time +import paddle.fluid as fluid +import copy +import os +import numpy as np +import subprocess +import paddle +import paddle.nn as nn +import paddle.fluid as fluid +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +from paddle.fluid import layers +from paddle.io import IterableDataset, DataLoader +from paddle.distributed import fleet +import paddle.distributed.auto_parallel as auto + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +batch_size = 4 +hidden_size = 1024 +sequence_len = 512 + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + out = self.dropout(out) + out = self.linear2(out) + + return out + + +def mlp_pretrain_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _global_process_mesh, + "dims_mappig": [-1, -1, -1] + }) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + loader = paddle.io.DataLoader.from_generator( + feed_list=[input, label], capacity=4 * batch_size, iterable=True) + + return loss, train_program, start_program, loader + + +def train(): + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = False + dist_strategy.pipeline = False + dist_strategy.recompute = False + # init parallel optimizer + dist_strategy.semi_auto = True + + fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + loss, train_program, start_program, loader = mlp_pretrain_forward( + train_program, start_program) + + optimizer = paddle.fluid.optimizer.AdamOptimizer( + 
learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + optimizer = fleet.distributed_optimizer(optimizer) + _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( + loss, start_program) + + places = static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + exe = paddle.static.Executor(places[0]) + exe.run(distributed_startup_program) + + for data in loader(): + loss_print = exe.run(distributed_main_program, + feed=data, + fetch_list=[loss]) + + +if __name__ == "__main__": + train() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_rank_mapping.json b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_rank_mapping.json new file mode 100644 index 0000000000000..ff1f62ec1c8c9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_rank_mapping.json @@ -0,0 +1 @@ +[{"hostname": "machine1", "addr": "127.0.0.1", "port": "768", "ranks": {"0": [1], "1": [0]}}] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/launch.py b/python/paddle/fluid/tests/unittests/auto_parallel/launch.py new file mode 100644 index 0000000000000..c225fe85cd844 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/launch.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from paddle.distributed.fleet import launch +from paddle.distributed.fleet.launch_utils import run_with_coverage + +if __name__ == "__main__": + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + run_with_coverage(True) + launch.launch() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_launch.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_launch.py new file mode 100644 index 0000000000000..8cf548a0d1268 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_launch.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
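The cluster_json document below is the kind of topology file --cluster_topo_path points at. Once dumped to disk it can be loaded back through the Cluster API; a minimal sketch, assuming the JSON has already been written to auto_parallel_cluster.json:

from paddle.distributed.auto_parallel.cluster import Cluster

cluster = Cluster()
cluster.build_from_file("auto_parallel_cluster.json")  # path is illustrative
for machine in cluster.machines.values():
    for device in machine.devices.values():
        # Global ids are unique across the whole cluster; local ids are per machine.
        print(machine.hostname, device.global_id, device.local_id, device.type)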
+ +import unittest +import os +import sys +import json +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + +cluster_json = """ +{ + "machines": [ + { + "hostname": "machine1", + "addr": "127.0.0.1", + "port": "768", + "devices": [ + { + "global_id": 0, + "local_id": 0, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "global_id": 1, + "local_id": 1, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "global_id": 2, + "local_id": 0, + "type": "CPU", + "model": "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G", + "arch": "x86_64", + "vendor": "GenuineIntel", + "sp_gflops": 150, + "dp_gflops": 75, + "memory": "503" + } + ], + "links": [ + { + "source_global_id": 0, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 1, + "target_global_id": 0, + "type": "PHB", + "bandwidth": 12 + } + ] + } + ] +} +""" + + +class TestAutoParallelLaunch(unittest.TestCase): + def test_launch(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + + launch_demo_path = os.path.join(file_dir, + "auto_parallel_launch_demo.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + run_with_coverage(True) + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "launch", "--cluster_topo_path", cluster_json_path, + "--enable_auto_mapping", "True", launch_demo_path + ] + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + os.remove(cluster_json_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 2bd05f1ba221a..5b8a873674aad 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -1259,21 +1259,24 @@ def get_device_local_ids(machine): class TestAutoParallelMapper(unittest.TestCase): def test_mapper_dp_mp(self): - cluster_json_file = "" - cluster_json_object = json.loads(cluster_json) - with open("./auto_parallel_cluster.json", "w") as cluster_json_file: - json.dump(cluster_json_object, cluster_json_file) + # cluster_json_file = "" + # cluster_json_object = json.loads(cluster_json) + # with open("./auto_parallel_cluster.json", "w") as cluster_json_file: + # json.dump(cluster_json_object, cluster_json_file) cluster = Cluster() - cluster.build_from_file("./auto_parallel_cluster.json") - os.remove("./auto_parallel_cluster.json") + cluster.build_from_file( + "/home/aoyulong/workspace/auto_parallel_cluster.json") + # cluster.build_from_file("./auto_parallel_cluster.json") + # os.remove("./auto_parallel_cluster.json") global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_num_stages _global_num_stages = 1 global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1], [2, 3]]) + # _global_process_mesh = auto.ProcessMesh( + # mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) 
dist_programs = {} for rank_id in _global_process_mesh.processes: From a0127f1cf3c43d298c8e9512ab898d2f20a03a2f Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Thu, 18 Nov 2021 07:24:27 +0000 Subject: [PATCH 12/30] Remove the unnecessary json file --- .../unittests/auto_parallel/auto_parallel_rank_mapping.json | 1 - 1 file changed, 1 deletion(-) delete mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_rank_mapping.json diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_rank_mapping.json b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_rank_mapping.json deleted file mode 100644 index ff1f62ec1c8c9..0000000000000 --- a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_rank_mapping.json +++ /dev/null @@ -1 +0,0 @@ -[{"hostname": "machine1", "addr": "127.0.0.1", "port": "768", "ranks": {"0": [1], "1": [0]}}] From 48936b84a76f3b1fc5fac7ac47b9ca153f7d7e67 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Thu, 18 Nov 2021 08:51:38 +0000 Subject: [PATCH 13/30] Avoid entering get_device_proc_info for auto mapping --- python/paddle/distributed/fleet/launch.py | 6 +++++- python/paddle/distributed/fleet/launch_utils.py | 3 --- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 895963228259e..978db2019b153 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -276,7 +276,11 @@ def cpuonly_check(args): def launch_collective(args): # parse arguments, used for cloud-single-machine and local if args.backend == 'gloo': cpuonly_check(args) - (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args) + if args.enable_auto_mapping: + (device_mode, devices_per_proc) = (DeviceMode.GPU, []) + else: + (device_mode, + devices_per_proc) = launch_utils.get_device_proc_info(args) trainers_num = cloud_utils.get_trainers_num() logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}".format( trainers_num, device_mode, devices_per_proc)) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 0ce638ae07c3a..20d56c4306b0f 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -728,9 +728,6 @@ def get_device_mode(backend): def get_device_proc_info(args): - if args.enable_auto_mapping: - return (DeviceMode.GPU, []) - # device_mode device_mode = get_device_mode(args.backend) From 9cd37a6fa656386da1d2b5635ae0ce52849fe536 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Fri, 19 Nov 2021 08:13:40 +0000 Subject: [PATCH 14/30] Correct the mapper unit test --- .../unittests/test_auto_parallel_mapper.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 5b8a873674aad..ae787129e98a6 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -1259,24 +1259,21 @@ def get_device_local_ids(machine): class TestAutoParallelMapper(unittest.TestCase): def test_mapper_dp_mp(self): - # cluster_json_file = "" - # cluster_json_object = json.loads(cluster_json) - # with open("./auto_parallel_cluster.json", "w") as cluster_json_file: - # json.dump(cluster_json_object, cluster_json_file) + cluster_json_file = "" 
+ cluster_json_object = json.loads(cluster_json) + with open("./auto_parallel_cluster.json", "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) cluster = Cluster() - cluster.build_from_file( - "/home/aoyulong/workspace/auto_parallel_cluster.json") - # cluster.build_from_file("./auto_parallel_cluster.json") - # os.remove("./auto_parallel_cluster.json") + cluster.build_from_file("./auto_parallel_cluster.json") + os.remove("./auto_parallel_cluster.json") global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_num_stages _global_num_stages = 1 global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1], [2, 3]]) - # _global_process_mesh = auto.ProcessMesh( - # mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh( + mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) dist_programs = {} for rank_id in _global_process_mesh.processes: @@ -1307,7 +1304,6 @@ def test_mapper_dp_mp(self): self.assertEqual( len(machine_mapped_ranks), len(machine_mapped_device_local_ids)) all_mapped_ranks.update(machine_mapped_ranks) - print(_global_process_mesh.processes, all_mapped_ranks) self.assertEqual(set(_global_process_mesh.processes), all_mapped_ranks) def test_mapper_dp_mp_pp(self): From 7349999be96fcf6b656dcd7e0aa7a0de424479eb Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Tue, 23 Nov 2021 12:02:03 +0000 Subject: [PATCH 15/30] Add some comments --- python/paddle/distributed/auto_parallel/cluster.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/cluster.py index 23ea90071a56d..d65612fc6e9fc 100644 --- a/python/paddle/distributed/auto_parallel/cluster.py +++ b/python/paddle/distributed/auto_parallel/cluster.py @@ -48,12 +48,14 @@ def __init__(self, global_id, local_id, machine): self._local_id = local_id self._machine = machine self._type = None - # different device have different models, such as + # Different device have different models, such as # "Tesla V100-SXM2-32GB" and "A100-SXM4-40GB" etc. self._model = None + # Double precision GFLOPS self._dp_gflops = None + # Single precision GFLOPS self._sp_gflops = None - # memory is stored by GB + # Memory is stored by GB self._memory = None @property From 11d41b41886433029b7ed2a0c58dce18cb206157 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Tue, 23 Nov 2021 12:19:23 +0000 Subject: [PATCH 16/30] Remove the related files about mapping --- .../distributed/auto_parallel/mapper.py | 293 ---- .../fluid/tests/unittests/CMakeLists.txt | 2 - .../unittests/test_auto_parallel_mapper.py | 1358 ----------------- 3 files changed, 1653 deletions(-) delete mode 100644 python/paddle/distributed/auto_parallel/mapper.py delete mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/mapper.py deleted file mode 100644 index 4c514b2f1d070..0000000000000 --- a/python/paddle/distributed/auto_parallel/mapper.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License - -import operator -import functools -import json -import paddle -from collections import deque -from .graph import Node -from .graph import Edge -from .graph import Graph -from .cluster import DeviceType -from .process_group import get_process_group - - -def is_collective_comm_op(op): - comm_list = [ - "c_allreduce_sum", "c_allreduce_min", "c_allreduce_max", - "c_allreduce_prod", "c_reduce_sum", "c_reduce_min", "c_reduce_max", - "c_reduce_prod", "c_broadcast", "c_allgather", "send_v2", "recv_v2" - ] - if op.type in comm_list: - return True - else: - return False - - -def is_p2p_comm_op(op): - comm_list = ["send_v2", "recv_v2"] - if op.type in comm_list: - return True - else: - return False - - -def get_dtype_bytes(dtype): - num_bytes = 0 - if dtype == paddle.float64: - num_bytes = 8 - elif dtype == paddle.float32: - num_bytes = 4 - elif dtype == paddle.float16: - num_bytes = 2 - elif dtype == paddle.bfloat16: - num_bytes = 2 - elif dtype == paddle.int64: - num_bytes = 8 - elif dtype == paddle.int32: - num_bytes = 4 - elif dtype == paddle.int16: - num_bytes = 2 - elif dtype == paddle.int8: - num_bytes = 1 - elif dtype == paddle.uint8: - num_bytes = 1 - else: - raise ValueError("Unrecognized dtype {}.".format(dtype)) - return num_bytes - - -def get_comm_volume(comm_op, src_rank, tgt_rank): - comm_volume = None - if src_rank == tgt_rank: - return comm_volume - comm_op_type = comm_op.type - if comm_op_type != "recv_v2": - tensor_name = comm_op.input_arg_names[0] - else: - tensor_name = comm_op.output_arg_names[0] - tensor = comm_op.block._find_var_recursive(tensor_name) - assert tensor is not None - tensor_shape = tensor.shape - # Skip the batch dim - new_tensor_shape = [] - for val in tensor_shape: - if val == -1: - print("Warning: -1 in the tensor shape.") - new_tensor_shape.append(1) - new_tensor_shape.append(val) - tensor_size = functools.reduce(operator.mul, new_tensor_shape, 1) - tensor_bytes = tensor_size * get_dtype_bytes(tensor.dtype) - if "c_allreduce" in comm_op_type: - comm_volume = 2 * tensor_bytes - elif "c_allgather" in comm_op_type: - comm_volume = tensor_bytes - elif "c_broadcast" in comm_op_type: - if comm_op.attr("root_id") == src_rank: - comm_volume = tensor_bytes - else: - comm_volume = None - elif "c_reduce" in comm_op_type: - if comm_op.attr("root_id") == src_rank: - comm_volume = None - else: - comm_volume = tensor_bytes - elif "send_v2" in comm_op_type: - if comm_op.attr("peer") == tgt_rank: - comm_volume = tensor_bytes - else: - comm_volume = None - elif "recv_v2" in comm_op_type: - comm_volume = None - else: - raise ValueError("Unrecognized communication operator.") - return comm_volume - - -def analyze_comm_requirements_from_op(op, rank): - comm_requirements_to_ranks = {} - if is_collective_comm_op(op): - process_group_id = op.attr("ring_id") - process_group = get_process_group(process_group_id) - if rank not in process_group.ranks: - return comm_requirements_to_ranks - for tgt_rank in process_group.ranks: - comm_volume = get_comm_volume(op, rank, tgt_rank) - if comm_volume is not None: - comm_requirements_to_ranks[tgt_rank] = {} - 
comm_requirements_to_ranks[tgt_rank][ - "comm_volume"] = comm_volume - elif is_p2p_comm_op(op): - tgt_rank = op.attr("peer") - comm_volume = get_comm_volume(op, rank, tgt_rank) - if comm_volume is not None: - comm_requirements_to_ranks[tgt_rank] = {} - comm_requirements_to_ranks[tgt_rank]["comm_volume"] = comm_volume - else: - comm_requirements_to_ranks = {} - return comm_requirements_to_ranks - - -def analyze_requirements_for_program(program, rank): - resource_requirements = {} - comm_requirements_to_ranks = {} - # only support device_type and only support GPU for now - resource_requirements["device_type"] = DeviceType.GPU - for block in program.blocks: - for op in block.ops: - cur_comm_requirements_to_ranks = analyze_comm_requirements_from_op( - op, rank) - for tgt_rank, link_info in cur_comm_requirements_to_ranks.items(): - if tgt_rank in comm_requirements_to_ranks: - comm_requirements_to_ranks[tgt_rank][ - "comm_volume"] += link_info["comm_volume"] - else: - comm_requirements_to_ranks[tgt_rank] = {} - comm_requirements_to_ranks[tgt_rank][ - "comm_volume"] = link_info["comm_volume"] - return resource_requirements, comm_requirements_to_ranks - - -def build_process_graph(distributed_program): - graph = Graph() - for src_rank, src_program in distributed_program.items(): - resource_requirements, comm_requirements_to_ranks = analyze_requirements_for_program( - src_program, src_rank) - graph.add_node(src_rank, resource_requirements=resource_requirements) - for tgt_rank, comm_requirements in comm_requirements_to_ranks.items(): - graph.add_edge( - src_rank, tgt_rank, comm_requirements=comm_requirements) - return graph - - -def build_cluster_graph(cluster): - graph = Graph() - for machine in cluster.machines.values(): - for device in machine.devices.values(): - graph.add_node(device.global_id, device=device) - for link in machine.links.values(): - graph.add_edge( - link.source.global_id, link.target.global_id, link=link) - return graph - - -def mapping(distributed_program, cluster): - # A very simple mapping algorithm only for GPUs. - # Here we assume one process will be mapped to one GPU. - # In the future, more mapping configurations and algorithms will be supported. 
- process_graph = build_process_graph(distributed_program) - - cluster_graph = build_cluster_graph(cluster) - - for cur_rank_node in process_graph: - cur_rank_node["visited"] = False - - for cur_device_node in cluster_graph: - cur_device_node["occupied"] = False - - def sort_by_comm_volume(rank_edge): - return rank_edge["comm_requirements"]["comm_volume"] - - def sort_by_comm_bandwidth(device_edge): - return device_edge["link"].bandwidth - - def select_unvisited_rank_node(rank_node_list): - selected_rank_node = None - for rank_node in rank_node_list: - if rank_node["visited"] is False: - selected_rank_node = rank_node - return selected_rank_node - - queue = deque() - root_rank_node = select_unvisited_rank_node( - list(process_graph.nodes.values())) - while root_rank_node is not None: - queue.append(root_rank_node) - while queue: - cur_rank_node = queue.popleft() - if cur_rank_node["visited"]: - continue - device_type = cur_rank_node["resource_requirements"]["device_type"] - cur_device_node = None - for device_node in cluster_graph.nodes.values(): - if (device_node["device"].type == device_type) and ( - not device_node["occupied"]): - device_node["occupied"] = True - cur_rank_node["visited"] = True - cur_rank_node["device"] = device_node["device"] - cur_device_node = device_node - break - assert cur_device_node, "Cannot find a device to satisfy the requirement." - - nbr_rank_edges = [] - for nbr_rank_node_id, nbr_rank_edge in process_graph.adjs[ - cur_rank_node.id].items(): - assert nbr_rank_edge.src_id == cur_rank_node.id and nbr_rank_edge.tgt_id == nbr_rank_node_id - queue.append(process_graph.nodes[nbr_rank_node_id]) - nbr_rank_edges.append(nbr_rank_edge) - nbr_rank_edges.sort(key=sort_by_comm_volume) - - nbr_device_edges = [] - for nbr_device_edge in cluster_graph.adjs[ - cur_device_node.id].values(): - nbr_device_edges.append(nbr_device_edge) - nbr_device_edges.sort(key=sort_by_comm_bandwidth) - - for nbr_rank_edge in nbr_rank_edges: - src_rank_node = process_graph.nodes[nbr_rank_edge.src_id][ - "visited"] - if src_rank_node: - continue - device_type = src_rank_node["resource_requirements"][ - "device_type"] - nbr_rank_node = process_graph.nodes[nbr_rank_edge.tgt_id] - for nbr_device_edge in nbr_device_edges: - nbr_device_node = cluster_graph.nodes[ - nbr_device_edge.tgt_id] - if (nbr_device_node["device"].type == device_type) and ( - not nbr_device_node["occupied"]): - nbr_device_node["occupied"] = True - nbr_rank_node["visited"] = True - nbr_rank_node["device"] = nbr_device_node["device"] - break - root_rank_node = select_unvisited_rank_node( - list(process_graph.nodes.values())) - - rank_mapping = {} - for rank, rank_node in process_graph.nodes.items(): - device = rank_node["device"] - machine = device.machine - if machine.id in rank_mapping: - rank_mapping[machine.id]["hostname"] = machine.hostname - rank_mapping[machine.id]["addr"] = machine.addr - rank_mapping[machine.id]["port"] = machine.port - if rank not in rank_mapping[machine.id]["ranks"]: - rank_mapping[machine.id]["ranks"][rank] = [] - rank_mapping[machine.id]["ranks"][rank].append(device.local_id) - else: - rank_mapping[machine.id]["ranks"][rank].append(device.local_id) - else: - rank_mapping[machine.id] = {} - rank_mapping[machine.id]["hostname"] = machine.hostname - rank_mapping[machine.id]["addr"] = machine.addr - rank_mapping[machine.id]["port"] = machine.port - rank_mapping[machine.id]["ranks"] = {} - rank_mapping[machine.id]["ranks"][rank] = [] - rank_mapping[machine.id]["ranks"][rank].append(device.local_id) - for 
machine_mapping in rank_mapping.values(): - for rank_devices in machine_mapping["ranks"].values(): - rank_devices.sort() - - return rank_mapping diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 30633050cc196..97af987ae3b33 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -142,7 +142,6 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_fleet_gradient_scale) LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor) - LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_multi_devices) endif() @@ -1040,7 +1039,6 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120) - set_tests_properties(test_auto_parallel_mapper PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py deleted file mode 100644 index 2bd05f1ba221a..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ /dev/null @@ -1,1358 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import unittest -import os -import json -import collections -import math -import paddle - -import paddle.nn as nn -import paddle.nn.functional as F -import paddle.tensor as tensor -import paddle.utils as utils -import paddle.static as static -from paddle.fluid import layers -from paddle.fluid.framework import in_dygraph_mode -from paddle.nn.layer.transformer import _convert_param_attr_to_list -from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer -from paddle.distributed import fleet - -import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.dist_context import DistributedContext -from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.reshard import reshard -from paddle.distributed.auto_parallel.process_group import get_all_process_groups -from paddle.distributed.auto_parallel.process_group import new_process_group -from paddle.distributed.auto_parallel.cluster import Cluster -from paddle.distributed.auto_parallel.cluster import DeviceType -from paddle.distributed.auto_parallel.cluster import LinkType -from paddle.distributed.auto_parallel.mapper import build_process_graph -from paddle.distributed.auto_parallel.mapper import build_cluster_graph -from paddle.distributed.auto_parallel.mapper import mapping -from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program -from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr -from paddle.distributed.auto_parallel.utils import _get_comm_group - -paddle.enable_static() -_global_parallel_strategy = None -_global_process_mesh = None -_global_num_stages = None - -cluster_json = """ -{ - "machines": [ - { - "hostname": "machine0", - "addr": "0.0.0.1", - "port": "768", - "devices": [ - { - "global_id": 0, - "local_id": 0, - "type": "GPU", - "model": "A100-SXM4-40GB", - "sp_gflops": 19500, - "dp_gflops": 9700, - "memory": 40 - }, - { - "global_id": 1, - "local_id": 1, - "type": "GPU", - "model": "A100-SXM4-40GB", - "sp_gflops": 19500, - "dp_gflops": 9700, - "memory": 40 - }, - { - "global_id": 2, - "local_id": 2, - "type": "GPU", - "model": "A100-SXM4-40GB", - "sp_gflops": 19500, - "dp_gflops": 9700, - "memory": 40 - }, - { - "global_id": 3, - "local_id": 3, - "type": "GPU", - "model": "A100-SXM4-40GB", - "sp_gflops": 19500, - "dp_gflops": 9700, - "memory": 40 - }, - { - "global_id": 4, - "local_id": 0, - "type": "NIC" - } - ], - "links": [ - { - "source_global_id": 0, - "target_global_id": 1, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 0, - "target_global_id": 2, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 0, - "target_global_id": 3, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 0, - "target_global_id": 4, - "type": "PHB", - "bandwidth": 12 - }, - { - "source_global_id": 1, - "target_global_id": 0, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 1, - "target_global_id": 2, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 1, - "target_global_id": 3, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 1, - "target_global_id": 4, - "type": "PHB", - "bandwidth": 12 - }, - { - "source_global_id": 2, - "target_global_id": 0, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 2, - "target_global_id": 1, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 2, - "target_global_id": 3, - "type": "NVL", - "bandwidth": 42 - }, 
- { - "source_global_id": 2, - "target_global_id": 4, - "type": "PHB", - "bandwidth": 12 - }, - { - "source_global_id": 3, - "target_global_id": 0, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 3, - "target_global_id": 1, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 3, - "target_global_id": 2, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 3, - "target_global_id": 4, - "type": "PHB", - "bandwidth": 12 - }, - { - "source_global_id": 4, - "target_global_id": 9, - "type": "NET", - "bandwidth": 1 - } - ] - }, - { - "hostname": "machine1", - "addr": "0.0.0.2", - "port": "768", - "devices": [ - { - "global_id": 5, - "local_id": 0, - "type": "GPU", - "model": "Tesla V100-SXM2-32GB", - "sp_gflops": 15700, - "dp_gflops": 7800, - "memory": 32 - }, - { - "global_id": 6, - "local_id": 1, - "type": "GPU", - "model": "Tesla V100-SXM2-32GB", - "sp_gflops": 15700, - "dp_gflops": 7800, - "memory": 32 - }, - { - "global_id": 7, - "local_id": 2, - "type": "GPU", - "model": "Tesla V100-SXM2-32GB", - "sp_gflops": 15700, - "dp_gflops": 7800, - "memory": 32 - }, - { - "global_id": 8, - "local_id": 3, - "type": "GPU", - "model": "Tesla V100-SXM2-32GB", - "sp_gflops": 15700, - "dp_gflops": 7800, - "memory": 32 - }, - { - "global_id": 9, - "local_id": 0, - "type": "NIC" - } - ], - "links": [ - { - "source_global_id": 5, - "target_global_id": 6, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 5, - "target_global_id": 7, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 5, - "target_global_id": 8, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 5, - "target_global_id": 9, - "type": "PHB", - "bandwidth": 12 - }, - { - "source_global_id": 6, - "target_global_id": 5, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 6, - "target_global_id": 7, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 6, - "target_global_id": 8, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 6, - "target_global_id": 9, - "type": "PHB", - "bandwidth": 12 - }, - { - "source_global_id": 7, - "target_global_id": 5, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 7, - "target_global_id": 6, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 7, - "target_global_id": 8, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 7, - "target_global_id": 9, - "type": "PHB", - "bandwidth": 12 - }, - { - "source_global_id": 8, - "target_global_id": 5, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 8, - "target_global_id": 6, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 8, - "target_global_id": 7, - "type": "NVL", - "bandwidth": 42 - }, - { - "source_global_id": 8, - "target_global_id": 9, - "type": "PHB", - "bandwidth": 12 - }, - { - "source_global_id": 9, - "target_global_id": 4, - "type": "NET", - "bandwidth": 1 - } - ] - } - ] -} -""" - - -class MultiHeadAttention(nn.Layer): - """ - Attention mapps queries and a set of key-value pairs to outputs, and - Multi-Head Attention performs multiple parallel attention to jointly attending - to information from different representation subspaces. 
- """ - - Cache = collections.namedtuple("Cache", ["k", "v"]) - StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) - - def __init__(self, - embed_dim, - num_heads, - dropout=0., - kdim=None, - vdim=None, - need_weights=False, - weight_attr=None, - bias_attr=None, - topo=None, - fuse=False, - stage=None): - super(MultiHeadAttention, self).__init__() - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self.num_heads = num_heads - self.dropout = dropout - self.need_weights = need_weights - self.fuse = fuse - - self.stage = stage - - self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - - if topo is None or topo.mp_info.size == 1: - if self.fuse: - assert self.kdim == embed_dim - assert self.vdim == embed_dim - self.qkv_proj = nn.Linear( - embed_dim, 3 * embed_dim, weight_attr, bias_attr=bias_attr) - else: - self.q_proj = nn.Linear( - embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) - self.k_proj = nn.Linear( - self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) - self.v_proj = nn.Linear( - self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) - self.out_proj = nn.Linear( - embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) - - def _fuse_prepare_qkv(self, query): - mix_layer = self.qkv_proj(query) - mix_layer = paddle.reshape_(mix_layer, - [0, 0, self.num_heads, 3 * self.head_dim]) - mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) - q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) - return q, k, v - - def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): - r""" - Prapares linear projected queries, keys and values for usage of subsequnt - multiple parallel attention. If `cache` is not None, using cached results - to reduce redundant calculations. - """ - q = self.q_proj(query) - - if _global_parallel_strategy == "mp": - auto.shard_tensor( - self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor( - self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor( - self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh[self.stage], - "dims_mapping": [-1, 1] - }) - - q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) - q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) - - if isinstance(cache, self.StaticCache): - # for encoder-decoder attention in inference and has cached - k, v = cache.k, cache.v - else: - k, v = self.compute_kv(key, value) - - if isinstance(cache, self.Cache): - # for decoder self-attention in inference - k = tensor.concat([cache.k, k], axis=2) - v = tensor.concat([cache.v, v], axis=2) - if use_cache is True: - cache = self.Cache(k, v) - - return (q, k, v) if use_cache is False else (q, k, v, cache) - - def compute_kv(self, key, value): - r""" - Applies linear projection on input keys and values, then splits heads - (reshape and transpose) to get keys and values from different representation - subspaces. The results are used as key-values pairs for subsequent multiple - parallel attention. - It is part of calculations in multi-head attention, and is provided as - a method to pre-compute and prefetch these results, thus we can use them - to construct cache for inference. 
- """ - k = self.k_proj(key) - - if _global_parallel_strategy == "mp": - auto.shard_tensor( - self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor( - self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor( - self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh[self.stage], - "dims_mapping": [-1, 1] - }) - - v = self.v_proj(value) - - if _global_parallel_strategy == "mp": - auto.shard_tensor( - self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor( - self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor( - self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh[self.stage], - "dims_mapping": [-1, 1] - }) - - k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) - k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) - v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) - v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) - return k, v - - def gen_cache(self, key, value=None, type=Cache): - """ - Generates cache for `forward` usage in inference accroding to arguments. - The generated cache is an instance of `MultiHeadAttention.Cache` or an - instance of `MultiHeadAttention.StaticCache`. - """ - if type == MultiHeadAttention.StaticCache: # static_kv - k, v = self.compute_kv(key, value) - return self.StaticCache(k, v) - elif value is None: # incremental_state - k = layers.fill_constant_batch_size_like( - input=key, - shape=[-1, self.num_heads, 0, self.head_dim], - dtype=key.dtype, - value=0) - v = layers.fill_constant_batch_size_like( - input=key, - shape=[-1, self.num_heads, 0, self.head_dim], - dtype=key.dtype, - value=0) - return self.Cache(k, v) - else: - # incremental_state with initial value, mainly for usage like UniLM - return self.Cache(key, value) - - def forward(self, - query, - key, - value, - attn_mask=None, - use_cache=False, - cache=None): - r""" - Applies multi-head attention to map queries and a set of key-value pairs - to outputs. 
- """ - key = query if key is None else key - value = query if value is None else value - # compute q ,k ,v - if use_cache is False: - if self.fuse: - q, k, v = self._fuse_prepare_qkv(query) - else: - q, k, v = self._prepare_qkv(query, key, value, use_cache, cache) - else: - q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, - cache) - # scale dot product attention - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) - - if attn_mask is not None: - product = product + attn_mask - - weights = F.softmax(product) - if self.dropout: - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train") - - out = tensor.matmul(weights, v) - - # combine heads - out = tensor.transpose(out, perm=[0, 2, 1, 3]) - out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - - # project to output - out = self.out_proj(out) - - if _global_parallel_strategy == "mp": - auto.shard_tensor( - self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor( - self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) - elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor( - self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh[self.stage], - "dims_mapping": [1, -1] - }) - - outs = [out] - if self.need_weights: - outs.append(weights) - if use_cache: - outs.append(cache) - return out if len(outs) == 1 else tuple(outs) - - -class TransformerDecoder(nn.Layer): - """ - TransformerDecoder is a stack of N decoder layers. - """ - - def __init__(self, - decoder_layers, - num_layers, - norm=None, - hidden_size=None, - topo=None): - super(TransformerDecoder, self).__init__() - - self.topo = topo - self.num_layers = num_layers - self.layers = decoder_layers - self.norm = norm - if norm is "LayerNorm": - self.norm = nn.LayerNorm(hidden_size) - elif norm is not None: - raise ValueError("Only support LayerNorm") - self.checkpoints = [] - - def forward(self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - use_cache=False, - cache=None): - r""" - Applies a stack of N Transformer decoder layers on inputs. If `norm` is - provided, also applies layer normalization on the output of last decoder - layer. 
- """ - output = tgt - new_caches = [] - self.checkpoints = [] - assert cache is None and use_cache == False - if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor( - output, - dist_attr={ - "process_mesh": _global_process_mesh[0], - "dims_mapping": - [0] + [-1 for i in range(len(output.shape) - 1)] - }) - for i, mod in enumerate(self.layers): - if cache is None: - if use_cache: - output, new_cache = mod(output, - memory, - tgt_mask=tgt_mask, - use_cache=use_cache, - cache=cache) - new_caches.append(new_cache) - else: - if _global_parallel_strategy == "dp_mp_pp": - output = auto.shard_op( - mod, - dist_attr={ - "process_mesh": _global_process_mesh[mod.stage] - })(output, memory, tgt_mask, use_cache, cache)[0] - - auto.shard_tensor( - output, - dist_attr={ - "process_mesh": _global_process_mesh[mod.stage], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) - else: - output = mod(output, - memory, - tgt_mask=tgt_mask, - use_cache=use_cache, - cache=cache) - - else: - output, new_cache = mod(output, - memory, - tgt_mask=tgt_mask, - use_cache=use_cache, - cache=cache[i]) - new_caches.append(new_cache) - self.checkpoints.append(output.name) - - if self.norm is not None: - output = self.norm(output) - return output if use_cache is False else (output, new_caches) - - def gen_cache(self, memory, do_zip=False): - r""" - Generates cache for `forward` usage. The generated cache is a list, and - each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) - produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` - for more details. If `do_zip` is True, apply `zip` on these tuples to get - a list with two elements. - """ - cache = [layer.gen_cache(memory) for layer in self.layers] - if do_zip: - cache = list(zip(*cache)) - return cache - - -class TransformerDecoderLayer(nn.Layer): - """ - The transformer decoder layer. - It contains multiheadattention and some linear layers. 
- """ - - def __init__(self, - d_model, - nhead, - dim_feedforward, - dropout=0.1, - activation="gelu", - attn_dropout=None, - act_dropout=None, - normalize_before=True, - weight_attr=None, - bias_attr=None, - topo=None, - stage=None): - self._config = locals() - self._config.pop("self") - self._config.pop("__class__", None) # py3 - - self.stage = stage - - super(TransformerDecoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - weight_attrs = _convert_param_attr_to_list(weight_attr, 3) - bias_attrs = _convert_param_attr_to_list(bias_attr, 3) - - self.self_attn = MultiHeadAttention( - d_model, - nhead, - dropout=attn_dropout, - weight_attr=weight_attrs[0], - bias_attr=bias_attrs[0], - topo=topo, - stage=self.stage) - if topo is None or topo.mp_info.size == 1: - self.linear1 = nn.Linear( - d_model, - dim_feedforward, - weight_attrs[2], - bias_attr=bias_attrs[2]) - self.linear2 = nn.Linear( - dim_feedforward, - d_model, - weight_attrs[2], - bias_attr=bias_attrs[2]) - - self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) - self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - - def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): - residual = tgt - - if self.normalize_before: - tgt = self.norm1(tgt) - - if use_cache is False: - tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) - else: - tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, - use_cache, cache) - tgt = residual + self.dropout1(tgt) - if not self.normalize_before: - tgt = self.norm1(tgt) - - residual = tgt - if self.normalize_before: - tgt = self.norm2(tgt) - - if _global_parallel_strategy == "mp": - auto.shard_tensor( - self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor( - self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor( - self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh[self.stage], - "dims_mapping": [-1, 1] - }) - - if _global_parallel_strategy == "mp": - auto.shard_tensor( - self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor( - self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) - elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor( - self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh[self.stage], - "dims_mapping": [1, -1] - }) - - # tgt = self.dropout2( - # self.linear2(F.gelu( - # self.linear1(tgt), approximate=True))) - tgt = self.linear1(tgt) - tgt = F.gelu(tgt, approximate=True) - tgt = self.dropout2(self.linear2(tgt)) - tgt = residual + tgt - - if not self.normalize_before: - tgt = self.norm2(tgt) - - return tgt if use_cache is False else (tgt, incremental_cache) - - def gen_cache(self, memory): - incremental_cache = self.self_attn.gen_cache( - memory, type=self.self_attn.Cache) - return incremental_cache - - -class GPTEmbeddings(nn.Layer): - """ - Include embeddings from word, 
position and token_type embeddings - """ - - def __init__(self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - topo=None, - stage=None): - super(GPTEmbeddings, self).__init__() - if topo is None or topo.mp_info.size == 1: - self.word_embeddings = nn.Embedding( - vocab_size, - hidden_size, - weight_attr=paddle.ParamAttr( - name="word_embeddings", - initializer=nn.initializer.Normal( - mean=0.0, std=initializer_range))) - self.position_embeddings = nn.Embedding( - max_position_embeddings, - hidden_size, - weight_attr=paddle.ParamAttr( - name="pos_embeddings", - initializer=nn.initializer.Normal( - mean=0.0, std=initializer_range))) - - self.dropout = nn.Dropout(hidden_dropout_prob) - - def forward(self, input_ids, position_ids=None): - if position_ids is None: - ones = paddle.ones_like(input_ids, dtype="int64") - seq_length = paddle.cumsum(ones, axis=-1) - position_ids = seq_length - ones - - input_embedings = self.word_embeddings(input_ids) - - if _global_parallel_strategy == "mp": - auto.shard_tensor( - self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor( - self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) - elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor( - self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh[0], - "dims_mapping": [1, -1] - }) - - position_embeddings = self.position_embeddings(position_ids) - embeddings = input_embedings + position_embeddings - embeddings = self.dropout(embeddings) - return embeddings - - -class GPTModel(nn.Layer): - """ - The base model of gpt. 
- """ - - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=4, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - pad_token_id=0, - topo=None): - super(GPTModel, self).__init__() - - self.pad_token_id = pad_token_id - self.initializer_range = initializer_range - self.topo = topo - self.hidden_size = hidden_size - self.vocab_size = vocab_size - - self.pipline_mode = topo is not None and topo.pp_info.size > 1 - if self.pipline_mode: - self.layer_per_stage = num_hidden_layers // self.topo.pp_info.size - - self.embeddings = GPTEmbeddings( - vocab_size, hidden_size, hidden_dropout_prob, - max_position_embeddings, type_vocab_size, self.initializer_range, - topo) - - layer_per_stage = num_hidden_layers // _global_num_stages - decoder_layers = nn.LayerList() - for i in range(num_hidden_layers): - stage = i // layer_per_stage - DecoderLayer = TransformerDecoderLayer - decoder_layers.append( - DecoderLayer( - d_model=hidden_size, - nhead=num_attention_heads, - dim_feedforward=intermediate_size, - dropout=hidden_dropout_prob, - activation=hidden_act, - attn_dropout=attention_probs_dropout_prob, - act_dropout=hidden_dropout_prob, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Normal( - mean=0.0, std=self.initializer_range)), - bias_attr=None, - topo=topo, - stage=stage)) - - Decoder = TransformerDecoder - - self.decoder = Decoder( - decoder_layers, - num_hidden_layers, - norm="LayerNorm", - hidden_size=hidden_size, - topo=topo) - - self.checkpoints = [] - - def forward(self, - input_ids, - position_ids=None, - attention_mask=None, - use_cache=False, - cache=None): - self.checkpoints = [] - if attention_mask is None: - length = paddle.shape(input_ids)[1] - # Use bool mask - attention_mask = paddle.tensor.tril( - paddle.ones( - (length, length), - dtype=self.embeddings.word_embeddings.weight.dtype)) - if position_ids is None: - past_length = 0 - if cache is not None: - past_length = paddle.shape(cache[0].k)[-2] - position_ids = paddle.arange( - past_length, - paddle.shape(input_ids)[-1] + past_length, - dtype='int64') - position_ids = position_ids.unsqueeze(0) - # .expand_as(input_ids) - position_ids = paddle.fluid.layers.expand_as(position_ids, - input_ids) - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids) - - # TODO, use registered buffer - causal_mask = paddle.tensor.triu( - paddle.ones((paddle.shape(input_ids)[-1], - paddle.shape(input_ids)[-1])) * -1e9, - diagonal=1) - - if attention_mask is not None: - attention_mask = attention_mask + causal_mask - else: - attention_mask = causal_mask - - # The tensor returned by triu not in static graph. - attention_mask.stop_gradient = True - - encoder_outputs = self.decoder( - embedding_output, - memory=None, - tgt_mask=attention_mask, - use_cache=use_cache, - cache=cache) - self.checkpoints.extend(self.decoder.checkpoints) - return encoder_outputs - - -class GPTForPretraining(nn.Layer): - """ - The pretraining model of GPT. - It returns some logits and cached_kvs. 
- """ - - def __init__(self, gpt): - super(GPTForPretraining, self).__init__() - self.gpt = gpt - self.share_param = False - self.weight = self.gpt.embeddings.word_embeddings.weight - if not self.share_param: - self.weight = self.create_parameter(shape=self.weight.shape) - - def parallel_matmul(self, lm_output, logit_weights, parallel_output, topo): - if topo is not None and topo.mp_info.size > 1: - input_parallel = paddle.distributed.collective._c_identity( - lm_output, group=None) - - logits = paddle.matmul( - input_parallel, logit_weights, transpose_y=True) - - if parallel_output: - return logits - - return paddle.distributed.collective._c_concat(logits, group=None) - else: - logits = paddle.matmul(lm_output, logit_weights, transpose_y=True) - return logits - - def forward(self, - input_ids, - position_ids=None, - attention_mask=None, - masked_positions=None, - use_cache=False, - cache=None): - outputs = self.gpt(input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - use_cache=use_cache, - cache=cache) - if use_cache: - encoder_outputs, cached_kvs = outputs[:2] - else: - encoder_outputs = outputs - logits = self.parallel_matmul(encoder_outputs, self.weight, True, - self.gpt.topo) - - if use_cache: - return logits, cached_kvs - else: - return logits - - -class GPTPretrainingCriterion(nn.Layer): - """ - Criterion for GPT. - It calculates the final loss. - """ - - def __init__(self, topo=None): - super(GPTPretrainingCriterion, self).__init__() - if topo is None or topo.mp_info.size == 1: - self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") - else: - self.loss_func = paddle.distributed.collective._c_softmax_with_cross_entropy - - def forward(self, prediction_scores, masked_lm_labels, loss_mask): - masked_lm_loss = self.loss_func(prediction_scores, - masked_lm_labels.unsqueeze(2)) - - loss_mask = loss_mask.reshape([-1]) - masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) - loss = masked_lm_loss / loss_mask.sum() - return loss - - -def gpt_pretrain_forward(train_program, startup_program): - with static.program_guard(train_program, - startup_program), utils.unique_name.guard(): - batch_size = 16 - sequence_len = 512 - input_ids = static.data( - name="input_ids", shape=[batch_size, sequence_len], dtype='int64') - position_ids = static.data( - name="position_ids", - shape=[batch_size, sequence_len], - dtype='int64') - attention_mask = static.data( - name="attention_mask", - shape=[batch_size, 1, sequence_len, sequence_len], - dtype='float64') - labels = static.data( - name="labels", shape=[batch_size, sequence_len], dtype='int64') - loss_mask = static.data( - name="loss_mask", shape=[batch_size, sequence_len], dtype='float64') - - if _global_parallel_strategy == "dp": - auto.shard_tensor( - input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor( - input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor( - input_ids, - dist_attr={ - "process_mesh": _global_process_mesh[0], - "dims_mapping": [0, -1] - }) - - gpt = GPTModel( - vocab_size=32768, - hidden_size=768, - num_hidden_layers=2, - num_attention_heads=12, - intermediate_size=4096, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=1024, - type_vocab_size=16, - initializer_range=0.02, - pad_token_id=0, - topo=None) - - model 
= GPTForPretraining(gpt) - - preds = model(input_ids, position_ids, attention_mask) - - criterion = GPTPretrainingCriterion() - - loss = criterion(preds, labels, loss_mask) - - return train_program, startup_program, loss - - -def get_dist_prog(train_program, startup_program, dist_context, rank_id): - train_program, startup_program, loss = gpt_pretrain_forward(train_program, - startup_program) - - dist_strategy = fleet.DistributedStrategy() - - # auto completion - complete_train_program = auto.complete_annotation(train_program, - dist_context) - partitioner = Partitioner(dist_strategy, dist_context, rank_id) - # logical partition - dist_main_prog, dist_startup_prog = partitioner.transpile_forward( - complete_train_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, startup_program, dist_main_prog, - dist_startup_prog) - optimizer = paddle.fluid.optimizer.AdamOptimizer() - opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - dist_main_prog, dist_startup_prog) - - return dist_main_prog, dist_startup_prog - - -def is_in_machine(device_local_id, machine): - for device in machine.devices.values(): - if device_local_id == device.local_id: - return True - return False - - -def get_device_local_ids(machine): - local_ids = [] - for device in machine.devices.values(): - local_ids.append[device.local_id] - return local_ids - - -class TestAutoParallelMapper(unittest.TestCase): - def test_mapper_dp_mp(self): - cluster_json_file = "" - cluster_json_object = json.loads(cluster_json) - with open("./auto_parallel_cluster.json", "w") as cluster_json_file: - json.dump(cluster_json_object, cluster_json_file) - cluster = Cluster() - cluster.build_from_file("./auto_parallel_cluster.json") - os.remove("./auto_parallel_cluster.json") - - global _global_parallel_strategy - _global_parallel_strategy = "dp_mp" - global _global_num_stages - _global_num_stages = 1 - global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) - - dist_programs = {} - for rank_id in _global_process_mesh.processes: - train_program = static.Program() - startup_program = static.Program() - dist_context = DistributedContext() - dist_train_program, dist_startup_prog = get_dist_prog( - train_program, startup_program, dist_context, rank_id) - reshard(dist_train_program, dist_startup_prog, rank_id, - dist_context) - dist_programs[rank_id] = dist_train_program - - process_graph = build_process_graph(dist_programs) - - rank_mapping = mapping(dist_programs, cluster) - - all_mapped_ranks = set() - for machine_id, machine_mapping in rank_mapping.items(): - machine = cluster.machines[machine_id] - machine_mapped_ranks = set() - machine_mapped_device_local_ids = set() - for rank, device_ids in machine_mapping["ranks"].items(): - # Only allow one process to one device mapping - self.assertEqual(len(device_ids), 1) - self.assertTrue(is_in_machine(device_ids[0], machine)) - machine_mapped_ranks.add(rank) - machine_mapped_device_local_ids.add(device_ids[0]) - self.assertEqual( - len(machine_mapped_ranks), len(machine_mapped_device_local_ids)) - all_mapped_ranks.update(machine_mapped_ranks) - print(_global_process_mesh.processes, all_mapped_ranks) - self.assertEqual(set(_global_process_mesh.processes), all_mapped_ranks) - - def test_mapper_dp_mp_pp(self): - cluster_json_file = "" - cluster_json_object = json.loads(cluster_json) - with open("./auto_parallel_cluster.json", "w") as cluster_json_file: - json.dump(cluster_json_object, 
cluster_json_file) - cluster = Cluster() - cluster.build_from_file("./auto_parallel_cluster.json") - os.remove("./auto_parallel_cluster.json") - - global _global_parallel_strategy - _global_parallel_strategy = "dp_mp_pp" - global _global_num_stages - _global_num_stages = 2 - global _global_process_mesh - _global_process_mesh = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]] - processes = [0, 1, 2, 3, 4, 5, 6, 7] - - dist_programs = {} - for rank_id in processes: - train_program = static.Program() - startup_program = static.Program() - dist_context = DistributedContext() - dist_train_program, dist_startup_prog = get_dist_prog( - train_program, startup_program, dist_context, rank_id) - reshard(dist_train_program, dist_startup_prog, rank_id, - dist_context) - dist_programs[rank_id] = dist_train_program - - rank_mapping = mapping(dist_programs, cluster) - - all_mapped_ranks = set() - for machine_id, machine_mapping in rank_mapping.items(): - machine = cluster.machines[machine_id] - machine_mapped_ranks = set() - machine_mapped_device_local_ids = set() - for rank, device_ids in machine_mapping["ranks"].items(): - # Only allow one process to one device mapping - self.assertEqual(len(device_ids), 1) - self.assertTrue(is_in_machine(device_ids[0], machine)) - machine_mapped_ranks.add(rank) - machine_mapped_device_local_ids.add(device_ids[0]) - self.assertEqual( - len(machine_mapped_ranks), len(machine_mapped_device_local_ids)) - all_mapped_ranks.update(machine_mapped_ranks) - self.assertEqual(set(processes), all_mapped_ranks) - - -if __name__ == '__main__': - unittest.main() From f56cacf6974f06ebee310f076f81bf86fe103d9f Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Wed, 24 Nov 2021 01:32:51 +0000 Subject: [PATCH 17/30] Update the unittest for auto mapping --- .../distributed/auto_parallel/mapper.py | 293 ++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 1 + .../unittests/test_auto_parallel_mapper.py | 547 ++++++++++++++++++ 3 files changed, 841 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/mapper.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/mapper.py new file mode 100644 index 0000000000000..4c514b2f1d070 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/mapper.py @@ -0,0 +1,293 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +import operator +import functools +import json +import paddle +from collections import deque +from .graph import Node +from .graph import Edge +from .graph import Graph +from .cluster import DeviceType +from .process_group import get_process_group + + +def is_collective_comm_op(op): + comm_list = [ + "c_allreduce_sum", "c_allreduce_min", "c_allreduce_max", + "c_allreduce_prod", "c_reduce_sum", "c_reduce_min", "c_reduce_max", + "c_reduce_prod", "c_broadcast", "c_allgather", "send_v2", "recv_v2" + ] + if op.type in comm_list: + return True + else: + return False + + +def is_p2p_comm_op(op): + comm_list = ["send_v2", "recv_v2"] + if op.type in comm_list: + return True + else: + return False + + +def get_dtype_bytes(dtype): + num_bytes = 0 + if dtype == paddle.float64: + num_bytes = 8 + elif dtype == paddle.float32: + num_bytes = 4 + elif dtype == paddle.float16: + num_bytes = 2 + elif dtype == paddle.bfloat16: + num_bytes = 2 + elif dtype == paddle.int64: + num_bytes = 8 + elif dtype == paddle.int32: + num_bytes = 4 + elif dtype == paddle.int16: + num_bytes = 2 + elif dtype == paddle.int8: + num_bytes = 1 + elif dtype == paddle.uint8: + num_bytes = 1 + else: + raise ValueError("Unrecognized dtype {}.".format(dtype)) + return num_bytes + + +def get_comm_volume(comm_op, src_rank, tgt_rank): + comm_volume = None + if src_rank == tgt_rank: + return comm_volume + comm_op_type = comm_op.type + if comm_op_type != "recv_v2": + tensor_name = comm_op.input_arg_names[0] + else: + tensor_name = comm_op.output_arg_names[0] + tensor = comm_op.block._find_var_recursive(tensor_name) + assert tensor is not None + tensor_shape = tensor.shape + # Skip the batch dim + new_tensor_shape = [] + for val in tensor_shape: + if val == -1: + print("Warning: -1 in the tensor shape.") + new_tensor_shape.append(1) + new_tensor_shape.append(val) + tensor_size = functools.reduce(operator.mul, new_tensor_shape, 1) + tensor_bytes = tensor_size * get_dtype_bytes(tensor.dtype) + if "c_allreduce" in comm_op_type: + comm_volume = 2 * tensor_bytes + elif "c_allgather" in comm_op_type: + comm_volume = tensor_bytes + elif "c_broadcast" in comm_op_type: + if comm_op.attr("root_id") == src_rank: + comm_volume = tensor_bytes + else: + comm_volume = None + elif "c_reduce" in comm_op_type: + if comm_op.attr("root_id") == src_rank: + comm_volume = None + else: + comm_volume = tensor_bytes + elif "send_v2" in comm_op_type: + if comm_op.attr("peer") == tgt_rank: + comm_volume = tensor_bytes + else: + comm_volume = None + elif "recv_v2" in comm_op_type: + comm_volume = None + else: + raise ValueError("Unrecognized communication operator.") + return comm_volume + + +def analyze_comm_requirements_from_op(op, rank): + comm_requirements_to_ranks = {} + if is_collective_comm_op(op): + process_group_id = op.attr("ring_id") + process_group = get_process_group(process_group_id) + if rank not in process_group.ranks: + return comm_requirements_to_ranks + for tgt_rank in process_group.ranks: + comm_volume = get_comm_volume(op, rank, tgt_rank) + if comm_volume is not None: + comm_requirements_to_ranks[tgt_rank] = {} + comm_requirements_to_ranks[tgt_rank][ + "comm_volume"] = comm_volume + elif is_p2p_comm_op(op): + tgt_rank = op.attr("peer") + comm_volume = get_comm_volume(op, rank, tgt_rank) + if comm_volume is not None: + comm_requirements_to_ranks[tgt_rank] = {} + comm_requirements_to_ranks[tgt_rank]["comm_volume"] = comm_volume + else: + 
comm_requirements_to_ranks = {} + return comm_requirements_to_ranks + + +def analyze_requirements_for_program(program, rank): + resource_requirements = {} + comm_requirements_to_ranks = {} + # only support device_type and only support GPU for now + resource_requirements["device_type"] = DeviceType.GPU + for block in program.blocks: + for op in block.ops: + cur_comm_requirements_to_ranks = analyze_comm_requirements_from_op( + op, rank) + for tgt_rank, link_info in cur_comm_requirements_to_ranks.items(): + if tgt_rank in comm_requirements_to_ranks: + comm_requirements_to_ranks[tgt_rank][ + "comm_volume"] += link_info["comm_volume"] + else: + comm_requirements_to_ranks[tgt_rank] = {} + comm_requirements_to_ranks[tgt_rank][ + "comm_volume"] = link_info["comm_volume"] + return resource_requirements, comm_requirements_to_ranks + + +def build_process_graph(distributed_program): + graph = Graph() + for src_rank, src_program in distributed_program.items(): + resource_requirements, comm_requirements_to_ranks = analyze_requirements_for_program( + src_program, src_rank) + graph.add_node(src_rank, resource_requirements=resource_requirements) + for tgt_rank, comm_requirements in comm_requirements_to_ranks.items(): + graph.add_edge( + src_rank, tgt_rank, comm_requirements=comm_requirements) + return graph + + +def build_cluster_graph(cluster): + graph = Graph() + for machine in cluster.machines.values(): + for device in machine.devices.values(): + graph.add_node(device.global_id, device=device) + for link in machine.links.values(): + graph.add_edge( + link.source.global_id, link.target.global_id, link=link) + return graph + + +def mapping(distributed_program, cluster): + # A very simple mapping algorithm only for GPUs. + # Here we assume one process will be mapped to one GPU. + # In the future, more mapping configurations and algorithms will be supported. + process_graph = build_process_graph(distributed_program) + + cluster_graph = build_cluster_graph(cluster) + + for cur_rank_node in process_graph: + cur_rank_node["visited"] = False + + for cur_device_node in cluster_graph: + cur_device_node["occupied"] = False + + def sort_by_comm_volume(rank_edge): + return rank_edge["comm_requirements"]["comm_volume"] + + def sort_by_comm_bandwidth(device_edge): + return device_edge["link"].bandwidth + + def select_unvisited_rank_node(rank_node_list): + selected_rank_node = None + for rank_node in rank_node_list: + if rank_node["visited"] is False: + selected_rank_node = rank_node + return selected_rank_node + + queue = deque() + root_rank_node = select_unvisited_rank_node( + list(process_graph.nodes.values())) + while root_rank_node is not None: + queue.append(root_rank_node) + while queue: + cur_rank_node = queue.popleft() + if cur_rank_node["visited"]: + continue + device_type = cur_rank_node["resource_requirements"]["device_type"] + cur_device_node = None + for device_node in cluster_graph.nodes.values(): + if (device_node["device"].type == device_type) and ( + not device_node["occupied"]): + device_node["occupied"] = True + cur_rank_node["visited"] = True + cur_rank_node["device"] = device_node["device"] + cur_device_node = device_node + break + assert cur_device_node, "Cannot find a device to satisfy the requirement." 
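+ # The current rank is now pinned to a device; next, visit its neighbor
+ # ranks breadth-first. The rank's outgoing communication edges and the
+ # device's outgoing links are both sorted in ascending order, so neighbors
+ # with lower communication volume are matched to lower-bandwidth links first.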
+ + nbr_rank_edges = [] + for nbr_rank_node_id, nbr_rank_edge in process_graph.adjs[ + cur_rank_node.id].items(): + assert nbr_rank_edge.src_id == cur_rank_node.id and nbr_rank_edge.tgt_id == nbr_rank_node_id + queue.append(process_graph.nodes[nbr_rank_node_id]) + nbr_rank_edges.append(nbr_rank_edge) + nbr_rank_edges.sort(key=sort_by_comm_volume) + + nbr_device_edges = [] + for nbr_device_edge in cluster_graph.adjs[ + cur_device_node.id].values(): + nbr_device_edges.append(nbr_device_edge) + nbr_device_edges.sort(key=sort_by_comm_bandwidth) + + for nbr_rank_edge in nbr_rank_edges: + src_rank_node = process_graph.nodes[nbr_rank_edge.src_id][ + "visited"] + if src_rank_node: + continue + device_type = src_rank_node["resource_requirements"][ + "device_type"] + nbr_rank_node = process_graph.nodes[nbr_rank_edge.tgt_id] + for nbr_device_edge in nbr_device_edges: + nbr_device_node = cluster_graph.nodes[ + nbr_device_edge.tgt_id] + if (nbr_device_node["device"].type == device_type) and ( + not nbr_device_node["occupied"]): + nbr_device_node["occupied"] = True + nbr_rank_node["visited"] = True + nbr_rank_node["device"] = nbr_device_node["device"] + break + root_rank_node = select_unvisited_rank_node( + list(process_graph.nodes.values())) + + rank_mapping = {} + for rank, rank_node in process_graph.nodes.items(): + device = rank_node["device"] + machine = device.machine + if machine.id in rank_mapping: + rank_mapping[machine.id]["hostname"] = machine.hostname + rank_mapping[machine.id]["addr"] = machine.addr + rank_mapping[machine.id]["port"] = machine.port + if rank not in rank_mapping[machine.id]["ranks"]: + rank_mapping[machine.id]["ranks"][rank] = [] + rank_mapping[machine.id]["ranks"][rank].append(device.local_id) + else: + rank_mapping[machine.id]["ranks"][rank].append(device.local_id) + else: + rank_mapping[machine.id] = {} + rank_mapping[machine.id]["hostname"] = machine.hostname + rank_mapping[machine.id]["addr"] = machine.addr + rank_mapping[machine.id]["port"] = machine.port + rank_mapping[machine.id]["ranks"] = {} + rank_mapping[machine.id]["ranks"][rank] = [] + rank_mapping[machine.id]["ranks"][rank].append(device.local_id) + for machine_mapping in rank_mapping.values(): + for rank_devices in machine_mapping["ranks"].values(): + rank_devices.sort() + + return rank_mapping diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 4ec4299513966..491d7f6028bbb 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -144,6 +144,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_disable_signal_handler) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_multi_devices) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper) endif() # Temporally disable test_deprecated_decorator diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py new file mode 100644 index 0000000000000..a68df5648db21 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -0,0 +1,547 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import os +import json +import collections +import math +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.tensor as tensor +import paddle.utils as utils +import paddle.static as static +from paddle.fluid import layers +from paddle.fluid.framework import in_dygraph_mode +from paddle.nn.layer.transformer import _convert_param_attr_to_list +from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer +from paddle.distributed import fleet + +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.process_group import get_all_process_groups +from paddle.distributed.auto_parallel.process_group import new_process_group +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cluster import DeviceType +from paddle.distributed.auto_parallel.cluster import LinkType +from paddle.distributed.auto_parallel.mapper import build_process_graph +from paddle.distributed.auto_parallel.mapper import build_cluster_graph +from paddle.distributed.auto_parallel.mapper import mapping +from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +_global_num_stages = None + +cluster_json = """ +{ + "machines": [ + { + "hostname": "machine0", + "addr": "0.0.0.1", + "port": "768", + "devices": [ + { + "global_id": 0, + "local_id": 0, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + }, + { + "global_id": 1, + "local_id": 1, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + }, + { + "global_id": 2, + "local_id": 2, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + }, + { + "global_id": 3, + "local_id": 3, + "type": "GPU", + "model": "A100-SXM4-40GB", + "sp_gflops": 19500, + "dp_gflops": 9700, + "memory": 40 + }, + { + "global_id": 4, + "local_id": 0, + "type": "NIC" + } + ], + "links": [ + { + "source_global_id": 0, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 0, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 0, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 0, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 1, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 1, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 1, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 42 + }, + { + 
"source_global_id": 1, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 2, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 2, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 2, + "target_global_id": 3, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 2, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 3, + "target_global_id": 0, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 3, + "target_global_id": 1, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 3, + "target_global_id": 2, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 3, + "target_global_id": 4, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 4, + "target_global_id": 9, + "type": "NET", + "bandwidth": 1 + } + ] + }, + { + "hostname": "machine1", + "addr": "0.0.0.2", + "port": "768", + "devices": [ + { + "global_id": 5, + "local_id": 0, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "global_id": 6, + "local_id": 1, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "global_id": 7, + "local_id": 2, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "global_id": 8, + "local_id": 3, + "type": "GPU", + "model": "Tesla V100-SXM2-32GB", + "sp_gflops": 15700, + "dp_gflops": 7800, + "memory": 32 + }, + { + "global_id": 9, + "local_id": 0, + "type": "NIC" + } + ], + "links": [ + { + "source_global_id": 5, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 5, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 5, + "target_global_id": 8, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 5, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 6, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 6, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 6, + "target_global_id": 8, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 6, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 7, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 7, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 7, + "target_global_id": 8, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 7, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 8, + "target_global_id": 5, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 8, + "target_global_id": 6, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 8, + "target_global_id": 7, + "type": "NVL", + "bandwidth": 42 + }, + { + "source_global_id": 8, + "target_global_id": 9, + "type": "PHB", + "bandwidth": 12 + }, + { + "source_global_id": 9, + "target_global_id": 4, + "type": "NET", + "bandwidth": 1 + } + ] + } + ] +} +""" + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=64, + intermediate_size=4 * 64, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size 
+ np.random.seed(2021) + arr0 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) + arr1 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model)) + arr2 = np.random.normal(0, 0.02, size=(d_model, dim_feedforward)) + arr3 = np.random.normal(0, 0.02, size=(dim_feedforward, d_model)) + weight_attr0 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr0)) + weight_attr1 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr1)) + weight_attr2 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr2)) + weight_attr3 = paddle.ParamAttr(initializer=NumpyArrayInitializer(arr3)) + bias_attr = None + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr0, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr1, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear2 = nn.Linear( + d_model, dim_feedforward, weight_attr2, bias_attr=bias_attr) + self.linear3 = nn.Linear( + dim_feedforward, d_model, weight_attr3, bias_attr=bias_attr) + + def forward(self, input): + if _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + self.linear0.weight, + dist_attr={ + "process_mesh": _global_process_mesh[0], + "dims_mapping": [-1, 1] + }) + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _global_process_mesh[0], + "dims_mapping": [1, -1] + }) + auto.shard_tensor( + self.linear2.weight, + dist_attr={ + "process_mesh": _global_process_mesh[1], + "dims_mapping": [-1, 1] + }) + auto.shard_tensor( + self.linear3.weight, + dist_attr={ + "process_mesh": _global_process_mesh[1], + "dims_mapping": [1, -1] + }) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + out = self.linear2(out) + out = F.gelu(out, approximate=True) + out = self.linear3(out) + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program,start_program), \ + utils.unique_name.guard(): + batch_size = 4 + hidden_size = 64 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + if _global_parallel_strategy == "dp_mp_pp": + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _global_process_mesh[0], + "dims_mapping": [0, -1] + }) + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + dist_strategy = fleet.DistributedStrategy() + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + dist_main_prog, dist_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, dist_main_prog, + dist_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + dist_main_prog, dist_startup_prog) + + return dist_main_prog, dist_startup_prog + + +def is_in_machine(device_local_id, machine): + 
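+ # Return True if any device on the machine has the given local id.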
for device in machine.devices.values(): + if device_local_id == device.local_id: + return True + return False + + +def get_device_local_ids(machine): + local_ids = [] + for device in machine.devices.values(): + local_ids.append[device.local_id] + return local_ids + + +class TestAutoParallelMapper(unittest.TestCase): + def test_mapper_dp_mp_pp(self): + cluster_json_file = "" + cluster_json_object = json.loads(cluster_json) + with open("./auto_parallel_cluster.json", "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file("./auto_parallel_cluster.json") + os.remove("./auto_parallel_cluster.json") + + global _global_parallel_strategy + _global_parallel_strategy = "dp_mp_pp" + global _global_num_stages + _global_num_stages = 2 + global _global_process_mesh + _global_process_mesh = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]] + processes = [0, 1, 2, 3, 4, 5, 6, 7] + + dist_programs = {} + for rank_id in processes: + train_program = static.Program() + startup_program = static.Program() + dist_context = DistributedContext() + dist_train_program, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + reshard(dist_train_program, dist_startup_prog, rank_id, + dist_context) + dist_programs[rank_id] = dist_train_program + + rank_mapping = mapping(dist_programs, cluster) + + all_mapped_ranks = set() + for machine_id, machine_mapping in rank_mapping.items(): + machine = cluster.machines[machine_id] + machine_mapped_ranks = set() + machine_mapped_device_local_ids = set() + for rank, device_ids in machine_mapping["ranks"].items(): + # Only allow one process to one device mapping + self.assertEqual(len(device_ids), 1) + self.assertTrue(is_in_machine(device_ids[0], machine)) + machine_mapped_ranks.add(rank) + machine_mapped_device_local_ids.add(device_ids[0]) + self.assertEqual( + len(machine_mapped_ranks), len(machine_mapped_device_local_ids)) + all_mapped_ranks.update(machine_mapped_ranks) + self.assertEqual(set(processes), all_mapped_ranks) + + +if __name__ == '__main__': + unittest.main() From 677d3e3175f5a20c9e888011b9243b153e780527 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Thu, 25 Nov 2021 11:43:15 +0000 Subject: [PATCH 18/30] Remove unused rank_mapping unittest --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 4ec4299513966..fc171a1c3e255 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -60,7 +60,6 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_run_random_port) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_async) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend) -list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_rank_mapping) list(APPEND MIXED_DIST_TEST_OPS test_ascend_group) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc) list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) @@ -665,7 +664,6 @@ if(WITH_DISTRIBUTE) bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) 
- bash_test_modules(test_fleet_launch_rank_mapping START_BASH test_fleet_launch_rank_mapping.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) if(WITH_ASCEND OR WITH_ASCEND_CL) bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) From dc2ba12fdf71aef0dc05bc2464674e14c1862394 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Fri, 26 Nov 2021 05:15:26 +0000 Subject: [PATCH 19/30] Improve the unittest coverage --- .../paddle/distributed/auto_parallel/graph.py | 12 ++---------- .../distributed/auto_parallel/process_group.py | 18 +++++++++--------- .../unittests/test_auto_parallel_graph.py | 12 ++++++++++++ 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/graph.py b/python/paddle/distributed/auto_parallel/graph.py index c28b8bfdd5320..14856e390709e 100644 --- a/python/paddle/distributed/auto_parallel/graph.py +++ b/python/paddle/distributed/auto_parallel/graph.py @@ -146,20 +146,12 @@ def __len__(self): def __iter__(self): return iter(self._nodes.values()) - def __getitem__(self, n): + def __getitem__(self, node_id): # Return the adjacency of a node - if isinstance(n, Node): - node_id = n.id - else: - node_id = n return self._adjs[node_id] - def __contains__(self, n): + def __contains__(self, node_id): # Check whether a node in the graph - if isinstance(n, Node): - node_id = n.id - else: - node_id = n try: return node_id in self._nodes except TypeError: diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 29bb3edc94c85..70a19f6c5386a 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -135,15 +135,15 @@ def instantiate(self): self._is_instantiate = True - def __eq__(self, other): - if not isinstance(other, ProcessGroup): - return False - if self.id != other.id: - return False - return True - - def __ne__(self, other): - return not self.__eq__(other) + # def __eq__(self, other): + # if not isinstance(other, ProcessGroup): + # return False + # if self.id != other.id: + # return False + # return True + + # def __ne__(self, other): + # return not self.__eq__(other) def __str__(self): string = "id: {}, nranks: {}, ranks: {}.".format( diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py index eee1ad3ffb991..bbf7e3a46729e 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_graph.py @@ -27,6 +27,8 @@ def test_graph(self): graph = Graph(name="foo") self.assertEqual(graph.attrs["name"], "foo") + graph.add_node(1, weight=0) + # Overide the existing node attribute graph.add_node(1, weight=1) graph.add_node(2, weight=2) graph.add_node(3, weight=3) @@ -57,6 +59,7 @@ def test_graph(self): graph.add_edge(1, 2, weight=0.1) graph.add_edge(1, 3, weight=0.2) graph.add_edge(2, 3, weight=0.3) + graph.add_edge(4, 5, weight=0.4) edge = graph[1][2] edge["info"] = "is a edge" @@ -71,9 +74,18 @@ def test_graph(self): self.assertEqual(graph[1][3]["weight"], 0.2) self.assertEqual(graph[2][3]["weight"], 0.3) + self.assertEqual(graph[4][5]["weight"], 0.4) + str = "{}".format(graph) self.assertIsNotNone(str) + self.assertRaises(TypeError, 6 in graph) + 
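# --- Illustrative aside (not part of the patch above) ---------------------
# The graph.py change in this patch simplifies Graph.__getitem__ and
# Graph.__contains__ to take a plain node id instead of also accepting Node
# objects.  MiniGraph below is a tiny stand-in written for this note (it is
# not the auto_parallel Graph class) showing the resulting access pattern.
class MiniGraph:
    def __init__(self):
        self._nodes = {}
        self._adjs = {}

    def add_node(self, node_id, **attrs):
        self._nodes.setdefault(node_id, {}).update(attrs)
        self._adjs.setdefault(node_id, {})

    def add_edge(self, src_id, tgt_id, **attrs):
        self.add_node(src_id)
        self.add_node(tgt_id)
        # Store the edge attributes in the adjacency of the source node
        self._adjs[src_id][tgt_id] = dict(attrs)

    def __getitem__(self, node_id):
        # Return the adjacency of a node, keyed directly by node id
        return self._adjs[node_id]

    def __contains__(self, node_id):
        # Unhashable ids cannot be nodes, so report False instead of raising
        try:
            return node_id in self._nodes
        except TypeError:
            return False

g = MiniGraph()
g.add_edge(1, 2, weight=0.1)
assert 1 in g and 4 not in g and [] not in g
assert g[1][2]["weight"] == 0.1
# ---------------------------------------------------------------------------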
self.assertRaises(TypeError, "unkown_attr" in graph.nodes[1]) + self.assertRaises(TypeError, "unkown_attr" in graph[1][2]) + self.assertRaises(ValueError, graph.add_node, None) + self.assertRaises(ValueError, graph.add_edge, 3, None) + self.assertRaises(ValueError, graph.add_edge, None, 3) + if __name__ == '__main__': unittest.main() From d56ebf8408d714af5991d5293e8da09968757b97 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Mon, 29 Nov 2021 08:09:19 +0000 Subject: [PATCH 20/30] Improve the unittest coverage --- .../distributed/auto_parallel/mapper.py | 7 +- .../unittests/test_auto_parallel_mapper.py | 80 ++++++++++++++++--- 2 files changed, 75 insertions(+), 12 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/mapper.py b/python/paddle/distributed/auto_parallel/mapper.py index 4c514b2f1d070..f015cf4477195 100644 --- a/python/paddle/distributed/auto_parallel/mapper.py +++ b/python/paddle/distributed/auto_parallel/mapper.py @@ -28,7 +28,7 @@ def is_collective_comm_op(op): comm_list = [ "c_allreduce_sum", "c_allreduce_min", "c_allreduce_max", "c_allreduce_prod", "c_reduce_sum", "c_reduce_min", "c_reduce_max", - "c_reduce_prod", "c_broadcast", "c_allgather", "send_v2", "recv_v2" + "c_reduce_prod", "c_broadcast", "c_allgather" ] if op.type in comm_list: return True @@ -87,7 +87,8 @@ def get_comm_volume(comm_op, src_rank, tgt_rank): if val == -1: print("Warning: -1 in the tensor shape.") new_tensor_shape.append(1) - new_tensor_shape.append(val) + else: + new_tensor_shape.append(val) tensor_size = functools.reduce(operator.mul, new_tensor_shape, 1) tensor_bytes = tensor_size * get_dtype_bytes(tensor.dtype) if "c_allreduce" in comm_op_type: @@ -95,7 +96,7 @@ def get_comm_volume(comm_op, src_rank, tgt_rank): elif "c_allgather" in comm_op_type: comm_volume = tensor_bytes elif "c_broadcast" in comm_op_type: - if comm_op.attr("root_id") == src_rank: + if comm_op.attr("root") == src_rank: comm_volume = tensor_bytes else: comm_volume = None diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index a68df5648db21..7b60a9753bd6d 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -23,10 +23,12 @@ import paddle import paddle.nn as nn +import paddle.fluid as fluid import paddle.nn.functional as F import paddle.tensor as tensor import paddle.utils as utils import paddle.static as static +from paddle.fluid import core from paddle.fluid import layers from paddle.fluid.framework import in_dygraph_mode from paddle.nn.layer.transformer import _convert_param_attr_to_list @@ -42,11 +44,13 @@ from paddle.distributed.auto_parallel.cluster import Cluster from paddle.distributed.auto_parallel.cluster import DeviceType from paddle.distributed.auto_parallel.cluster import LinkType +from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr from paddle.distributed.auto_parallel.mapper import build_process_graph from paddle.distributed.auto_parallel.mapper import build_cluster_graph from paddle.distributed.auto_parallel.mapper import mapping -from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program -from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr +from paddle.distributed.auto_parallel.mapper import get_dtype_bytes +from 
paddle.distributed.auto_parallel.mapper import get_comm_volume paddle.enable_static() _global_parallel_strategy = None @@ -469,16 +473,16 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): dist_context) partitioner = Partitioner(dist_strategy, dist_context, rank_id) # logical partition - dist_main_prog, dist_startup_prog = partitioner.transpile_forward( + dist_train_program, dist_startup_prog = partitioner.transpile_forward( complete_train_program, startup_program) dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, startup_program, dist_main_prog, + loss, complete_train_program, startup_program, dist_train_program, dist_startup_prog) optimizer = paddle.fluid.optimizer.AdamOptimizer() opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - dist_main_prog, dist_startup_prog) - - return dist_main_prog, dist_startup_prog + dist_train_program, dist_startup_prog) + reshard(dist_train_program, dist_startup_prog, rank_id, dist_context) + return dist_train_program, dist_startup_prog def is_in_machine(device_local_id, machine): @@ -520,8 +524,8 @@ def test_mapper_dp_mp_pp(self): dist_context = DistributedContext() dist_train_program, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) - reshard(dist_train_program, dist_startup_prog, rank_id, - dist_context) + # if rank_id == 0: + # print_program_with_dist_attr(dist_train_program, dist_context) dist_programs[rank_id] = dist_train_program rank_mapping = mapping(dist_programs, cluster) @@ -542,6 +546,64 @@ def test_mapper_dp_mp_pp(self): all_mapped_ranks.update(machine_mapped_ranks) self.assertEqual(set(processes), all_mapped_ranks) + def test_mapper_misc(self): + self.assertEqual(get_dtype_bytes(paddle.float64), 8) + self.assertEqual(get_dtype_bytes(paddle.float32), 4) + self.assertEqual(get_dtype_bytes(paddle.float16), 2) + self.assertEqual(get_dtype_bytes(paddle.bfloat16), 2) + self.assertEqual(get_dtype_bytes(paddle.int64), 8) + self.assertEqual(get_dtype_bytes(paddle.int32), 4) + self.assertEqual(get_dtype_bytes(paddle.int16), 2) + self.assertEqual(get_dtype_bytes(paddle.int8), 1) + self.assertEqual(get_dtype_bytes(paddle.uint8), 1) + self.assertRaises(ValueError, get_dtype_bytes, "unknown type") + train_program = static.Program() + startup_program = static.Program() + ring_id = 0 + root_id = 0 + nranks = 2 + with fluid.program_guard(train_program, startup_program): + input = layers.data(name="input", shape=[10, 10], dtype='float32') + output = train_program.current_block().create_var( + name="outofbroadcast", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + broadcast_op = train_program.global_block().append_op( + type="c_broadcast", + inputs={'X': input}, + attrs={'ring_id': ring_id, + 'root': root_id}, + outputs={'Out': output}) + self.assertEqual(get_comm_volume(broadcast_op, 0, 1), 400) + self.assertEqual(get_comm_volume(broadcast_op, 1, 0), None) + allgather_op = train_program.global_block().append_op( + type="c_allgather", + inputs={'X': input}, + attrs={'ring_id': ring_id, + 'nranks': nranks}, + outputs={'Out': output}) + self.assertEqual(get_comm_volume(allgather_op, 0, 1), 400) + self.assertEqual(get_comm_volume(allgather_op, 0, 0), None) + reduce_op = train_program.global_block().append_op( + type="c_reduce_sum", + inputs={'X': input}, + attrs={'ring_id': ring_id, + 'root_id': root_id}, + outputs={'Out': output}) + self.assertEqual(get_comm_volume(reduce_op, 0, 1), None) + 
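# --- Illustrative aside (not part of the test above) ----------------------
# The 400s asserted around here come from the [10, 10] float32 input: the
# mapper's get_comm_volume collapses any -1 (unknown) dim to 1, multiplies
# the remaining dims, and scales by the dtype width in bytes.  A hedged
# re-derivation of just that arithmetic:
import functools
import operator

def comm_bytes(shape, dtype_bytes):
    sized = [1 if dim == -1 else dim for dim in shape]    # -1 counts as 1
    return functools.reduce(operator.mul, sized, 1) * dtype_bytes

assert comm_bytes([10, 10], 4) == 400   # 10 * 10 * 4-byte float32
assert comm_bytes([-1, 10], 4) == 40    # unknown batch dim treated as 1
# ---------------------------------------------------------------------------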
self.assertEqual(get_comm_volume(reduce_op, 1, 0), 400) + cast_op = train_program.global_block().append_op( + type="cast", + inputs={"X": input}, + outputs={"Out": output}, + attrs={ + "in_dtype": fluid.core.VarDesc.VarType.FP32, + "out_dtype": fluid.core.VarDesc.VarType.FP32 + }) + self.assertRaises(ValueError, get_comm_volume, cast_op, 0, 1) + if __name__ == '__main__': unittest.main() From 9e8cc186df821fca24078c027f8d448caac6c38f Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Tue, 30 Nov 2021 02:58:37 +0000 Subject: [PATCH 21/30] Improve the unittest of relaunch --- ...emo.py => auto_parallel_relaunch_model.py} | 10 ++++---- ...unch.py => test_auto_parallel_relaunch.py} | 24 +++++++++++++------ 2 files changed, 21 insertions(+), 13 deletions(-) rename python/paddle/fluid/tests/unittests/auto_parallel/{auto_parallel_launch_demo.py => auto_parallel_relaunch_model.py} (97%) rename python/paddle/fluid/tests/unittests/auto_parallel/{test_auto_parallel_launch.py => test_auto_parallel_relaunch.py} (78%) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_launch_demo.py b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py similarity index 97% rename from python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_launch_demo.py rename to python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py index 77d5704579f93..8e5221ed5ffa6 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_launch_demo.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
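# --- Illustrative aside (not part of the patch above) ---------------------
# The relaunch unittest later in this patch assembles its launch command the
# same way for plain and coverage runs.  A sketch of that assembly; the two
# paths below are placeholders, the real test derives them from its own
# directory.
import os
import sys

cluster_json_path = "./auto_parallel_cluster.json"        # placeholder path
launch_model_path = "./auto_parallel_relaunch_model.py"   # placeholder path

if os.environ.get("WITH_COVERAGE", "OFF") == "ON":
    coverage_args = ["-m", "coverage", "run", "--branch", "-p"]
else:
    coverage_args = []

cmd = [sys.executable, "-u"] + coverage_args + [
    "-m", "launch", "--cluster_topo_path", cluster_json_path,
    "--enable_auto_mapping", "True", launch_model_path
]
# subprocess.Popen(cmd).wait() is expected to return 0 in the test
# ---------------------------------------------------------------------------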
@@ -155,9 +155,7 @@ def train(): exe.run(distributed_startup_program) for data in loader(): - loss_print = exe.run(distributed_main_program, - feed=data, - fetch_list=[loss]) + exe.run(distributed_main_program, feed=data, fetch_list=[loss]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_launch.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py similarity index 78% rename from python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_launch.py rename to python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py index 8cf548a0d1268..d4c982c68e469 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_launch.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py @@ -16,6 +16,7 @@ import os import sys import json +import shutil import subprocess from paddle.distributed.fleet.launch_utils import run_with_coverage @@ -77,17 +78,16 @@ """ -class TestAutoParallelLaunch(unittest.TestCase): - def test_launch(self): +class TestAutoParallelReLaunch(unittest.TestCase): + def test_relaunch(self): file_dir = os.path.dirname(os.path.abspath(__file__)) - cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") cluster_json_object = json.loads(cluster_json) with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) - launch_demo_path = os.path.join(file_dir, - "auto_parallel_launch_demo.py") + launch_model_path = os.path.join(file_dir, + "auto_parallel_relaunch_model.py") if os.environ.get("WITH_COVERAGE", "OFF") == "ON": run_with_coverage(True) @@ -97,12 +97,22 @@ def test_launch(self): cmd = [sys.executable, "-u"] + coverage_args + [ "-m", "launch", "--cluster_topo_path", cluster_json_path, - "--enable_auto_mapping", "True", launch_demo_path + "--enable_auto_mapping", "True", launch_model_path ] process = subprocess.Popen(cmd) process.wait() self.assertEqual(process.returncode, 0) - os.remove(cluster_json_path) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + rank_mapping_json_path = os.path.join(file_dir, + "auto_parallel_rank_mapping.json") + if os.path.exists(rank_mapping_json_path): + os.remove(rank_mapping_json_path) + log_path = os.path.join(file_dir, "log") + if os.path.exists(log_path): + shutil.rmtree(log_path) if __name__ == "__main__": From fd8ff31aaf8e2e616e5ccf3ba57d5780dfc31dc3 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Tue, 30 Nov 2021 08:45:06 +0000 Subject: [PATCH 22/30] Fix the unittest problem in CI --- .../fluid/tests/unittests/auto_parallel/CMakeLists.txt | 5 ----- .../fluid/tests/unittests/test_auto_parallel_mapper.py | 3 +++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index f666e4c261076..d6dd6251086f7 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -5,8 +5,3 @@ if(NOT WITH_NCCL) list(REMOVE_ITEM TEST_OPS test_auto_parallel_launch) list(APPEND DIST_TEST_OPS test_auto_parallel_launch) endif() - -# foreach(TEST_OP ${TEST_OPS}) -# py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -# set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 120) -# endforeach(TEST_OP) diff --git 
a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 7b60a9753bd6d..de37ac56bfbb6 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -52,6 +52,9 @@ from paddle.distributed.auto_parallel.mapper import get_dtype_bytes from paddle.distributed.auto_parallel.mapper import get_comm_volume +if os.getenv("CUDA_VISIBLE_DEVICES") is not None: + os.environ["CUDA_VISIBLE_DEVICES"] = "" + paddle.enable_static() _global_parallel_strategy = None _global_process_mesh = None From 35828ddb77399c5cb5e9c777c716011df3afa5b4 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Wed, 1 Dec 2021 02:04:19 +0000 Subject: [PATCH 23/30] Improve the unittest of relaunch --- .../paddle/distributed/auto_parallel/parallelizer.py | 11 +++++++++-- .../tests/unittests/auto_parallel/CMakeLists.txt | 8 +++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index d9c598f23844e..ccdcc36434e33 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -23,6 +23,7 @@ import paddle from paddle.distributed.utils import get_logger from paddle.distributed.fleet import cloud_utils +from paddle.distributed.fleet.launch_utils import run_with_coverage import paddle.fluid.core as core from .dist_context import DistributedContext from .dist_context import get_default_distributed_context @@ -137,8 +138,14 @@ def parallelize(self, original_cmd_args = os.getenv("PADDLE_ORIGINAL_CMD_ARGS") rank_mapping_args = " ".join( ["--rank_mapping_path", rank_mapping_path]) - new_cmd_args = "-u -m paddle.distributed.fleet.launch" + " " + rank_mapping_args + " " + original_cmd_args - new_cmd = [sys.executable] + shlex.split(new_cmd_args) + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + run_with_coverage(True) + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + new_cmd_args = "-m paddle.distributed.fleet.launch" + " " + rank_mapping_args + " " + original_cmd_args + new_cmd = [sys.executable, "-u"] + coverage_args + shlex.split( + new_cmd_args) print(new_cmd) new_process = subprocess.Popen(new_cmd) new_process.wait() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index d6dd6251086f7..219094d36fda6 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -1,7 +1,5 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -if(NOT WITH_NCCL) - list(REMOVE_ITEM TEST_OPS test_auto_parallel_launch) +# file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +# string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) list(APPEND DIST_TEST_OPS test_auto_parallel_launch) endif() From 8d4199cd27a9292275f4239e489bd30d1d28cc48 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Wed, 1 Dec 2021 03:28:56 +0000 Subject: [PATCH 24/30] Remove unnecessary statements --- python/paddle/distributed/auto_parallel/parallelizer.py | 2 -- python/paddle/distributed/fleet/launch_utils.py | 3 ++- .../paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt 
| 2 +- .../unittests/auto_parallel/test_auto_parallel_relaunch.py | 1 - 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index ccdcc36434e33..f8c0edf84d665 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -23,7 +23,6 @@ import paddle from paddle.distributed.utils import get_logger from paddle.distributed.fleet import cloud_utils -from paddle.distributed.fleet.launch_utils import run_with_coverage import paddle.fluid.core as core from .dist_context import DistributedContext from .dist_context import get_default_distributed_context @@ -139,7 +138,6 @@ def parallelize(self, rank_mapping_args = " ".join( ["--rank_mapping_path", rank_mapping_path]) if os.environ.get("WITH_COVERAGE", "OFF") == "ON": - run_with_coverage(True) coverage_args = ["-m", "coverage", "run", "--branch", "-p"] else: coverage_args = [] diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 04aa3b8db19b2..1764e0b2cbf1a 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -542,7 +542,8 @@ def start_local_trainers(cluster, current_env.update(proc_env) coverage_args = [] - if run_with_coverage(): + if run_with_coverage() or os.environ.get("WITH_COVERAGE", + "OFF") == "ON": coverage_args = ["-m", "coverage", "run", "--branch", "-p"] cmd = [sys.executable, "-u"] + coverage_args + [training_script ] + training_script_args diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 219094d36fda6..58fc9ebb339d5 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -1,5 +1,5 @@ # file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") # string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) - list(APPEND DIST_TEST_OPS test_auto_parallel_launch) + list(APPEND DIST_TEST_OPS test_auto_parallel_relaunch) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py index d4c982c68e469..321b262286218 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py @@ -90,7 +90,6 @@ def test_relaunch(self): "auto_parallel_relaunch_model.py") if os.environ.get("WITH_COVERAGE", "OFF") == "ON": - run_with_coverage(True) coverage_args = ["-m", "coverage", "run", "--branch", "-p"] else: coverage_args = [] From d2e37379a394e8abf73058780e4a6b65836cad47 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Wed, 1 Dec 2021 13:13:06 +0000 Subject: [PATCH 25/30] Update the unittest cmakefile --- .../paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 58fc9ebb339d5..2ec5800489a1c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -1,5 +1,5 @@ # 
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") # string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) +if(WITH_DISTRIBUTE AND WITH_GPU) list(APPEND DIST_TEST_OPS test_auto_parallel_relaunch) endif() From 3aef5c5b649498fb75cbd47011829aafae353cf3 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Fri, 3 Dec 2021 02:29:56 +0000 Subject: [PATCH 26/30] Correct the cmakefile of auto parallel unittests --- .../paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 2ec5800489a1c..b1d811889dcda 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -1,5 +1,5 @@ # file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") # string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") if(WITH_DISTRIBUTE AND WITH_GPU) - list(APPEND DIST_TEST_OPS test_auto_parallel_relaunch) + py_test_modules(test_auto_parallel_relaunch MODULES test_auto_parallel_relaunch ENVS ${dist_ENVS}) endif() From e74622452e110e466ea1b375d90c9625435cdfd8 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Fri, 3 Dec 2021 09:09:02 +0000 Subject: [PATCH 27/30] Modify codes based on the new elastic change --- .../distributed/auto_parallel/parallelizer.py | 22 ++++++---- python/paddle/distributed/fleet/launch.py | 20 +++++++-- .../paddle/distributed/fleet/launch_utils.py | 42 ++++++++++++++----- 3 files changed, 63 insertions(+), 21 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index f8c0edf84d665..affb27317daaf 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -68,6 +68,9 @@ def __init__(self, fleet): self._enable_auto_mapping = False else: self._enable_auto_mapping = True + self._need_rank_mapping = os.getenv("PADDLE_NEED_RANK_MAPPING") + self._need_rank_mapping = True if self._need_rank_mapping and \ + self._need_rank_mapping.lower() == 'true' else False def _remove_distributed_attrs(self, main_program): suffix = core.kAutoParallelSuffix() @@ -112,7 +115,7 @@ def parallelize(self, self._parameter_list = parameter_list self._no_grad_set = no_grad_set - if self._enable_auto_mapping and self._rank_mapping_path is None: + if self._enable_auto_mapping and self._need_rank_mapping: # Do the mapping pass before parallelization assert self._cluster is not None, \ "The cluster must not be none when using auto mapping." @@ -129,14 +132,20 @@ def parallelize(self, rank_mapping = list(rank_mapping_dict.values()) # Relaunch the training by using the rank mapping file - cwd = pathlib.Path().resolve() - rank_mapping_path = os.path.join(cwd, - "auto_parallel_rank_mapping.json") - with open(rank_mapping_path, "w") as rank_mapping_file: + with open(self._rank_mapping_path, "w") as rank_mapping_file: json.dump(rank_mapping, rank_mapping_file) + + enable_elastic = os.getenv("PADDLE_ENABLE_ELASTIC") + enable_elastic = True if enable_elastic and enable_elastic.lower( + ) == 'true' else False + if enable_elastic: + print("Auto mapping finished, now do elastic re-launch") + sys.exit(paddle.distributed.fleet.elastic.manager. 
+ ELASTIC_AUTO_PARALLEL_EXIT_CODE) + original_cmd_args = os.getenv("PADDLE_ORIGINAL_CMD_ARGS") rank_mapping_args = " ".join( - ["--rank_mapping_path", rank_mapping_path]) + ["--rank_mapping_path", self._rank_mapping_path]) if os.environ.get("WITH_COVERAGE", "OFF") == "ON": coverage_args = ["-m", "coverage", "run", "--branch", "-p"] else: @@ -144,7 +153,6 @@ def parallelize(self, new_cmd_args = "-m paddle.distributed.fleet.launch" + " " + rank_mapping_args + " " + original_cmd_args new_cmd = [sys.executable, "-u"] + coverage_args + shlex.split( new_cmd_args) - print(new_cmd) new_process = subprocess.Popen(new_cmd) new_process.wait() assert new_process.returncode == 0, \ diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index b66a48fe92754..eabb5f7e96e4a 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -65,7 +65,6 @@ import time import six import copy -import shlex import pathlib import argparse from argparse import ArgumentParser, REMAINDER @@ -301,8 +300,17 @@ def get_cluster_info(args): if args.enable_auto_mapping == True: assert args.cluster_topo_path is not None, \ "The cluster topology must be provied when enabling auto mapping." - if args.rank_mapping_path is None: - # original_args = [shlex.quote(c) for c in sys.argv[1:]] + rank_mapping_path = args.rank_mapping_path or os.getenv( + "PADDLE_RANK_MAPPING_PATH") + if not rank_mapping_path: + os.environ["PADDLE_NEED_RANK_MAPPING"] = str(True) + os.environ["PADDLE_ENABLE_ELASTIC"] = str( + enable_elastic(args, device_mode)) + cwd = pathlib.Path().resolve() + rank_mapping_path = os.path.join(cwd, + "auto_parallel_rank_mapping.json") + os.environ["PADDLE_RANK_MAPPING_PATH"] = str(rank_mapping_path) + original_args = sys.argv[1:] os.environ["PADDLE_ORIGINAL_CMD_ARGS"] = " ".join(original_args) os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(args.cluster_topo_path) @@ -311,8 +319,12 @@ def get_cluster_info(args): cluster, pod = launch_utils.get_mapped_cluster_from_args_without_rank_mapping( args, device_mode) else: + os.environ["PADDLE_NEED_RANK_MAPPING"] = str(False) + os.environ["PADDLE_ENABLE_ELASTIC"] = str( + enable_elastic(args, device_mode)) + os.environ["PADDLE_CLUSTER_TOPO_PATH"] = str(args.cluster_topo_path) - os.environ["PADDLE_RANK_MAPPING_PATH"] = str(args.rank_mapping_path) + os.environ["PADDLE_RANK_MAPPING_PATH"] = str(rank_mapping_path) os.environ["PADDLE_ENABLE_AUTO_MAPPING"] = str( args.enable_auto_mapping) cluster, pod = launch_utils.get_mapped_cluster_from_args_with_rank_mapping( diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 8f290aee30953..6bbdd0175bc36 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -570,7 +570,7 @@ def start_local_trainers(cluster, f.write("PADDLE_TRAINER_ENDPOINTS: \n") f.write("\n".join(cluster.trainers_endpoints())) if current_env.get("PADDLE_ENABLE_AUTO_MAPPING") is not None \ - and current_env.get("PADDLE_RANK_MAPPING_PATH", None) is None: + and current_env.get("PADDLE_NEED_RANK_MAPPING").lower() == "true": fn = open("%s/prelaunchlog.%d" % (log_dir, idx), "a") else: fn = open("%s/workerlog.%d" % (log_dir, idx), "a") @@ -907,7 +907,14 @@ def get_mapped_cluster_from_args_without_rank_mapping(args, device_mode): trainer_endpoints = [] for ip in node_ips: node_rank = node_ips.index(ip) - if os.environ.get('FLAGS_START_PORT') is not None: + if 
os.environ.get('PADDLE_PORT') is not None: + start_port = int(os.getenv("PADDLE_PORT", "")) + free_ports = [ + x + for x in range(start_port, start_port + len(node_ranks[ + node_rank])) + ] + elif os.environ.get('FLAGS_START_PORT') is not None: start_port = int(os.environ.get('FLAGS_START_PORT')) free_ports = [ x @@ -975,9 +982,13 @@ def get_mapped_cluster_from_args_with_rank_mapping(args, device_mode): gpus_num = fluid.core.get_cuda_device_count() # parse ip-ranks json file + rank_mapping_path = args.rank_mapping_path or os.getenv( + "PADDLE_RANK_MAPPING_PATH") rank_mapping = None - with open(args.rank_mapping_path, "r") as json_file: + with open(rank_mapping_path, "r") as json_file: rank_mapping = json.load(json_file) + # reset PADDLE_RANK_MAPPING_PATH env + os.environ["PADDLE_RANK_MAPPING_PATH"] = "" node_ips = [] node_ranks = [] @@ -1017,7 +1028,14 @@ def get_mapped_cluster_from_args_with_rank_mapping(args, device_mode): trainer_endpoints = [] for ip in node_ips: node_rank = node_ips.index(ip) - if os.environ.get('FLAGS_START_PORT') is not None: + if os.environ.get('PADDLE_PORT') is not None: + start_port = int(os.getenv("PADDLE_PORT", "")) + free_ports = [ + x + for x in range(start_port, start_port + len(node_ranks[ + node_rank])) + ] + elif os.environ.get('FLAGS_START_PORT') is not None: start_port = int(os.environ.get('FLAGS_START_PORT')) free_ports = [ x @@ -1298,14 +1316,18 @@ def get_role_endpoints(self, args): _, self.current_node_ip = get_host_name_ip() else: self.current_node_ip = pod_ip - assert self.current_node_ip in self.node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ - % (self.current_node_ip, self.node_ips) - self.node_rank = self.node_ips.index(self.current_node_ip) - logger.debug( - "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}". - format(self.node_ips, self.current_node_ip, self.node_rank)) + if not self.distribute_mode == DistributeMode.PS_HETER: + assert self.current_node_ip in self.node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ + % (self.current_node_ip, self.node_ips) + if self.current_node_ip in self.node_ips: + self.node_rank = self.node_ips.index(self.current_node_ip) + logger.debug( + "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}". 
+ format(self.node_ips, self.current_node_ip, self.node_rank)) def start_ps(self): + if not self.current_node_ip in self.node_ips: + return cluster = Cluster(hdfs=None) server_rank = 0 worker_rank = 0 From ce25444a006e71ac924953d2039852954b7940f3 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Fri, 3 Dec 2021 11:18:49 +0000 Subject: [PATCH 28/30] Use the GPUs exclusively in the unittest --- .../paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index b1d811889dcda..8b9af06bb2b86 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -1,5 +1,5 @@ # file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") # string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") if(WITH_DISTRIBUTE AND WITH_GPU) - py_test_modules(test_auto_parallel_relaunch MODULES test_auto_parallel_relaunch ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_relaunch MODULES test_auto_parallel_relaunch LABELS "RUN_TYPE=EXCLUSIVE" ENVS ${dist_ENVS}) endif() From 8706b242d5f950f56081ffebe389bdbb9ef537c2 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Fri, 3 Dec 2021 12:50:00 +0000 Subject: [PATCH 29/30] Correct the cmakefile --- .../paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 8b9af06bb2b86..16f0ed31051a9 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -1,5 +1,6 @@ # file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") # string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") if(WITH_DISTRIBUTE AND WITH_GPU) - py_test_modules(test_auto_parallel_relaunch MODULES test_auto_parallel_relaunch LABELS "RUN_TYPE=EXCLUSIVE" ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_relaunch MODULES test_auto_parallel_relaunch ENVS ${dist_ENVS}) + set_tests_properties(test_auto_parallel_relaunch PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") endif() From 9a23b7f068e340b7bdc496219c731570d866e0d5 Mon Sep 17 00:00:00 2001 From: Ao Yulong Date: Sat, 4 Dec 2021 11:38:52 +0000 Subject: [PATCH 30/30] Set the timeout of the unittest --- .../paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 16f0ed31051a9..4244fda0c51d9 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -2,5 +2,5 @@ # string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_auto_parallel_relaunch MODULES test_auto_parallel_relaunch ENVS ${dist_ENVS}) - set_tests_properties(test_auto_parallel_relaunch PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") + set_tests_properties(test_auto_parallel_relaunch PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) endif()
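One detail from [PATCH 27/30] worth spelling out: when PADDLE_PORT (or, failing that, FLAGS_START_PORT) supplies a base port, the get_mapped_cluster_from_args_* helpers in launch_utils.py now give each node one consecutive port per local rank. Below is a minimal sketch of that derivation, not the real function; the base port and rank count are made up, and the error branch is this sketch's own fallback rather than Paddle's behavior.

import os

def derive_free_ports(local_rank_count):
    # Prefer PADDLE_PORT, then FLAGS_START_PORT, one port per local rank
    if os.environ.get("PADDLE_PORT") is not None:
        start_port = int(os.environ["PADDLE_PORT"])
    elif os.environ.get("FLAGS_START_PORT") is not None:
        start_port = int(os.environ["FLAGS_START_PORT"])
    else:
        raise RuntimeError("no base port configured in this sketch")
    return list(range(start_port, start_port + local_rank_count))

os.environ["PADDLE_PORT"] = "6170"                  # made-up base port
assert derive_free_ports(4) == [6170, 6171, 6172, 6173]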