Skip to content

Commit

Permalink
Revert "[XPU] fix the dataloader problem in RDMA env (#54150)" (#55150)
Browse files Browse the repository at this point in the history
This reverts commit 15c8752.
  • Loading branch information
XiaociZhang authored Jul 6, 2023
1 parent 96ee44b commit 86694ce
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 16 deletions.
3 changes: 2 additions & 1 deletion cmake/external/xpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ ExternalProject_Add(
${CMAKE_SOURCE_DIR}/tools/xpu/pack_paddle_depence.sh ${XPU_XRE_URL}
${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL}
${XPU_XCCL_DIR_NAME} && wget ${XPU_XFT_GET_DEPENCE_URL} && bash
get_xft_dependence.sh ${XPU_XFT_URL} ${XPU_XFT_DIR_NAME} && bash
get_xft_dependence.sh ${XPU_XFT_URL} ${XPU_XFT_DIR_NAME} &&
WITH_XPTI=${WITH_XPTI} bash
${CMAKE_SOURCE_DIR}/tools/xpu/get_xpti_dependence.sh ${XPU_XPTI_URL}
${XPU_XPTI_DIR_NAME}
DOWNLOAD_NO_PROGRESS 1
Expand Down
16 changes: 1 addition & 15 deletions python/paddle/io/dataloader/dataloader_iter.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,21 +427,7 @@ def __init__(self, loader):
self._shutdown = False

def _init_workers(self):
# NOTE(zhangxiaoci): When training in an XPU multi-node RDMA environment, an unexpected
# segmentation fault is raised in the dataloader process, and the traceback goes all the
# way back to a runtime error reporting that dataloader workers exited unexpectedly.
# Similar problems have been discussed in which OpenCV misbehaves in a multiprocessing
# environment. A possible solution is to change the multiprocessing start method from
# the default 'fork' to 'spawn'. See https://stackoverflow.com/questions/54013846 for details.
# NOTE(zhangxiaoci): Replace multiprocessing with multiprocess since in some training
# environments the former will raise 'AttributeError: Can't pickle local object xxx',
# which is a side effect of changing the default start method.
if paddle.is_compiled_with_xpu():
import multiprocess as multiprocessing

multiprocessing.set_start_method('spawn', force=True)
else:
from paddle.incubate import multiprocessing
from paddle.incubate import multiprocessing

# the multiprocess worker list and the indices queue list both start out empty
self._workers = []
Expand Down

0 comments on commit 86694ce

Please sign in to comment.