In multi-threaded mode, applying two embeddings to the same input crashes the program #9200

Closed
jshower opened this issue Mar 19, 2018 · 3 comments
Labels
User 用于标记用户问题 (used to mark user issues)

jshower (Contributor) commented Mar 19, 2018

When training with the following network structure in multi-threaded mode, training crashes (it does not in single-threaded mode). The error log is attached at the end.

import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def ner_net(word_dict_len, label_dict_len, parallel):
    IS_SPARSE = True
    #embedding_name = 'emb'
    #word_dict_len = 1942562
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36
    #label_dict_len = 49
    
    def _net_conf(word, mention, target):
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))

        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))

        word_embedding_r = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))

        mention_embedding_r = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))

        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)

        word_mention_vector_r = fluid.layers.concat(
            input=[word_embedding_r, mention_embedding_r], axis=1)
    
        pre_gru = fluid.layers.fc(input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(input=pre_gru,
            size=grnn_hidden,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))

        pre_gru_r = fluid.layers.fc(input=word_mention_vector_r,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))

        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)

        emission = fluid.layers.fc(
            size=label_dict_len,
            input=gru_merged,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))

        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(
                name='crfw',
                learning_rate=0.2,
                #regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)
        ))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)

    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)

    return avg_cost, emission, word, mention, target

If the above program is adjusted slightly, removing the two duplicate embeddings, the error no longer occurs. I would like to ask what the reason is. This feels like an unfriendly spot for users, because applying several different embeddings to the same input does happen in practice.

import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def ner_net(word_dict_len, label_dict_len, parallel):
    IS_SPARSE = True
    #embedding_name = 'emb'
    #word_dict_len = 1942562
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36
    #label_dict_len = 49
    
    def _net_conf(word, mention, target):
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="word_emb_f", learning_rate=5))

        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="men_emb_r", learning_rate=5))
        '''        
        word_embedding_r = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="word_emb_f", learning_rate=5))

        mention_embedding_r = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="men_emb_r", learning_rate=5))
        '''
        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)
        '''
        word_mention_vector_r = fluid.layers.concat(
            input=[word_embedding_r, mention_embedding_r], axis=1)
        '''    
        pre_gru = fluid.layers.fc(input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(input=pre_gru,
            size=grnn_hidden,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))

        pre_gru_r = fluid.layers.fc(input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))

        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)

        emission = fluid.layers.fc(
            size=label_dict_len,
            input=gru_merged,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))

        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(
                name='crfw',
                learning_rate=0.2,
                #regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)
        ))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)

    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)

    return avg_cost, emission, word, mention, target

The error log:

F0319 08:04:48.310140  3321 threadpool.h:96] The exception is thrown inside the thread pool. You should use RunAndGetException to handle the exception.
The default exception handler is LOG(FATAL).enforce dtype != -1 failed, -1 == -1
Sum operator should have at least one tensor at [/paddle_gpu/Paddle/paddle/fluid/operators/sum_op.cc:73]
PaddlePaddle Call Stacks: 
0       0x7f2c50a1c48cp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572
1       0x7f2c514f6138p paddle::operators::SumOp::GetExpectedKernelType(paddle::framework::ExecutionContext const&) const + 1912
2       0x7f2c5161682dp paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 381
3       0x7f2c50acd4a5p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool) + 1781
4       0x7f2c50acea5fp paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 63
5       0x7f2c513f38b3p std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > >, std::__future_base::_Result_base::_Deleter>, std::_Bind_simple<std::reference_wrapper<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}> ()>, std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > >::_M_invoke(std::_Any_data const&) + 99
6       0x7f2c513f058ep std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) + 46
7       0x7f2c8a954a99p
8       0x7f2c513f0bd2p std::__future_base::_State_baseV2::_M_set_result(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>, bool) + 146
9       0x7f2c513f0d46p std::__future_base::_Task_state<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<std::unique_ptr> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}, std::allocator<int>, std::default_delete<std::unique_ptr> ()>::_M_run() + 86
10      0x7f2c516315d4p paddle::framework::ThreadPool::TaskLoop() + 1012
11      0x7f2c7eab8c80p
12      0x7f2c8a94d6bap
13      0x7f2c8a68341dp clone + 109
*** Check failure stack trace: ***
    @     0x7f2c517decad  google::LogMessage::Fail()
    @     0x7f2c517e0ff8  google::LogMessage::SendToLog()
    @     0x7f2c517de7bb  google::LogMessage::Flush()
    @     0x7f2c517e1ece  google::LogMessageFatal::~LogMessageFatal()
    @     0x7f2c513f1847  std::_Function_handler<>::_M_invoke()
    @     0x7f2c513f058e  std::__future_base::_State_baseV2::_M_do_set()
    @     0x7f2c8a954a99  __pthread_once_slow
    @     0x7f2c513f0bd2  std::__future_base::_State_baseV2::_M_set_result()
    @     0x7f2c513f0c91  std::__future_base::_Deferred_state<>::_M_complete_async()
    @     0x7f2c513fa32a  paddle::operators::ParallelDoGradOp::RunImpl()
    @     0x7f2c50acd4a5  paddle::framework::Executor::RunPreparedContext()
    @     0x7f2c50acea5f  paddle::framework::Executor::Run()
    @     0x7f2c50a38fc3  _ZZN8pybind1112cpp_function10initializeIZNS0_C4IvN6paddle9framework8ExecutorEIRKNS4_11ProgramDescEPNS4_5ScopeEibbEINS_4nameENS_9is_methodENS_7siblingEEEEMT0_FT_DpT1_EDpRKT2_EUlPS5_S8_SA_ibbE_vISO_S8_SA_ibbEISB_SC_SD_EEEvOSF_PFSE_SH_ESN_ENUlRNS_6detail13function_callEE1_4_FUNESV_
    @     0x7f2c50a36d04  pybind11::cpp_function::dispatcher()
    @           0x4c37ed  PyEval_EvalFrameEx
    @           0x4b9ab6  PyEval_EvalCodeEx
    @           0x4c16e7  PyEval_EvalFrameEx
    @           0x4b9ab6  PyEval_EvalCodeEx
    @           0x4c1e6f  PyEval_EvalFrameEx
    @           0x4b9ab6  PyEval_EvalCodeEx
    @           0x4eb30f  (unknown)
    @           0x4e5422  PyRun_FileExFlags
    @           0x4e3cd6  PyRun_SimpleFileExFlags
    @           0x493ae2  Py_Main
    @     0x7f2c8a59c830  __libc_start_main
    @           0x4933e9  _start
    @              (nil)  (unknown)
Aborted

If you need the complete environment to reproduce the problem, you can contact me on Hi (jiaozhenyu).
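
For reference, the working variant above boils down to performing each embedding lookup once and feeding the single result into both the forward and the reverse branch. A condensed sketch of that pattern (a sketch only; the function name and the 'word_emb' parameter name are illustrative, not from the original program):

import paddle.fluid as fluid

def shared_lookup_branches(word, word_dict_len, word_dim=32, grnn_hidden=36):
    # One embedding op reads `word`; both GRU directions consume the same
    # output tensor, matching the adjusted program above that trains
    # without crashing.
    word_embedding = fluid.layers.embedding(
        input=word,
        size=[word_dict_len, word_dim],
        dtype='float32',
        is_sparse=True,
        param_attr=fluid.ParamAttr(name='word_emb', learning_rate=5))

    pre_gru = fluid.layers.fc(input=word_embedding, size=grnn_hidden * 3)
    gru = fluid.layers.dynamic_gru(input=pre_gru, size=grnn_hidden)

    pre_gru_r = fluid.layers.fc(input=word_embedding, size=grnn_hidden * 3)
    gru_r = fluid.layers.dynamic_gru(
        input=pre_gru_r, size=grnn_hidden, is_reverse=True)

    return fluid.layers.concat(input=[gru, gru_r], axis=1)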

Yancey1989 added the User 用于标记用户问题 label Mar 19, 2018
Yancey1989 (Contributor)

Hi @tonyyang-svail, I know you're the main contributor to ParallelDo, could you please take a look at this issue?

tonyyang-svail

@jshower Currently this feature is not supported by ParallelDo. We will support it in ParallelExecutor (#9080).
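
For later readers, a minimal sketch of the suggested ParallelExecutor route, assuming the fluid.ParallelExecutor API that landed after this issue (the exact run()/feed signature varied across early releases; train_reader is a placeholder, and word_dict_len/label_dict_len must be defined as in the original program):

import paddle.fluid as fluid

# Build the program single-device style; no ParallelDo block is needed.
avg_cost, emission, word, mention, target = ner_net(
    word_dict_len, label_dict_len, parallel=False)
fluid.optimizer.SGD(learning_rate=1e-3).minimize(avg_cost)

place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# ParallelExecutor replicates the program across the visible devices and
# aggregates gradients itself, replacing the explicit pd.do() block.
pe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
feeder = fluid.DataFeeder(feed_list=[word, mention, target], place=place)

for data in train_reader():
    loss, = pe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))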

Yancey1989 (Contributor)

Closing this inactive issue; please feel free to reopen it.
