When training the network below with multiple threads, training crashes (single-threaded training does not). The error log is pasted at the end.
```python
import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def ner_net(word_dict_len, label_dict_len, parallel):
    """Bi-directional GRU + CRF network for NER."""
    IS_SPARSE = True
    # word_dict_len = 1942562 and label_dict_len = 49 in the failing run.
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36

    def _net_conf(word, mention, target):
        # Each input is embedded twice, once for the forward GRU and once
        # for the reverse GRU. No parameter names are given, so the four
        # lookup tables are all distinct.
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        word_embedding_r = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        mention_embedding_r = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(learning_rate=5))
        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)
        word_mention_vector_r = fluid.layers.concat(
            input=[word_embedding_r, mention_embedding_r], axis=1)
        pre_gru = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        pre_gru_r = fluid.layers.fc(
            input=word_mention_vector_r,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)
        emission = fluid.layers.fc(
            size=label_dict_len,
            input=gru_merged,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(name='crfw', learning_rate=0.2))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(
        name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(
        name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)

    if parallel:
        # Replicate the network on every available device with ParallelDo.
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)
    return avg_cost, emission, word, mention, target
```
If the program above is adjusted slightly by removing the two duplicate embeddings, the error no longer occurs. I'd like to understand the cause. This seems user-unfriendly, since applying several different embeddings to the same input is a real use case.
```python
import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def ner_net(word_dict_len, label_dict_len, parallel):
    """Same network, but with a single (named) embedding per input."""
    IS_SPARSE = True
    word_dim = 32
    mention_dict_len = 57
    mention_dim = 20
    grnn_hidden = 36

    def _net_conf(word, mention, target):
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="word_emb_f", learning_rate=5))
        mention_embedding = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="men_emb_r", learning_rate=5))
        # The second pair of embeddings is disabled; their parameter names
        # match the ones above, so re-enabling them would share the tables.
        '''
        word_embedding_r = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="word_emb_f", learning_rate=5))
        mention_embedding_r = fluid.layers.embedding(
            input=mention,
            size=[mention_dict_len, mention_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name="men_emb_r", learning_rate=5))
        '''
        word_mention_vector = fluid.layers.concat(
            input=[word_embedding, mention_embedding], axis=1)
        '''
        word_mention_vector_r = fluid.layers.concat(
            input=[word_embedding_r, mention_embedding_r], axis=1)
        '''
        pre_gru = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        # Both GRU directions now read the same concatenated vector.
        pre_gru_r = fluid.layers.fc(
            input=word_mention_vector,
            size=grnn_hidden * 3,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_merged = fluid.layers.concat(input=[gru, gru_r], axis=1)
        emission = fluid.layers.fc(
            size=label_dict_len,
            input=gru_merged,
            param_attr=fluid.ParamAttr(
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(name='crfw', learning_rate=0.2))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, emission

    word = fluid.layers.data(
        name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(
        name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)

    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            word_ = pd.read_input(word)
            mention_ = pd.read_input(mention)
            target_ = pd.read_input(target)
            avg_cost, emission_base = _net_conf(word_, mention_, target_)
            pd.write_output(avg_cost)
            pd.write_output(emission_base)
        avg_cost_list, emission = pd()
        avg_cost = fluid.layers.mean(x=avg_cost_list)
        emission.stop_gradient = True
    else:
        avg_cost, emission = _net_conf(word, mention, target)
    return avg_cost, emission, word, mention, target
```
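To spell out the difference: the failing version looks up the same input through two separate embedding tables inside the ParallelDo block, while the working version keeps a single lookup per input. A minimal sketch of the two patterns (the variable names here are illustrative, not from the script above):

```python
# Sketch only; `word` is a lod_level=1 int64 data layer as in the script above.

# Pattern that crashes under ParallelDo: two embeddings of the same input,
# each with an auto-generated parameter name, i.e. two distinct tables.
emb_fwd = fluid.layers.embedding(
    input=word, size=[word_dict_len, word_dim],
    dtype='float32', is_sparse=True)
emb_rev = fluid.layers.embedding(
    input=word, size=[word_dict_len, word_dim],
    dtype='float32', is_sparse=True)

# Workaround: a single table. Passing the same named fluid.ParamAttr to both
# calls shares one table between the two lookups instead of creating two.
shared = fluid.ParamAttr(name='word_emb_f', learning_rate=5)
emb_fwd = fluid.layers.embedding(
    input=word, size=[word_dict_len, word_dim],
    dtype='float32', is_sparse=True, param_attr=shared)
emb_rev = fluid.layers.embedding(
    input=word, size=[word_dict_len, word_dim],
    dtype='float32', is_sparse=True, param_attr=shared)
```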
The error log from the multi-threaded run:

```
F0319 08:04:48.310140 3321 threadpool.h:96] The exception is thrown inside the thread pool. You should use RunAndGetException to handle the exception. The default exception handler is LOG(FATAL).
enforce dtype != -1 failed, -1 == -1
Sum operator should have at least one tensor at [/paddle_gpu/Paddle/paddle/fluid/operators/sum_op.cc:73]
PaddlePaddle Call Stacks:
0 0x7f2c50a1c48cp paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 572
1 0x7f2c514f6138p paddle::operators::SumOp::GetExpectedKernelType(paddle::framework::ExecutionContext const&) const + 1912
2 0x7f2c5161682dp paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const + 381
3 0x7f2c50acd4a5p paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool) + 1781
4 0x7f2c50acea5fp paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool) + 63
5 0x7f2c513f38b3p std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > >, std::__future_base::_Result_base::_Deleter>, std::_Bind_simple<std::reference_wrapper<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}> ()>, std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<paddle::platform::EnforceNotMet> > > >::_M_invoke(std::_Any_data const&) + 99
6 0x7f2c513f058ep std::__future_base::_State_baseV2::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>*, bool*) + 46
7 0x7f2c8a954a99p
8 0x7f2c513f0bd2p std::__future_base::_State_baseV2::_M_set_result(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>, bool) + 146
9 0x7f2c513f0d46p std::__future_base::_Task_state<std::future<std::unique_ptr<paddle::platform::EnforceNotMet, std::default_delete<std::unique_ptr> > > paddle::framework::ThreadPool::RunAndGetException<paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1}>(paddle::operators::ParallelDoGradOp::RunImpl(paddle::framework::Scope const&, boost::variant<paddle::platform::CUDAPlace, paddle::platform::CPUPlace, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_, boost::detail::variant::void_> const&) const::{lambda()#1})::{lambda()#1}, std::allocator<int>, std::default_delete<std::unique_ptr> ()>::_M_run() + 86
10 0x7f2c516315d4p paddle::framework::ThreadPool::TaskLoop() + 1012
11 0x7f2c7eab8c80p
12 0x7f2c8a94d6bap
13 0x7f2c8a68341dp clone + 109
*** Check failure stack trace: ***
@ 0x7f2c517decad google::LogMessage::Fail()
@ 0x7f2c517e0ff8 google::LogMessage::SendToLog()
@ 0x7f2c517de7bb google::LogMessage::Flush()
@ 0x7f2c517e1ece google::LogMessageFatal::~LogMessageFatal()
@ 0x7f2c513f1847 std::_Function_handler<>::_M_invoke()
@ 0x7f2c513f058e std::__future_base::_State_baseV2::_M_do_set()
@ 0x7f2c8a954a99 __pthread_once_slow
@ 0x7f2c513f0bd2 std::__future_base::_State_baseV2::_M_set_result()
@ 0x7f2c513f0c91 std::__future_base::_Deferred_state<>::_M_complete_async()
@ 0x7f2c513fa32a paddle::operators::ParallelDoGradOp::RunImpl()
@ 0x7f2c50acd4a5 paddle::framework::Executor::RunPreparedContext()
@ 0x7f2c50acea5f paddle::framework::Executor::Run()
@ 0x7f2c50a38fc3 _ZZN8pybind1112cpp_function10initializeIZNS0_C4IvN6paddle9framework8ExecutorEIRKNS4_11ProgramDescEPNS4_5ScopeEibbEINS_4nameENS_9is_methodENS_7siblingEEEEMT0_FT_DpT1_EDpRKT2_EUlPS5_S8_SA_ibbE_vISO_S8_SA_ibbEISB_SC_SD_EEEvOSF_PFSE_SH_ESN_ENUlRNS_6detail13function_callEE1_4_FUNESV_
@ 0x7f2c50a36d04 pybind11::cpp_function::dispatcher()
@ 0x4c37ed PyEval_EvalFrameEx
@ 0x4b9ab6 PyEval_EvalCodeEx
@ 0x4c16e7 PyEval_EvalFrameEx
@ 0x4b9ab6 PyEval_EvalCodeEx
@ 0x4c1e6f PyEval_EvalFrameEx
@ 0x4b9ab6 PyEval_EvalCodeEx
@ 0x4eb30f (unknown)
@ 0x4e5422 PyRun_FileExFlags
@ 0x4e3cd6 PyRun_SimpleFileExFlags
@ 0x493ae2 Py_Main
@ 0x7f2c8a59c830 __libc_start_main
@ 0x4933e9 _start
@ (nil) (unknown)
Aborted
```
If you need the complete environment to reproduce the problem, you can reach me on Hi (jiaozhenyu).
Hi @tonyyang-svail, I know you're the main contributor to ParallelDo; could you please take a look at this issue?
@jshower Currently this feature is not supported by ParallelDo. We will support it in ParallelExecutor (#9080).
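Until that lands, here is a rough sketch of what a ParallelExecutor-based replacement for the ParallelDo branch could look like. This is untested against this exact model; the optimizer choice and the feeding code are placeholders:

```python
# Sketch, not a verified fix: build the network single-device
# (parallel=False) and let ParallelExecutor replicate it across GPUs.
avg_cost, emission, word, mention, target = ner_net(
    word_dict_len, label_dict_len, parallel=False)
fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)  # placeholder optimizer

place = fluid.CUDAPlace(0)
fluid.Executor(place).run(fluid.default_startup_program())

pe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
# Run as with a normal Executor; gradients are aggregated across devices:
# loss, = pe.run(feed=feeder.feed(batch), fetch_list=[avg_cost.name])
```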
Closing this inactive issue; please feel free to reopen it.