szha opened a new issue #18099: Error in test_contrib_amp.py::test_fp16_casting URL: https://github.com/apache/incubator-mxnet/issues/18099 ## Description As part of #18025 I added a waitall() in between test modules. This revealed the following error in unix-gpu pipeline which seems to be related to pointwise fusion. http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/mxnet-validation%2Funix-gpu/detail/PR-18025/30/pipeline ``` ==================================== ERRORS ==================================== ____________________ ERROR at teardown of test_fp16_casting ____________________ def teardown_module(): """ A function with a 'magic name' executed automatically after each pytest test module. It waits for all operations in one file to finish before carrying on the next. """ > mx.nd.waitall() tests/python/unittest/common.py:310: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ python/mxnet/ndarray/ndarray.py:211: in waitall check_call(_LIB.MXNDArrayWaitAll()) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ret = -1 def check_call(ret): """Check the return value of C API call. This function will raise an exception when an error occurs. Wrap every API call with this function. Parameters ---------- ret : int return value from API calls. """ if ret != 0: > raise get_last_ffi_error() E mxnet.base.MXNetError: Traceback (most recent call last): E [bt] (9) /work/mxnet/python/mxnet/../../build/libmxnet.so(std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::function<void (std::shared_ptr<dmlc::ManualEvent>)>, std::shared_ptr<dmlc::ManualEvent> > > >::_M_run()+0x4a) [0x7f1ec6cdd4da] E [bt] (8) /work/mxnet/python/mxnet/../../build/libmxnet.so(std::_Function_handler<void (std::shared_ptr<dmlc::ManualEvent>), mxnet::engine::ThreadedEnginePerDevice::PushToExecute(mxnet::engine::OprBlock*, bool)::{lambda()#4}::operator()() const::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data const&, std::shared_ptr<dmlc::ManualEvent>&&)+0x4e) [0x7f1ec6ce17fe] E [bt] (7) /work/mxnet/python/mxnet/../../build/libmxnet.so(void mxnet::engine::ThreadedEnginePerDevice::GPUWorker<(dmlc::ConcurrentQueueType)0>(mxnet::Context, bool, mxnet::engine::ThreadedEnginePerDevice::ThreadWorkerBlock<(dmlc::ConcurrentQueueType)0>*, std::shared_ptr<dmlc::ManualEvent> const&)+0x11d) [0x7f1ec6ce151d] E [bt] (6) /work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext, mxnet::engine::OprBlock*)+0x121) [0x7f1ec6cde491] E [bt] (5) /work/mxnet/python/mxnet/../../build/libmxnet.so(+0x20828ee) [0x7f1ec6cd38ee] E [bt] (4) /work/mxnet/python/mxnet/../../build/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x17) [0x7f1ec6daa027] E [bt] (3) /work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x934) [0x7f1ec6da9bd4] E [bt] (2) /work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::op::TVMBinaryBroadcastScalarCompute::operator()(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x5b8) [0x7f1ec7f2de28] E [bt] (1) /work/mxnet/python/mxnet/../../build/libmxnet.so(tvm::runtime::TVMOpModule::CallEx(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, tvm::runtime::TVMArgs) const+0xb1) [0x7f1ec9cdd2d1] E [bt] (0) /work/build/3rdparty/tvm/libtvm_runtime.so(+0x4ac09) [0x7f1f42c8ac09] E [bt] (9) /work/mxnet/python/mxnet/../../build/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x17) [0x7f1ec6daa027] E [bt] (8) /work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x934) [0x7f1ec6da9bd4] E [bt] (7) /work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::op::TVMBinaryBroadcastScalarCompute::operator()(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)+0x5b8) [0x7f1ec7f2de28] E [bt] (6) /work/mxnet/python/mxnet/../../build/libmxnet.so(tvm::runtime::TVMOpModule::CallEx(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, tvm::runtime::TVMArgs) const+0xb1) [0x7f1ec9cdd2d1] E [bt] (5) /work/build/3rdparty/tvm/libtvm_runtime.so(+0x4a09f) [0x7f1f42c8a09f] E [bt] (4) /work/mxnet/python/mxnet/../../build/libtvmop.so(greater_scalar_gpufloat32_2bool_2+0x210) [0x7f1dcafe6500] E [bt] (3) /work/mxnet/python/mxnet/../../build/libtvmop.so(+0xbe71f) [0x7f1dcafe671f] E [bt] (2) /work/build/3rdparty/tvm/libtvm_runtime.so(TVMBackendGetFuncFromEnv+0x61) [0x7f1f42c70831] E [bt] (1) /work/build/3rdparty/tvm/libtvm_runtime.so(tvm::runtime::ModuleNode::GetFuncFromEnv(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x3d8) [0x7f1f42c93498] E [bt] (0) /work/mxnet/python/mxnet/../../build/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x7f) [0x7f1ec6ad00cf] E File "/work/mxnet/3rdparty/tvm/src/runtime/module.cc", line 123 E File "/work/mxnet/3rdparty/tvm/src/runtime/library_module.cc", line 91 E TVMError: Check failed: ret == 0 (-1 vs. 0) : Check failed: f != nullptr: Cannot find function greater_scalar_gpufloat32_2bool_2_kernel0 in the imported modules or global registry ```
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services