szha opened a new issue #18099: Error in test_contrib_amp.py::test_fp16_casting
URL: https://github.com/apache/incubator-mxnet/issues/18099
 
 
   ## Description
   As part of #18025 I added a waitall() between test modules. This revealed 
the following error in the unix-gpu pipeline, which seems to be related to 
pointwise fusion.
   
   
http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/mxnet-validation%2Funix-gpu/detail/PR-18025/30/pipeline
   
   ```
   ==================================== ERRORS 
====================================
   ____________________ ERROR at teardown of test_fp16_casting 
____________________
   
       def teardown_module():
           """
           A function with a 'magic name' executed automatically after each 
pytest test module.
       
           It waits for all operations in one file to finish before carrying on 
the next.
           """
   >       mx.nd.waitall()
   
   tests/python/unittest/common.py:310: 
   _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ 
   python/mxnet/ndarray/ndarray.py:211: in waitall
       check_call(_LIB.MXNDArrayWaitAll())
   _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ 
   
   ret = -1
   
       def check_call(ret):
           """Check the return value of C API call.
       
           This function will raise an exception when an error occurs.
           Wrap every API call with this function.
       
           Parameters
           ----------
           ret : int
               return value from API calls.
           """
           if ret != 0:
   >           raise get_last_ffi_error()
   E           mxnet.base.MXNetError: Traceback (most recent call last):
   E             [bt] (9) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(std::thread::_State_impl<std::thread::_Invoker<std::tuple<std::function<void
 (std::shared_ptr<dmlc::ManualEvent>)>, std::shared_ptr<dmlc::ManualEvent> > > 
>::_M_run()+0x4a) [0x7f1ec6cdd4da]
   E             [bt] (8) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(std::_Function_handler<void 
(std::shared_ptr<dmlc::ManualEvent>), 
mxnet::engine::ThreadedEnginePerDevice::PushToExecute(mxnet::engine::OprBlock*, 
bool)::{lambda()#4}::operator()() 
const::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data
 const&, std::shared_ptr<dmlc::ManualEvent>&&)+0x4e) [0x7f1ec6ce17fe]
   E             [bt] (7) /work/mxnet/python/mxnet/../../build/libmxnet.so(void 
mxnet::engine::ThreadedEnginePerDevice::GPUWorker<(dmlc::ConcurrentQueueType)0>(mxnet::Context,
 bool, 
mxnet::engine::ThreadedEnginePerDevice::ThreadWorkerBlock<(dmlc::ConcurrentQueueType)0>*,
 std::shared_ptr<dmlc::ManualEvent> const&)+0x11d) [0x7f1ec6ce151d]
   E             [bt] (6) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext,
 mxnet::engine::OprBlock*)+0x121) [0x7f1ec6cde491]
   E             [bt] (5) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(+0x20828ee) [0x7f1ec6cd38ee]
   E             [bt] (4) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(std::_Function_handler<void 
(mxnet::RunContext), mxnet::imperative::PushFCompute(std::function<void 
(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, 
nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, 
std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, 
std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > 
const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, 
mxnet::RunContext&&)+0x17) [0x7f1ec6daa027]
   E             [bt] (3) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void
 (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, 
nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, 
std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, 
std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > 
const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) 
const+0x934) [0x7f1ec6da9bd4]
   E             [bt] (2) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::op::TVMBinaryBroadcastScalarCompute::operator()(nnvm::NodeAttrs
 const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)+0x5b8) [0x7f1ec7f2de28]
   E             [bt] (1) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(tvm::runtime::TVMOpModule::CallEx(std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char> > const&, mxnet::OpContext 
const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, 
tvm::runtime::TVMArgs) const+0xb1) [0x7f1ec9cdd2d1]
   E             [bt] (0) /work/build/3rdparty/tvm/libtvm_runtime.so(+0x4ac09) 
[0x7f1f42c8ac09]
   E             [bt] (9) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(std::_Function_handler<void 
(mxnet::RunContext), mxnet::imperative::PushFCompute(std::function<void 
(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, 
nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, 
std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, 
std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > 
const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, 
mxnet::RunContext&&)+0x17) [0x7f1ec6daa027]
   E             [bt] (8) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void
 (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, 
nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, 
std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, 
std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, 
std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, 
std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > 
const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) 
const+0x934) [0x7f1ec6da9bd4]
   E             [bt] (7) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(mxnet::op::TVMBinaryBroadcastScalarCompute::operator()(nnvm::NodeAttrs
 const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, 
std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, 
std::allocator<mxnet::TBlob> > const&)+0x5b8) [0x7f1ec7f2de28]
   E             [bt] (6) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(tvm::runtime::TVMOpModule::CallEx(std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char> > const&, mxnet::OpContext 
const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, 
tvm::runtime::TVMArgs) const+0xb1) [0x7f1ec9cdd2d1]
   E             [bt] (5) /work/build/3rdparty/tvm/libtvm_runtime.so(+0x4a09f) 
[0x7f1f42c8a09f]
   E             [bt] (4) 
/work/mxnet/python/mxnet/../../build/libtvmop.so(greater_scalar_gpufloat32_2bool_2+0x210)
 [0x7f1dcafe6500]
   E             [bt] (3) 
/work/mxnet/python/mxnet/../../build/libtvmop.so(+0xbe71f) [0x7f1dcafe671f]
   E             [bt] (2) 
/work/build/3rdparty/tvm/libtvm_runtime.so(TVMBackendGetFuncFromEnv+0x61) 
[0x7f1f42c70831]
   E             [bt] (1) 
/work/build/3rdparty/tvm/libtvm_runtime.so(tvm::runtime::ModuleNode::GetFuncFromEnv(std::__cxx11::basic_string<char,
 std::char_traits<char>, std::allocator<char> > const&)+0x3d8) [0x7f1f42c93498]
   E             [bt] (0) 
/work/mxnet/python/mxnet/../../build/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x7f)
 [0x7f1ec6ad00cf]
   E             File "/work/mxnet/3rdparty/tvm/src/runtime/module.cc", line 123
   E             File "/work/mxnet/3rdparty/tvm/src/runtime/library_module.cc", 
line 91
   E           TVMError: Check failed: ret == 0 (-1 vs. 0) : Check failed: f != 
nullptr: Cannot find function greater_scalar_gpufloat32_2bool_2_kernel0 in the 
imported modules or global registry
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to