huabinhuang1994 opened a new issue #9031: cuda error:Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading CUDA: unknown error URL: https://github.com/apache/incubator-mxnet/issues/9031 1. My GPU is a 1080 Ti; my system is Ubuntu 16.04. 2. I installed CUDA 8.0 on my computer, and the CPU versions of TensorFlow and PyTorch work fine on my computer. 3. I installed the MXNet GPU version from source. 4. Here is my code: from mxnet import autograd import mxnet as mx import os,json,time,fire,ipdb,tqdm from mxnet import gluon from mxnet import image from mxnet import init from mxnet import nd from mxnet.gluon.data import vision import numpy as np from mxnet import initializer batch_size=28 num_classes = 80 data_ctx = mx.cpu() model_ctx = mx.gpu() train_auglist = image.CreateAugmenter(data_shape=(3, 256, 256), resize=430, rand_crop=True, rand_resize=False, rand_mirror=False, # mean=np.array([0.4914, 0.4822, 0.4465]), mean=True, # std=np.array([0.2023, 0.1994, 0.2010]), std=True, brightness=0, contrast=0, saturation=0, hue=0, pca_noise=0, rand_gray=0, inter_method=2) val_auglist = image.CreateAugmenter(data_shape=(3, 256, 256), resize=430, rand_crop=False, rand_resize=False, rand_mirror=False, # mean=np.array([0.4914, 0.4822, 0.4465]), mean=True, # std=np.array([0.2023, 0.1994, 0.2010]), std=True, brightness=0, contrast=0, saturation=0, hue=0, pca_noise=0, rand_gray=0, inter_method=2) def transform_test(data, label): im = data.astype('float32') / 255 auglist = image.CreateAugmenter(data_shape=(3, 256, 256), mean=np.array([0.4914, 0.4822, 0.4465]), std=np.array([0.2023, 0.1994, 0.2010])) for aug in auglist: im = aug(im) im = nd.transpose(im, (2,0,1)) return (im, nd.array([label]).asscalar().astype('float32')) def get_iterators(batch_size, data_shape=(3, 256, 256)): train = mx.image.ImageIter( path_imgrec = '/home/hhb/dataset/scene_data_train.rec', data_name = 'data', label_name = 'softmax_label', batch_size = batch_size, data_shape = data_shape, 
path_imgidx='/home/hhb/dataset/scene_data_train.idx', shuffle = True, aug_list=train_auglist,) val = mx.image.ImageIter( path_imgrec ='/home/hhb/dataset/scene_data_val.rec', data_name = 'data', label_name = 'softmax_label', batch_size = batch_size, data_shape = data_shape, path_imgidx = '/home/hhb/dataset/scene_data_val.idx', shuffle =False, aug_list=val_auglist,) return (train, val) def get_fine_tune_model(symbol, arg_params, num_classes, layer_name='flatten0'): all_layers = symbol.get_internals() net = all_layers[layer_name+'_output'] net = mx.symbol.FullyConnected(data=net, num_hidden=num_classes, name='fc1') net = mx.symbol.SoftmaxOutput(data=net, name='softmax') new_args = dict({k:arg_params[k] for k in arg_params if 'fc1' not in k}) return (net, new_args) def fit(symbol, arg_params, aux_params, train, val, batch_size, num_gpus): dev = [mx.gpu(i) for i in range(num_gpus)] mod = mx.mod.Module(symbol=symbol, context=mx.cpu()) mod.fit(train, val, num_epoch=8, arg_params=arg_params, aux_params=aux_params, allow_missing=True, batch_end_callback = mx.callback.Speedometer(batch_size, 10), kvstore='device', optimizer='sgd', optimizer_params={'learning_rate':0.01}, initializer=mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2), eval_metric='acc') metric = mx.metric.Accuracy() return mod.score(val, metric) sym, arg_params, aux_params=mx.model.load_checkpoint('resnet-50',0) (new_sym, new_args) = get_fine_tune_model(sym, arg_params, num_classes) (train, val) = get_iterators(batch_size) mod_score = fit(new_sym, new_args, aux_params, train, val, batch_size, num_gpus=1) here is my error: /home/hhb/anaconda3/lib/python3.6/site-packages/urllib3/contrib/pyopenssl.py:46: DeprecationWarning: OpenSSL.rand is deprecated - you should use os.urandom instead import OpenSSL.SSL /home/hhb/anaconda3/lib/python3.6/site-packages/jedi/_compatibility.py:6: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative 
uses import imp [10:33:27] src/nnvm/legacy_json_util.cc:209: Loading symbol saved by previous version v0.8.0. Attempting to upgrade... [10:33:27] src/nnvm/legacy_json_util.cc:217: Symbol successfully upgraded! [10:33:28] /home/travis/build/dmlc/mxnet-distro/mxnet-build/dmlc-core/include/dmlc/logging.h:308: [10:33:28] src/storage/storage.cc:114: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading CUDA: unknown error Stack trace returned 10 entries: [bt] (0) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28965c) [0x7fa0d737165c] [bt] (1) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29c891e) [0x7fa0d9ab091e] [bt] (2) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca21d) [0x7fa0d9ab221d] [bt] (3) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca612) [0x7fa0d9ab2612] [bt] (4) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x24719cd) [0x7fa0d95599cd] [bt] (5) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x248f894) [0x7fa0d9577894] [bt] (6) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2476b9c) [0x7fa0d955eb9c] [bt] (7) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x247b048) [0x7fa0d9563048] [bt] (8) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2482e6a) [0x7fa0d956ae6a] [bt] (9) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2483564) [0x7fa0d956b564] Traceback (most recent call last): File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/symbol/symbol.py", line 1488, in simple_bind ctypes.byref(exe_handle))) File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/base.py", line 146, in check_call raise MXNetError(py_str(_LIB.MXGetLastError())) mxnet.base.MXNetError: [10:33:28] src/storage/storage.cc:114: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading CUDA: unknown error Stack trace returned 10 entries: [bt] (0) 
/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28965c) [0x7fa0d737165c] [bt] (1) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29c891e) [0x7fa0d9ab091e] [bt] (2) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca21d) [0x7fa0d9ab221d] [bt] (3) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca612) [0x7fa0d9ab2612] [bt] (4) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x24719cd) [0x7fa0d95599cd] [bt] (5) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x248f894) [0x7fa0d9577894] [bt] (6) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2476b9c) [0x7fa0d955eb9c] [bt] (7) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x247b048) [0x7fa0d9563048] [bt] (8) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2482e6a) [0x7fa0d956ae6a] [bt] (9) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2483564) [0x7fa0d956b564] During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/home/hhb/mxnetproject/new_scene.py", line 90, in <module> mod_score = fit(new_sym, new_args, aux_params, train, val, batch_size, num_gpus=1) File "/home/hhb/mxnetproject/new_scene.py", line 84, in fit eval_metric='acc') File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/base_module.py", line 460, in fit for_training=True, force_rebind=force_rebind) File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/module.py", line 428, in bind state_names=self._state_names) File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py", line 237, in __init__ self.bind_exec(data_shapes, label_shapes, shared_group) File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py", line 333, in bind_exec shared_group)) File 
"/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py", line 611, in _bind_ith_exec shared_buffer=shared_data_arrays, **input_shapes) File "/home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/symbol/symbol.py", line 1494, in simple_bind raise RuntimeError(error_msg) RuntimeError: simple_bind error. Arguments: data: (28, 3, 256, 256) softmax_label: (28,) [10:33:28] src/storage/storage.cc:114: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading CUDA: unknown error Stack trace returned 10 entries: [bt] (0) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x28965c) [0x7fa0d737165c] [bt] (1) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29c891e) [0x7fa0d9ab091e] [bt] (2) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca21d) [0x7fa0d9ab221d] [bt] (3) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x29ca612) [0x7fa0d9ab2612] [bt] (4) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x24719cd) [0x7fa0d95599cd] [bt] (5) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x248f894) [0x7fa0d9577894] [bt] (6) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2476b9c) [0x7fa0d955eb9c] [bt] (7) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x247b048) [0x7fa0d9563048] [bt] (8) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2482e6a) [0x7fa0d956ae6a] [bt] (9) /home/hhb/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2483564) [0x7fa0d956b564] I have tried everything I can find on the internet, but nothing has worked for me. I don't know what to do next.
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services