cjolivier01 commented on issue #9410: Training with the same parameters and seed gets significantly different results
URL: https://github.com/apache/incubator-mxnet/issues/9410#issuecomment-372485696

Modified script per your second code comment (smaller test set):

```python
import mxnet as mx
from mxnet import nd, gluon, autograd, ndarray
import numpy as np
import random
from mxnet import profiler

# profiler.set_config(profile_symbolic=True, aggregate_stats=True, continuous_dump=True)
# profiler.set_state('run')


def transform(data, label):
    return [dat.astype(np.float32) for dat in data], [lab.astype(np.float32) for lab in label]


train_cifar_gluon = gluon.data.vision.CIFAR10(train=True, transform=transform)
test_cifar_gluon = gluon.data.vision.CIFAR10(train=False, transform=transform)


def convert_gluon_dataset_to_numpy(data):
    ds = data[:][0][0].shape
    X = np.empty((len(data[:][0]), ds[2], ds[0], ds[1]), dtype=np.float32)
    for i, example in enumerate(data[:][0]):
        X[i, :] = np.rollaxis(example.asnumpy(), 2)
    y = np.array(data[:][1])
    return X, y


X, y = convert_gluon_dataset_to_numpy(train_cifar_gluon)
X_test, y_test = convert_gluon_dataset_to_numpy(test_cifar_gluon)

# In[2]:

def predict_scores(net, X_, batch_size, context):
    scores = None
    test_loaded = gluon.data.DataLoader(mx.nd.array(X_), batch_size, shuffle=False)
    for data in test_loaded:
        data = data.as_in_context(context)
        output = net(data).asnumpy()
        if scores is None:
            scores = output
        else:
            scores = np.append(scores, output, axis=0)
    return scores

# In[3]:

gpu_count = 1
_ctx_list = [mx.gpu(i) for i in range(gpu_count)]
_batch_size = 64
epochs = 1
_seed = 42
_optimizer = 'sgd'
_learning_rate = 0.1
_xavier_magnitude = 2.
_momentum = 0.9
_wd = 0.0001
_nclasses = 10

n_batch = 5000
# n_batch = _batch_size * 2

random_selector = np.random.RandomState(0)
confidences = random_selector.rand(X.shape[0])
cert_idx = np.argsort(confidences)
selected_indices = cert_idx[:n_batch]
selected_indices = np.sort(selected_indices)

# ### Try 1

# In[4]:

random.seed(_seed)
mx.random.seed(_seed)
np.random.seed(_seed)

# In[5]:

net = gluon.model_zoo.vision.get_model('resnet34_v2', pretrained=False, classes=_nclasses, ctx=_ctx_list)
loss = gluon.loss.SoftmaxCrossEntropyLoss()

# In[6]:

net.collect_params().initialize(mx.init.Xavier(magnitude=_xavier_magnitude), ctx=_ctx_list, force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), _optimizer,
                        optimizer_params=dict(learning_rate=_learning_rate, momentum=_momentum, wd=_wd),
                        kvstore='device' if len(_ctx_list) > 0 else 'local')

# train_data = mx.io.NDArrayIter(X, label=y, batch_size=_batch_size)
train_data = mx.io.NDArrayIter(X[selected_indices, :], label=y[selected_indices], batch_size=_batch_size)

for e in range(epochs):
    train_data.reset()
    for batch in train_data:
        cur_contexts = _ctx_list
        if batch.data[0].shape[0] < len(_ctx_list):
            cur_contexts = cur_contexts[:batch.data[0].shape[0]]
        data = gluon.utils.split_and_load(batch.data[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch.label[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
        Ls = []
        with autograd.record():  # Start recording the derivatives
            for x_cur, y_cur in zip(data, label):
                L = loss(net(x_cur), y_cur)
                # store the loss and do backward after we have done forward
                # on all GPUs, for better speed on multiple GPUs
                Ls.append(L)
        for L in Ls:
            L.backward()
        trainer.step(batch.data[0].shape[0])
    scores_test = predict_scores(net, X_test, _batch_size, _ctx_list[0])
    predictions_test = np.argmax(scores_test, axis=1)
    accuracy = np.mean(predictions_test == y_test)
    print('[Epoch %d] accuracy=%f' % (e, accuracy))

# ### Try 2

# In[7]:

# profiler.set_state('stop')
# print(profiler.dumps(True))

random.seed(_seed)
mx.random.seed(_seed)
np.random.seed(_seed)

# In[8]:

net = gluon.model_zoo.vision.get_model('resnet34_v2', pretrained=False, classes=_nclasses, ctx=_ctx_list)
loss = gluon.loss.SoftmaxCrossEntropyLoss()

# In[9]:

net.collect_params().initialize(mx.init.Xavier(magnitude=_xavier_magnitude), ctx=_ctx_list, force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), _optimizer,
                        optimizer_params=dict(learning_rate=_learning_rate, momentum=_momentum, wd=_wd),
                        kvstore='device' if len(_ctx_list) > 0 else 'local')

# train_data = mx.io.NDArrayIter(X, label=y, batch_size=_batch_size)
train_data = mx.io.NDArrayIter(X[selected_indices, :], label=y[selected_indices], batch_size=_batch_size)

for e in range(epochs):
    train_data.reset()
    for batch in train_data:
        cur_contexts = _ctx_list
        if batch.data[0].shape[0] < len(_ctx_list):
            cur_contexts = cur_contexts[:batch.data[0].shape[0]]
        data = gluon.utils.split_and_load(batch.data[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
        label = gluon.utils.split_and_load(batch.label[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
        Ls = []
        with autograd.record():  # Start recording the derivatives
            for x_cur, y_cur in zip(data, label):
                L = loss(net(x_cur), y_cur)
                # store the loss and do backward after we have done forward
                # on all GPUs, for better speed on multiple GPUs
                Ls.append(L)
        for L in Ls:
            L.backward()
        trainer.step(batch.data[0].shape[0])
    scores_test = predict_scores(net, X_test, _batch_size, _ctx_list[0])
    predictions_test = np.argmax(scores_test, axis=1)
    accuracy = np.mean(predictions_test == y_test)
    print('[Epoch %d] accuracy=%f' % (e, accuracy))
```
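As an aside: if the goal is bit-for-bit reproducibility on GPU, seeding `random`, `mx.random`, and `np.random` alone may not be enough, since cuDNN can select different (and sometimes non-deterministic) convolution algorithms from run to run. Below is a minimal sketch of what one might try, assuming the environment variables are supported by the MXNet build in use (`MXNET_ENFORCE_DETERMINISM` in particular is not available in all versions); they have to be set before `mxnet` is imported, because the backend reads them at startup:

```python
import os

# Disable cuDNN convolution autotuning, which benchmarks candidate algorithms
# at runtime and can therefore pick a different one on each run.
os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0'

# Ask the backend to use only deterministic algorithms, where the build
# supports this flag (assumption: older builds may ignore or lack it).
os.environ['MXNET_ENFORCE_DETERMINISM'] = '1'

import mxnet as mx  # import only after the environment is configured
```

Even with these set, aggregating gradients across multiple GPUs can still change floating-point summation order between runs, so exact equality is only realistic on a single device.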