cjolivier01 commented on issue #9410: Training with the same parameters and seed gets significantly different results
URL: https://github.com/apache/incubator-mxnet/issues/9410#issuecomment-372485696
 
 
   Modified script per your second code comment (smaller test set):
   ```python
   import mxnet as mx
   from mxnet import nd, gluon, autograd, ndarray
   import numpy as np
   import random
   from mxnet import profiler
   
   # profiler.set_config(profile_symbolic=True, aggregate_stats=True, continuous_dump=True)
   # profiler.set_state('run')
   
   def transform(data, label):
       return [dat.astype(np.float32) for dat in data], [lab.astype(np.float32) for lab in label]
   
   
   train_cifar_gluon = gluon.data.vision.CIFAR10(train=True, transform=transform)
   test_cifar_gluon = gluon.data.vision.CIFAR10(train=False, transform=transform)
   
   def convert_gluon_dataset_to_numpy(data):
       # Repack the dataset's HWC float32 images into one NCHW numpy array,
       # which is the layout NDArrayIter and the conv net expect.
       ds = data[:][0][0].shape
       X = np.empty((len(data[:][0]), ds[2], ds[0], ds[1]), dtype=np.float32)
       for i, example in enumerate(data[:][0]):
           X[i, :] = np.rollaxis(example.asnumpy(), 2)  # HWC -> CHW
       y = np.array(data[:][1])
       return X, y
   
   X, y = convert_gluon_dataset_to_numpy(train_cifar_gluon)
   X_test, y_test = convert_gluon_dataset_to_numpy(test_cifar_gluon)
   
   
   # In[2]:
   
   
   def predict_scores(net, X_, batch_size, context):
       scores = None
       test_loaded = gluon.data.DataLoader(mx.nd.array(X_), batch_size, shuffle=False)
       for data in test_loaded:
           data = data.as_in_context(context)
           output = net(data).asnumpy()
           if scores is None:
               scores = output
           else:
               scores = np.append(scores, output, axis=0)
       return scores
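   # Note: shuffle=False keeps the batches in dataset order, so the stacked
   # scores line up row-for-row with X_ (and hence with y_test below).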
   
   
   # In[3]:
   
   
   gpu_count = 1
   _ctx_list = [mx.gpu(i) for i in range(gpu_count)]
   _batch_size=64
   epochs=1
   _seed=42
   _optimizer='sgd'
   _learning_rate=0.1
   _xavier_magnitude=2.
   _momentum=0.9
   _wd=0.0001
   _nclasses=10
   
   
   n_batch=5000
   #n_batch=_batch_size * 2
   random_selector = np.random.RandomState(0)
   confidences = random_selector.rand(X.shape[0])
   cert_idx = np.argsort(confidences)
   selected_indices = cert_idx[:n_batch]
   selected_indices = np.sort(selected_indices)
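   # The 5000-example subset is chosen by a RandomState seeded independently of
   # _seed, so both tries below train on exactly the same sorted indices.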
   
   # ### Try 1
   
   # In[4]:
   
   
   random.seed(_seed)
   mx.random.seed(_seed)
   np.random.seed(_seed)
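   # All three RNG sources (Python, MXNet, NumPy) are seeded above before the
   # net is rebuilt, so both tries should start from identical initial weights.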
   
   
   # In[5]:
   
   
   net = gluon.model_zoo.vision.get_model('resnet34_v2', pretrained=False, classes=_nclasses, ctx=_ctx_list)
   
   loss = gluon.loss.SoftmaxCrossEntropyLoss()
   
   
   # In[6]:
   
   
   net.collect_params().initialize(mx.init.Xavier(magnitude=_xavier_magnitude), ctx=_ctx_list, force_reinit=True)
   
   trainer = gluon.Trainer(net.collect_params(), _optimizer,
                           optimizer_params=dict(learning_rate=_learning_rate, momentum=_momentum,
                                                 wd=_wd),
                           kvstore='device' if len(_ctx_list) > 0 else 'local')
   
   #train_data = mx.io.NDArrayIter(X, label=y, batch_size=_batch_size)
   train_data = mx.io.NDArrayIter(X[selected_indices, :], label=y[selected_indices], batch_size=_batch_size)
   
   for e in range(epochs):
       train_data.reset()
       for batch in train_data:
           cur_contexts = _ctx_list
           if batch.data[0].shape[0] < len(_ctx_list):
               cur_contexts = cur_contexts[:batch.data[0].shape[0]]
           data = gluon.utils.split_and_load(batch.data[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
           label = gluon.utils.split_and_load(batch.label[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
           Ls = []
           with autograd.record():  # Start recording the derivatives
               for x_cur, y_cur in zip(data, label):
                   L = loss(net(x_cur), y_cur)
                   # store the loss and do backward after we have done forward
                   # on all GPUs for better speed on multiple GPUs.
                   Ls.append(L)
               for L in Ls:
                   L.backward()
           trainer.step(batch.data[0].shape[0])
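           # trainer.step(n) normalizes the aggregated gradients by 1/n, using
           # the actual batch size so a smaller final batch is handled correctly.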
   
       scores_test = predict_scores(net, X_test, _batch_size, _ctx_list[0])
       predictions_test = np.argmax(scores_test, axis=1)
       accuracy = np.mean(predictions_test == y_test)
       print('[Epoch %d] accuracy=%f' % (e, accuracy))
   
   
   # ### Try 2
   
   # In[7]:
   # profiler.set_state('stop')
   # print(profiler.dumps(True))
   
   random.seed(_seed)
   mx.random.seed(_seed)
   np.random.seed(_seed)
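   # Try 2 re-seeds and rebuilds the exact same pipeline as Try 1; any gap in
   # the printed accuracies is nondeterminism in the run itself.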
   
   
   # In[8]:
   
   
   net = gluon.model_zoo.vision.get_model('resnet34_v2', pretrained=False, classes=_nclasses, ctx=_ctx_list)
   
   loss = gluon.loss.SoftmaxCrossEntropyLoss()
   
   
   # In[9]:
   
   
   net.collect_params().initialize(mx.init.Xavier(magnitude=_xavier_magnitude), ctx=_ctx_list, force_reinit=True)
   
   trainer = gluon.Trainer(net.collect_params(), _optimizer,
                           optimizer_params=dict(learning_rate=_learning_rate, momentum=_momentum,
                                                 wd=_wd),
                           kvstore='device' if len(_ctx_list) > 0 else 'local')
   
   #train_data = mx.io.NDArrayIter(X, label=y, batch_size=_batch_size)
   train_data = mx.io.NDArrayIter(X[selected_indices, :], label=y[selected_indices], batch_size=_batch_size)
   
   for e in range(epochs):
       train_data.reset()
       for batch in train_data:
           cur_contexts = _ctx_list
           if batch.data[0].shape[0] < len(_ctx_list):
               cur_contexts = cur_contexts[:batch.data[0].shape[0]]
           data = gluon.utils.split_and_load(batch.data[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
           label = gluon.utils.split_and_load(batch.label[0], ctx_list=cur_contexts, batch_axis=0, even_split=False)
           Ls = []
           with autograd.record():  # Start recording the derivatives
               for x_cur, y_cur in zip(data, label):
                   L = loss(net(x_cur), y_cur)
                   # store the loss and do backward after we have done forward
                   # on all GPUs for better speed on multiple GPUs.
                   Ls.append(L)
               for L in Ls:
                   L.backward()
           trainer.step(batch.data[0].shape[0])
   
       scores_test = predict_scores(net, X_test, _batch_size, _ctx_list[0])
       predictions_test = np.argmax(scores_test, axis=1)
       accuracy = np.mean(predictions_test == y_test)
       print('[Epoch %d] accuracy=%f' % (e, accuracy))
   ```
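
   Side note: even with identical seeds, cuDNN is a likely source of run-to-run drift on GPU: autotuning can pick a different convolution algorithm each run, and some backward kernels reduce with non-deterministic atomic adds. A minimal sketch of how one might rule that out, assuming `MXNET_CUDNN_AUTOTUNE_DEFAULT` is honored by this build and noting that `MXNET_ENFORCE_DETERMINISM` only exists in newer MXNet versions:
   ```python
   import os

   # Assumption: set these before `import mxnet` so every operator sees them.
   # Disable cuDNN autotune so the same conv algorithm is chosen every run.
   os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0'
   # If this build supports it, restrict MXNet to deterministic algorithms.
   os.environ['MXNET_ENFORCE_DETERMINISM'] = '1'

   import mxnet as mx
   ```
   If the two tries then print identical accuracies, the divergence above is kernel-level nondeterminism rather than a seeding problem.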
