kohillyang commented on issue #18800:
URL: https://github.com/apache/incubator-mxnet/issues/18800#issuecomment-691611097


   @szha I found that training with mx.mod.Module and MXNET_BACKWARD_DO_MIRROR set to 1 takes more GPU memory than training the same network as a Gluon HybridBlock. Note that when MXNET_BACKWARD_DO_MIRROR is set to 1, MXNET_USE_FUSION must also be set to 0, because it seems that relu is otherwise fused. Does this mean that Gluon does not need MXNET_BACKWARD_DO_MIRROR? Or is it that we cannot generate a Symbol from a HybridBlock and must write the network with the pure symbol API?
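
   For reference, the Symbol fed to mx.mod.Module below is not written with the pure symbol API; it comes from calling the HybridBlock on a symbolic placeholder. A minimal sketch of that pattern, using the model_zoo resnet50_v1 in place of my local resnet50_v1b:
   ```python
    import mxnet as mx

    # A HybridBlock returns a Symbol when called on a Symbol input, so the
    # graph can be handed to mx.mod.Module without rewriting the network.
    net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
    net.initialize()

    data = mx.sym.var(name="data")
    sym = net(data)   # an mx.sym.Symbol describing the whole forward graph
   ```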
   
   I tested the memory consumption with the following code:
   ```python
   import mxnet as mx
   import mxnet.autograd as ag
   
   
   class NaiveDataset(object):
       def __len__(self):
           return 10000
   
       def __getitem__(self, idx):
            if idx % 2 == 0:
               label = mx.nd.zeros(shape=(1000, ))
               label[0] = 1
               return mx.nd.array(mx.nd.zeros(shape=(3, 224, 224))), label
           else:
               label = mx.nd.zeros(shape=(1000, ))
               label[1] = 1
               return mx.nd.array(mx.nd.ones(shape=(3, 224, 224))), label
   
   
   def train_gluon_model_with_module():
       import os
       # os.environ["MXNET_BACKWARD_DO_MIRROR"]="1"
       # os.environ["MXNET_USE_FUSION"]="0"
       ctx_list = [mx.gpu(0)]
       from models.backbones.resnet._resnetv1b import resnet50_v1b
       net = resnet50_v1b(pretrained=False)
       # net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
       net.initialize()
       _ = net(mx.nd.zeros(shape=(1, 3, 224, 224)))
        arg_params = {}
        aux_params = {}
        arg_params_collected = net.collect_params()
        # Copy every parameter into both dicts; init_params(..., allow_extra=True)
        # below tolerates the entries that do not belong in arg or aux params.
        for k in arg_params_collected:
            arg_params[k] = arg_params_collected[k].data(mx.cpu())
        for k in arg_params_collected:
            aux_params[k] = arg_params_collected[k].data(mx.cpu())
   
       data = mx.sym.var(name="data")
       sym = net(data)
        module = mx.mod.Module(sym, data_names=['data'], label_names=[], context=ctx_list)
        module.bind(data_shapes=[("data", (len(ctx_list) * 2, 3, 224, 224))])
        module.init_params(arg_params=arg_params, aux_params=aux_params, allow_missing=False, allow_extra=True)
       module.init_optimizer(force_init=True)
        train_loader = mx.gluon.data.DataLoader(dataset=NaiveDataset(), batch_size=100, num_workers=8,
                                                last_batch="discard", shuffle=True, thread_pool=False)
       for data_batch in train_loader:
            module_data_batch = mx.io.DataBatch(data=[data_batch[0], ], label=None)
           module.forward(module_data_batch, is_train=True)
           y_hat = module.get_outputs(merge_multi_context=True)
            label_list = mx.gluon.utils.split_and_load(data_batch[1], ctx_list=ctx_list, batch_axis=0)
            preds_list = mx.gluon.utils.split_and_load(y_hat[0], ctx_list=ctx_list, batch_axis=0)
            pred_grad_list = []
            for pred, label in zip(preds_list, label_list):  # type: mx.nd.NDArray, mx.nd.NDArray
               pred.attach_grad()
               label.attach_grad()
               with ag.record():
                   pred_log_softmax = mx.nd.log_softmax(pred,  axis=1)
                   loss = pred_log_softmax * label * -1
               loss.backward()
               pred_grad_list.append(pred.grad)
           pred_gradients = mx.nd.concatenate(pred_grad_list, axis=0)
           module.backward([pred_gradients])
           module.update()
           print(loss.sum().asnumpy())
           mx.nd.waitall()
   
   
   def train_gluon_model_with_gluon():
       ctx_list = [mx.gpu(0)]
       net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
       net.initialize()
       net.collect_params().reset_ctx(ctx_list)
       net.hybridize(static_alloc=True)
       trainer = mx.gluon.Trainer(
           net.collect_params(),  # fix batchnorm, fix first stage, etc...
           'sgd',
           {
               'learning_rate':1e-2
            },
       )
   
        train_loader = mx.gluon.data.DataLoader(dataset=NaiveDataset(), batch_size=100, num_workers=8,
                                                last_batch="discard", shuffle=True, thread_pool=False)
       for data_batch in train_loader:
            data_list = mx.gluon.utils.split_and_load(data_batch[0], ctx_list=ctx_list, batch_axis=0)
            label_list = mx.gluon.utils.split_and_load(data_batch[1], ctx_list=ctx_list, batch_axis=0)
            losses = []
            for data, label in zip(data_list, label_list):  # type: mx.nd.NDArray, mx.nd.NDArray
               with ag.record():
                   y_hat = net(data)
                   pred_log_softmax = mx.nd.log_softmax(y_hat,  axis=1)
                   loss = pred_log_softmax * label * -1
               losses.append(loss)
           ag.backward(losses)
           trainer.step(1)
           print(loss.sum().asnumpy())
           mx.nd.waitall()
   
   
   if __name__ == '__main__':
       # train_gluon_model_with_module()
       train_gluon_model_with_gluon()
   
   ```
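
   One way to put numbers on the comparison is to query device memory right after mx.nd.waitall() in either training loop, e.g. with a helper like the one below (a rough sketch; it assumes mx.context.gpu_memory_info is available in this MXNet build, otherwise nvidia-smi gives the same picture):
   ```python
    import mxnet as mx


    def report_gpu_memory(device_id=0):
        # Free and total device memory in bytes for the given GPU; the
        # difference is the memory currently in use on the card.
        free, total = mx.context.gpu_memory_info(device_id)
        used_mib = (total - free) / (1024 ** 2)
        print("GPU %d: %.0f MiB used of %.0f MiB"
              % (device_id, used_mib, total / (1024 ** 2)))
   ```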
   
   By default, train_gluon_model_with_module and train_gluon_model_with_gluon need almost the same amount of GPU memory, but if MXNET_BACKWARD_DO_MIRROR is set to 1 and MXNET_USE_FUSION is set to 0, train_gluon_model_with_module fails with an OOM exception.
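
   The failing configuration is simply the two commented-out lines in train_gluon_model_with_module enabled, roughly as below (assuming both variables are picked up from the environment before the module is bound):
   ```python
    import os

    # Enable backward mirroring and disable pointwise fusion; with these two
    # settings the Module path runs out of GPU memory on my setup.
    os.environ["MXNET_BACKWARD_DO_MIRROR"] = "1"
    os.environ["MXNET_USE_FUSION"] = "0"

    import mxnet as mx  # imported after setting the variables, to be safe

    # ... then call train_gluon_model_with_module() as above.
   ```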

