kohillyang commented on issue #18800:
URL: https://github.com/apache/incubator-mxnet/issues/18800#issuecomment-691611097
@szha I found that training with mx.mod.Module with MXNET_BACKWARD_DO_MIRROR set to 1
takes more GPU memory than training the same network as a Gluon HybridBlock. Note that
when MXNET_BACKWARD_DO_MIRROR is set to 1, MXNET_USE_FUSION must also be set to 0,
because otherwise relu appears to be fused and the mirroring has no effect. Does this
mean that Gluon does not need MXNET_BACKWARD_DO_MIRROR? Or that we cannot generate a
Symbol from a HybridBlock and must write the network with the pure symbol API?
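For what it's worth, a Symbol can be traced out of a HybridBlock by calling the block
on a symbolic placeholder (the repro below uses the same trick), so a pure-symbol
rewrite should not be strictly necessary. A minimal sketch, assuming a model_zoo network:
```python
import mxnet as mx

net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
net.initialize()
net(mx.nd.zeros(shape=(1, 3, 224, 224)))  # run once so deferred shapes are resolved

# Calling the HybridBlock on a Symbol placeholder returns a Symbol graph
# that can then be bound with mx.mod.Module.
sym = net(mx.sym.var("data"))
print(type(sym))  # <class 'mxnet.symbol.symbol.Symbol'>
```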
I tested the memory consumption with the following code:
```python
import mxnet as mx
import mxnet.autograd as ag


class NaiveDataset(object):
    """Synthetic dataset: alternating all-zeros/all-ones images with one-hot labels."""

    def __len__(self):
        return 10000

    def __getitem__(self, idx):
        label = mx.nd.zeros(shape=(1000,))
        if idx % 2 == 0:
            label[0] = 1
            return mx.nd.zeros(shape=(3, 224, 224)), label
        else:
            label[1] = 1
            return mx.nd.ones(shape=(3, 224, 224)), label


def train_gluon_model_with_module():
    import os
    # os.environ["MXNET_BACKWARD_DO_MIRROR"] = "1"
    # os.environ["MXNET_USE_FUSION"] = "0"
    ctx_list = [mx.gpu(0)]
    from models.backbones.resnet._resnetv1b import resnet50_v1b
    net = resnet50_v1b(pretrained=False)
    # net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
    net.initialize()
    _ = net(mx.nd.zeros(shape=(1, 3, 224, 224)))  # trigger deferred initialization
    # Copy the Gluon parameters into the dicts Module expects; with
    # allow_extra=True it is fine to pass everything as both arg and aux.
    arg_params = {}
    aux_params = {}
    params_collected = net.collect_params()
    for k in params_collected:
        arg_params[k] = params_collected[k].data(mx.cpu())
        aux_params[k] = params_collected[k].data(mx.cpu())
    # Trace the HybridBlock into a Symbol and drive it with Module.
    data = mx.sym.var(name="data")
    sym = net(data)
    module = mx.mod.Module(sym, data_names=['data'], label_names=[],
                           context=ctx_list)
    # Module reshapes automatically if the incoming batch size differs.
    module.bind(data_shapes=[("data", (len(ctx_list) * 2, 3, 224, 224))])
    module.init_params(arg_params=arg_params, aux_params=aux_params,
                       allow_missing=False, allow_extra=True)
    module.init_optimizer(force_init=True)
    train_loader = mx.gluon.data.DataLoader(dataset=NaiveDataset(),
                                            batch_size=100,
                                            num_workers=8,
                                            last_batch="discard", shuffle=True,
                                            thread_pool=False)
    for data_batch in train_loader:
        module_data_batch = mx.io.DataBatch(data=[data_batch[0]], label=None)
        module.forward(module_data_batch, is_train=True)
        y_hat = module.get_outputs(merge_multi_context=True)
        label_list = mx.gluon.utils.split_and_load(data_batch[1],
                                                   ctx_list=ctx_list, batch_axis=0)
        preds_list = mx.gluon.utils.split_and_load(y_hat[0],
                                                   ctx_list=ctx_list, batch_axis=0)
        # Compute the loss gradient w.r.t. the outputs with autograd, then feed
        # it back into the symbolic executor via module.backward().
        pred_grad_list = []
        for pred, label in zip(preds_list, label_list):  # type: mx.nd.NDArray, mx.nd.NDArray
            pred.attach_grad()
            label.attach_grad()
            with ag.record():
                pred_log_softmax = mx.nd.log_softmax(pred, axis=1)
                loss = pred_log_softmax * label * -1
            loss.backward()
            pred_grad_list.append(pred.grad)
        pred_gradients = mx.nd.concatenate(pred_grad_list, axis=0)
        module.backward([pred_gradients])
        module.update()
        print(loss.sum().asnumpy())
        mx.nd.waitall()


def train_gluon_model_with_gluon():
    ctx_list = [mx.gpu(0)]
    net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
    net.initialize()
    net.collect_params().reset_ctx(ctx_list)
    net.hybridize(static_alloc=True)
    trainer = mx.gluon.Trainer(
        net.collect_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {'learning_rate': 1e-2},
    )
    train_loader = mx.gluon.data.DataLoader(dataset=NaiveDataset(),
                                            batch_size=100,
                                            num_workers=8,
                                            last_batch="discard", shuffle=True,
                                            thread_pool=False)
    for data_batch in train_loader:
        data_list = mx.gluon.utils.split_and_load(data_batch[0],
                                                  ctx_list=ctx_list, batch_axis=0)
        label_list = mx.gluon.utils.split_and_load(data_batch[1],
                                                   ctx_list=ctx_list, batch_axis=0)
        losses = []
        for data, label in zip(data_list, label_list):  # type: mx.nd.NDArray, mx.nd.NDArray
            with ag.record():
                y_hat = net(data)
                pred_log_softmax = mx.nd.log_softmax(y_hat, axis=1)
                loss = pred_log_softmax * label * -1
            losses.append(loss)
        ag.backward(losses)
        trainer.step(1)
        print(loss.sum().asnumpy())
        mx.nd.waitall()


if __name__ == '__main__':
    # train_gluon_model_with_module()
    train_gluon_model_with_gluon()
```
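For the mirrored configuration, the two variables have to take effect before the graph
is bound; a minimal sketch, setting them before mxnet is even imported (this mirrors
the commented-out lines in train_gluon_model_with_module):
```python
import os

# Enable activation mirroring in the symbolic backward pass and disable
# pointwise fusion, which otherwise appears to fuse relu and defeat mirroring.
os.environ["MXNET_BACKWARD_DO_MIRROR"] = "1"
os.environ["MXNET_USE_FUSION"] = "0"

import mxnet as mx  # imported after the variables are set so they are honored
```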
By default, train_gluon_model_with_module and train_gluon_model_with_gluon need almost
the same amount of GPU memory, but if MXNET_BACKWARD_DO_MIRROR is set to 1 and
MXNET_USE_FUSION is set to 0, train_gluon_model_with_module fails with an OOM exception.
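To quantify the difference instead of waiting for the OOM, free/total device memory can
be sampled between iterations; a small sketch using mx.context.gpu_memory_info
(available in recent MXNet 1.x releases):
```python
import mxnet as mx

def print_gpu_usage(device_id=0):
    # gpu_memory_info returns (free_bytes, total_bytes) for the given device.
    free, total = mx.context.gpu_memory_info(device_id)
    print("GPU %d: %.0f MiB in use" % (device_id, (total - free) / 1024 ** 2))

# Call e.g. once per iteration, right after mx.nd.waitall(), in both
# train_gluon_model_with_module and train_gluon_model_with_gluon.
```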