@szha I found that training with mx.mod.Module and MXNET_BACKWARD_DO_MIRROR set to 1 takes more GPU memory than a Gluon HybridBlock. Note that when MXNET_BACKWARD_DO_MIRROR is set to 1, MXNET_USE_FUSION must also be set to 0, because otherwise it seems that relu gets fused and cannot be mirrored. Does this mean that Gluon does not need MXNET_BACKWARD_DO_MIRROR? Or that we can't generate a Symbol from a HybridBlock and must write the network with the pure symbol API?
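For reference, this is roughly how I toggle the two environment variables and get a Symbol out of a HybridBlock (a minimal sketch; the model_zoo ResNet here is just a stand-in for my own network, and I set the variables at the top of the script so they are read before the executor is bound):

```python
import os

# Assumption: setting these via os.environ works as long as it happens before
# the symbol is bound into an executor; putting them first is the safe option.
os.environ["MXNET_BACKWARD_DO_MIRROR"] = "1"  # recompute activations in backward
os.environ["MXNET_USE_FUSION"] = "0"          # disable pointwise fusion so relu stays un-fused

import mxnet as mx

# A Symbol can be obtained from a HybridBlock by calling it on a symbolic
# variable, exactly as train_gluon_model_with_module() below does.
net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
net.initialize()
_ = net(mx.nd.zeros(shape=(1, 3, 224, 224)))  # run once so deferred shapes are known
data = mx.sym.var(name="data")
sym = net(data)                               # sym is a plain mx.sym.Symbol
```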
I tested the memory consumption with the following code:
```python
import mxnet as mx
import mxnet.autograd as ag
class NaiveDataset(object):
    def __len__(self):
        return 10000

    def __getitem__(self, idx):
        if idx % 2 == 0:
            label = mx.nd.zeros(shape=(1000, ))
            label[0] = 1
            return mx.nd.array(mx.nd.zeros(shape=(3, 224, 224))), label
        else:
            label = mx.nd.zeros(shape=(1000, ))
            label[1] = 1
            return mx.nd.array(mx.nd.ones(shape=(3, 224, 224))), label


def train_gluon_model_with_module():
    import os
    # os.environ["MXNET_BACKWARD_DO_MIRROR"] = "1"
    # os.environ["MXNET_USE_FUSION"] = "0"
    ctx_list = [mx.gpu(0)]
    from models.backbones.resnet._resnetv1b import resnet50_v1b
    net = resnet50_v1b(pretrained=False)
    # net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
    net.initialize()
    _ = net(mx.nd.zeros(shape=(1, 3, 224, 224)))
    arg_params = {}
    aux_params = {}
    arg_params_collected = net.collect_params()
    for k in arg_params_collected:
        arg_params[k] = arg_params_collected[k].data(mx.cpu())
    for k in arg_params_collected:
        aux_params[k] = arg_params_collected[k].data(mx.cpu())
    data = mx.sym.var(name="data")
    sym = net(data)
    module = mx.mod.Module(sym, data_names=['data'], label_names=[],
                           context=ctx_list)
    module.bind(data_shapes=[("data", (len(ctx_list) * 2, 3, 224, 224))])
    module.init_params(arg_params=arg_params, aux_params=aux_params,
                       allow_missing=False, allow_extra=True)
    module.init_optimizer(force_init=True)
    train_loader = mx.gluon.data.DataLoader(dataset=NaiveDataset(),
                                            batch_size=100,
                                            num_workers=8,
                                            last_batch="discard",
                                            shuffle=True,
                                            thread_pool=False)
    for data_batch in train_loader:
        module_data_batch = mx.io.DataBatch(data=[data_batch[0], ], label=None)
        module.forward(module_data_batch, is_train=True)
        y_hat = module.get_outputs(merge_multi_context=True)
        label_list = mx.gluon.utils.split_and_load(data_batch[1],
                                                   ctx_list=ctx_list,
                                                   batch_axis=0)
        preds_list = mx.gluon.utils.split_and_load(y_hat[0],
                                                   ctx_list=ctx_list,
                                                   batch_axis=0)
        pred_grad_list = []
        for pred, label in zip(preds_list, label_list):  # type: mx.nd.NDArray, mx.nd.NDArray
            pred.attach_grad()
            label.attach_grad()
            with ag.record():
                pred_log_softmax = mx.nd.log_softmax(pred, axis=1)
                loss = pred_log_softmax * label * -1
            loss.backward()
            pred_grad_list.append(pred.grad)
        pred_gradients = mx.nd.concatenate(pred_grad_list, axis=0)
        module.backward([pred_gradients])
        module.update()
        print(loss.sum().asnumpy())
        mx.nd.waitall()


def train_gluon_model_with_gluon():
    ctx_list = [mx.gpu(0)]
    net = mx.gluon.model_zoo.vision.resnet50_v1(pretrained=False)
    net.initialize()
    net.collect_params().reset_ctx(ctx_list)
    net.hybridize(static_alloc=True)
    trainer = mx.gluon.Trainer(
        net.collect_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {
            'learning_rate': 1e-2
        },
    )
    train_loader = mx.gluon.data.DataLoader(dataset=NaiveDataset(),
                                            batch_size=100,
                                            num_workers=8,
                                            last_batch="discard",
                                            shuffle=True,
                                            thread_pool=False)
    for data_batch in train_loader:
        data_list = mx.gluon.utils.split_and_load(data_batch[0],
                                                  ctx_list=ctx_list, batch_axis=0)
        label_list = mx.gluon.utils.split_and_load(data_batch[1],
                                                   ctx_list=ctx_list, batch_axis=0)
        losses = []
        for data, label in zip(data_list, label_list):  # type: mx.nd.NDArray, mx.nd.NDArray
            with ag.record():
                y_hat = net(data)
                pred_log_softmax = mx.nd.log_softmax(y_hat, axis=1)
                loss = pred_log_softmax * label * -1
            losses.append(loss)
        ag.backward(losses)
        trainer.step(1)
        print(loss.sum().asnumpy())
        mx.nd.waitall()


if __name__ == '__main__':
    # train_gluon_model_with_module()
    train_gluon_model_with_gluon()
```
By default, train_gluon_model_with_module and train_gluon_model_with_gluon need
almost the same amount of GPU memory, but if MXNET_BACKWARD_DO_MIRROR is set to 1 and
MXNET_USE_FUSION is set to 0, train_gluon_model_with_module fails with an OOM
exception.
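A rough in-process way to compare peak usage between the two paths (assuming mx.context.gpu_memory_info is available in your MXNet build; otherwise nvidia-smi gives the same picture) would be something like:

```python
import mxnet as mx

def report_gpu_memory(tag, device_id=0):
    # gpu_memory_info returns (free, total) in bytes for the given GPU;
    # total - free is the memory currently allocated on that device.
    free, total = mx.context.gpu_memory_info(device_id)
    print("{}: {:.0f} MiB used on gpu({})".format(
        tag, (total - free) / 1024 ** 2, device_id))

# Call it after a few iterations inside either training function, e.g.
# report_gpu_memory("module path") vs. report_gpu_memory("gluon path").
```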