jeou opened a new issue #20299:
URL: https://github.com/apache/incubator-mxnet/issues/20299


   ## Description
   ```
   # _validate_checkpoint, get_model_by_name, _data_iter, get_dataset_info,
   # model_kwargs and ctx are project-specific helpers/values defined elsewhere.
   from mxnet import nd, autograd

   resume = _validate_checkpoint('fcn', 'fcn_resnet50_Cityscapes_11_06_23_42_45_best.params')
   net1 = get_model_by_name('fcn', ctx=ctx, model_kwargs=model_kwargs)
   net1.load_parameters(resume, ctx=ctx, ignore_extra=True)
   net2 = get_model_by_name('fcn', ctx=ctx, model_kwargs=model_kwargs)
   net2.load_parameters(resume, ctx=ctx, ignore_extra=True)

   train_iter, num_train = _data_iter('Cityscapes', batch_size=1, shuffle=False,
                                      last_batch='keep',
                                      root=get_dataset_info('Cityscapes')[0],
                                      split='train', mode='val',
                                      base_size=2048, crop_size=224)
   
   from gluoncv.loss import SoftmaxCrossEntropyLoss
   loss2 = SoftmaxCrossEntropyLoss()
   
   _ignore_label = -1
   _sparse_label = True
   _size_average = False
   _batch_axis = 0
   
   def hybrid_forward(F, pred, label):
       """Compute loss"""
       softmaxout = F.SoftmaxOutput(
           pred, label.astype(pred.dtype), ignore_label=_ignore_label,
           multi_output=_sparse_label,
           use_ignore=True, normalization='valid' if _size_average else 'null')
       loss = -F.pick(F.log(softmaxout), label, axis=1, keepdims=True)
       loss = F.where(label.expand_dims(axis=1) == _ignore_label,
                      F.zeros_like(loss), loss)
       return F.mean(loss, axis=_batch_axis, exclude=True)
   
   for i, (data, target) in enumerate(train_iter):
       with autograd.record(True):
           # for comparison, remember to set dropout layer to None
           loss_11 = hybrid_forward(nd, *net1(data), target)
           loss_12 = loss2(*net2(data), target)
   
       autograd.backward([loss_11, loss_12])
       params1 = net1.collect_params()
       params2 = net2.collect_params()
       grad1 = params1['fcnresnet0_fcnhead0_conv0_weight'].grad()
       grad2 = params2['fcnresnet1_fcnhead0_conv0_weight'].grad()
       grad_diff = nd.sum(grad2 - grad1)   # renamed to avoid shadowing built-in sum
       grad_ratio = grad2 / grad1
   ```
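
   A note on how I compare the gradients above: nd.sum(grad2 - grad1) lets positive and negative differences cancel out, so here is a small elementwise check as well (max_rel_diff is a hypothetical helper of mine, not part of the script above):
   ```
   # Hypothetical helper: compare the two gradient NDArrays elementwise
   # instead of summing signed differences.
   from mxnet import nd

   def max_rel_diff(g1, g2, eps=1e-12):
       """Largest elementwise relative difference between two gradients."""
       diff = nd.abs(g2 - g1)
       scale = nd.maximum(nd.abs(g1), nd.abs(g2)) + eps
       return nd.max(diff / scale).asscalar()

   print(max_rel_diff(grad1, grad2))
   ```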
   With two identical models, I copied the hybrid_forward function from gluoncv.loss (the SoftmaxCrossEntropyLoss in GluonCV).
   Note that I have already removed the dropout layers from the models for this comparison.
   The copied function yields a loss_11 value that is equal to loss_12 (the SoftmaxCrossEntropyLoss from GluonCV),
   but after autograd.backward, grad1 (from the copied function) is not equal to grad2 (from gluoncv.loss.SoftmaxCrossEntropyLoss).
   Why does the forward pass give the same loss value while the gradients differ between the GluonCV loss and the copied function?
   The SoftmaxCrossEntropyLoss from GluonCV is quoted below; a debug screenshot follows under Occurrences.
   ```
   class SoftmaxCrossEntropyLoss(Loss):
       r"""SoftmaxCrossEntropyLoss with ignore labels
   
       Parameters
       ----------
       axis : int, default -1
           The axis to sum over when computing softmax and entropy.
       sparse_label : bool, default True
           Whether label is an integer array instead of probability distribution.
       from_logits : bool, default False
           Whether input is a log probability (usually from log_softmax) instead
           of unnormalized numbers.
       weight : float or None
           Global scalar weight for loss.
       batch_axis : int, default 0
           The axis that represents mini-batch.
       ignore_label : int, default -1
           The label to ignore.
       size_average : bool, default False
           Whether to re-scale loss with regard to ignored labels.
       """
       def __init__(self, sparse_label=True, batch_axis=0, ignore_label=-1,
                    size_average=True, **kwargs):
           super(SoftmaxCrossEntropyLoss, self).__init__(None, batch_axis, **kwargs)
           self._sparse_label = sparse_label
           self._ignore_label = ignore_label
           self._size_average = size_average
   
       def hybrid_forward(self, F, pred, label):
           """Compute loss"""
           softmaxout = F.SoftmaxOutput(
               pred, label.astype(pred.dtype), ignore_label=self._ignore_label,
               multi_output=self._sparse_label,
               use_ignore=True, normalization='valid' if self._size_average else 'null')
           if self._sparse_label:
               loss = -F.pick(F.log(softmaxout), label, axis=1, keepdims=True)
           else:
               label = _reshape_like(F, label, pred)
               loss = -F.sum(F.log(softmaxout) * label, axis=-1, keepdims=True)
           loss = F.where(label.expand_dims(axis=1) == self._ignore_label,
                          F.zeros_like(loss), loss)
           return F.mean(loss, axis=self._batch_axis, exclude=True)
   ```
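
   As a quick sanity check (my addition, not part of the original script), one can compare the module-level constants used by the copied function with the configuration actually stored on the loss2 instance created above:
   ```
   # Sanity check (hypothetical, not in the original script): confirm the copied
   # constants match the configuration of the gluoncv loss instance.
   print(_ignore_label, _sparse_label, _size_average)
   print(loss2._ignore_label, loss2._sparse_label, loss2._size_average)
   ```
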
   ## Occurrences
   
![image](https://user-images.githubusercontent.com/30623934/119309253-a4c38b80-bca0-11eb-9dea-fb809528ee42.png)
   Values in grad2 / grad1 are usually close to 1.09.

   I am stuck on this; please help if you know anything that might cause it.
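
   For reference, here is a minimal, model-free sketch of the same comparison (my addition; the 19-class NCHW shapes and the name copied_hybrid_forward are illustrative assumptions, not taken from the script above). If the difference comes from the loss itself rather than the model, this standalone version should already show it:
   ```
   from mxnet import nd, autograd
   from gluoncv.loss import SoftmaxCrossEntropyLoss

   def copied_hybrid_forward(F, pred, label, ignore_label=-1, sparse_label=True,
                             size_average=False, batch_axis=0):
       # Same computation as the module-level copy above.
       softmaxout = F.SoftmaxOutput(
           pred, label.astype(pred.dtype), ignore_label=ignore_label,
           multi_output=sparse_label,
           use_ignore=True, normalization='valid' if size_average else 'null')
       loss = -F.pick(F.log(softmaxout), label, axis=1, keepdims=True)
       loss = F.where(label.expand_dims(axis=1) == ignore_label,
                      F.zeros_like(loss), loss)
       return F.mean(loss, axis=batch_axis, exclude=True)

   # Two copies of the same logits so each loss gets its own gradient buffer.
   pred1 = nd.random.normal(shape=(1, 19, 8, 8))
   pred2 = pred1.copy()
   pred1.attach_grad()
   pred2.attach_grad()
   label = nd.random.randint(-1, 19, shape=(1, 8, 8)).astype('float32')

   loss_fn = SoftmaxCrossEntropyLoss()  # gluoncv defaults, as in the script above
   with autograd.record():
       l1 = copied_hybrid_forward(nd, pred1, label)
       l2 = loss_fn(pred2, label)
   autograd.backward([l1, l2])

   print('forward diff :', nd.max(nd.abs(l1 - l2)).asscalar())
   print('max grad diff:', nd.max(nd.abs(pred1.grad - pred2.grad)).asscalar())
   ```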
   

