barry-jin opened a new issue #20197: URL: https://github.com/apache/incubator-mxnet/issues/20197
## Description

GluonNLP v0.x branch CI is blocked after switching from MXNet 1.7.0.post1 to MXNet 1.8.0.post0 (tracked in https://github.com/dmlc/gluon-nlp/issues/1559). It looks like the sparse feature in MXNet 1.8.0 causes a segmentation fault.

### Error Message

```
Segmentation fault: 11

terminate called without an active exception
Aborted (core dumped)
```

## To Reproduce

```python3
import mxnet as mx
from mxnet import nd, gluon
from mxnet.gluon import Block, HybridBlock


class _Helper(HybridBlock):
    def __init__(self, num_classes, num_sampled, in_unit):
        super(_Helper, self).__init__()
        self._num_classes = num_classes
        self._num_sampled = num_sampled
        self._in_unit = in_unit

    def hybrid_forward(self, F, x, sampled_values, label, w_all, b_all):
        """Forward computation."""
        sampled_candidates, expected_count_sampled, expected_count_true = sampled_values
        # (num_sampled, in_unit)
        w_sampled = w_all.slice(begin=(0, 0), end=(self._num_sampled, None))
        w_true = w_all.slice(begin=(self._num_sampled, 0), end=(None, None))
        b_sampled = b_all.slice(begin=(0,), end=(self._num_sampled,))
        b_true = b_all.slice(begin=(self._num_sampled,), end=(None,))

        # true pred
        # (batch_size, 1)
        x = x.reshape((-1, self._in_unit))
        pred_true = (w_true * x).sum(axis=1) + b_true

        # samples pred
        # (batch_size, num_sampled)
        b_sampled = F.reshape(b_sampled, (-1,))
        pred_sampled = F.FullyConnected(x, weight=w_sampled, bias=b_sampled,
                                        num_hidden=self._num_sampled)

        # remove accidental hits
        label_vec = F.reshape(label, (-1, 1)).astype('int32')
        sample_vec = F.reshape(sampled_candidates, (1, -1)).astype('int32')
        mask = F.broadcast_equal(label_vec, sample_vec).astype('float32') * -1e37
        pred_sampled = pred_sampled + mask

        # subtract log(q)
        expected_count_sampled = expected_count_sampled.astype('float32')
        expected_count_sampled = expected_count_sampled.reshape(
            shape=(1, self._num_sampled))
        expected_count_true = expected_count_true.astype('float32').reshape((-1,))
        pred_true = pred_true - F.log(expected_count_true)
        pred_true = pred_true.reshape((-1, 1))
        pred_sampled = F.broadcast_sub(pred_sampled, F.log(expected_count_sampled))

        # pred and new_labels
        # (batch_size, 1+num_sampled)
        pred = F.concat(pred_true, pred_sampled, dim=1)
        new_label = F.zeros_like(label)
        return pred, new_label


class SimpleSparse(Block):
    def __init__(self, num_classes, num_sampled, in_unit):
        super(SimpleSparse, self).__init__()
        with self.name_scope():
            self.weight = self.params.get('weight', shape=(num_classes, in_unit),
                                          init=None, dtype='float32',
                                          grad_stype='row_sparse', stype='row_sparse')
            self.bias = self.params.get('bias', shape=(num_classes,),
                                        init='zeros', dtype='float32')
        self._num_classes = num_classes
        self._num_sampled = num_sampled
        self._in_unit = in_unit
        self._kwargs = {'input_dim': self._num_classes,
                        'output_dim': self._in_unit,
                        'sparse_grad': True}
        self._dense = _Helper(num_classes, num_sampled, in_unit)

    def forward(self, x, sampled_values, label):  # pylint: disable=arguments-differ
        """Forward computation."""
        sampled_candidates, _, _ = sampled_values
        # (batch_size,)
        label = label.reshape(shape=(-1,))
        # (num_sampled+batch_size,)
        ids = nd.concat(sampled_candidates.astype('int32'),
                        label.astype('int32'), dim=0)
        # lookup weights and biases
        weight = self.weight.row_sparse_data(ids)
        bias = self.bias.data(ids.context)
        # (num_sampled+batch_size, dim)
        w_all = nd.Embedding(data=ids, weight=weight, **self._kwargs)
        # (num_sampled+batch_size,)
        b_all = nd.take(bias, indices=ids)
        out, new_targets = self._dense(x, sampled_values, label, w_all, b_all)
        return out, new_targets
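
# The row_sparse pieces above (Parameter.row_sparse_data() plus
# nd.Embedding(..., sparse_grad=True) on a 'row_sparse'-stype weight) are
# presumably the "sparse feature" the description refers to; the driver
# below exercises them through one forward/backward pass.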

def test():
    ctx = mx.cpu()
    batch_size = 2
    num_sampled = 3
    vocab_size = 10
    num_hidden = 5
    model = SimpleSparse(vocab_size, num_sampled, num_hidden)
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    model.hybridize()
    model.initialize(mx.init.Xavier(), ctx=ctx)
    trainer = mx.gluon.Trainer(model.collect_params(), 'sgd')
    x = mx.nd.ones((batch_size, num_hidden))
    y = mx.nd.ones((batch_size,))
    sampled_cls = mx.nd.ones((num_sampled,), dtype='float32')
    sampled_cls_cnt = mx.nd.ones((num_sampled,), dtype='float32')
    true_cls_cnt = mx.nd.ones((batch_size,), dtype='float32')
    samples = (sampled_cls, sampled_cls_cnt, true_cls_cnt)
    with mx.autograd.record():
        pred, new_y = model(x, samples, y)
        l = loss(pred, new_y)
    l.backward()
    mx.nd.waitall()


if __name__ == '__main__':
    test()
```

### Steps to reproduce

Run the script above, or:

```
$ git clone https://github.com/dmlc/gluon-nlp
$ cd gluon-nlp
$ git checkout v0.x
$ python3 -m pip install -e .[extra,dev]
$ python3 -m pytest tests/unittest/test_sampled_logits.py::test_is_softmax_loss
```

## What have you tried to solve it?

1.
2.

## Environment

***We recommend using our script for collecting the diagnostic information with the following command***
`curl --retry 10 -s https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/diagnose.py | python3`

<details>
<summary>Environment Information</summary>

```
# Paste the diagnose.py command output here
```

</details>
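For reference, below is a smaller sketch that exercises only the suspected sparse path: a `row_sparse` parameter fetched with `Parameter.row_sparse_data` and fed through `nd.Embedding` with `sparse_grad=True`. The `TinySparse` block and its driver are hypothetical, added here only for illustration; they assume the fault lies in this path rather than in the sampled-softmax arithmetic of `_Helper`.

```python3
import mxnet as mx
from mxnet import autograd, gluon, nd


class TinySparse(gluon.Block):
    """Hypothetical reduced case: one row_sparse parameter read through a
    sparse-gradient embedding lookup, mirroring SimpleSparse above."""

    def __init__(self, vocab_size, dim):
        super(TinySparse, self).__init__()
        with self.name_scope():
            self.weight = self.params.get('weight', shape=(vocab_size, dim),
                                          dtype='float32',
                                          grad_stype='row_sparse',
                                          stype='row_sparse')
        self._vocab_size = vocab_size
        self._dim = dim

    def forward(self, ids):
        # Pull only the rows named by `ids`, as SimpleSparse.forward does.
        weight = self.weight.row_sparse_data(ids)
        return nd.Embedding(data=ids, weight=weight,
                            input_dim=self._vocab_size,
                            output_dim=self._dim,
                            sparse_grad=True)


net = TinySparse(vocab_size=10, dim=5)
net.initialize(mx.init.Xavier(), ctx=mx.cpu())
# A Trainer must exist before row_sparse_data is called on a
# row_sparse parameter.
trainer = gluon.Trainer(net.collect_params(), 'sgd')
ids = nd.array([1, 2, 3], dtype='int32')
with autograd.record():
    l = net(ids).sum()
l.backward()
trainer.step(1)
nd.waitall()
```

If this sketch also segfaults under MXNet 1.8.0.post0 but not 1.7.0.post1, that would narrow the regression to the row_sparse parameter path; if it passes, the interaction with hybridization or the slicing in `_Helper` is more likely at fault.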
