barry-jin opened a new issue #20197: URL: https://github.com/apache/incubator-mxnet/issues/20197
## Description

GluonNLP v0.x branch CI is blocked after switching from MXNet 1.7.0.post1 to MXNet 1.8.0.post0 (tracked in https://github.com/dmlc/gluon-nlp/issues/1559). It looks like the sparse feature in MXNet 1.8.0 causes a segmentation fault.

### Error Message

```
Segmentation fault: 11

terminate called without an active exception
Aborted (core dumped)
```

## To Reproduce

```python3
import mxnet as mx
from mxnet import nd, gluon
from mxnet.gluon import Block, HybridBlock


class _Helper(HybridBlock):
    def __init__(self, num_classes, num_sampled, in_unit):
        super(_Helper, self).__init__()
        self._num_classes = num_classes
        self._num_sampled = num_sampled
        self._in_unit = in_unit

    def hybrid_forward(self, F, x, sampled_values, label, w_all, b_all):
        """Forward computation."""
        sampled_candidates, expected_count_sampled, expected_count_true = sampled_values
        # (num_sampled, in_unit)
        w_sampled = w_all.slice(begin=(0, 0), end=(self._num_sampled, None))
        w_true = w_all.slice(begin=(self._num_sampled, 0), end=(None, None))
        b_sampled = b_all.slice(begin=(0,), end=(self._num_sampled,))
        b_true = b_all.slice(begin=(self._num_sampled,), end=(None,))

        # true pred
        # (batch_size, 1)
        x = x.reshape((-1, self._in_unit))
        pred_true = (w_true * x).sum(axis=1) + b_true

        # samples pred
        # (batch_size, num_sampled)
        b_sampled = F.reshape(b_sampled, (-1,))
        pred_sampled = F.FullyConnected(x, weight=w_sampled, bias=b_sampled,
                                        num_hidden=self._num_sampled)

        # remove accidental hits
        label_vec = F.reshape(label, (-1, 1)).astype('int32')
        sample_vec = F.reshape(sampled_candidates, (1, -1)).astype('int32')
        mask = F.broadcast_equal(label_vec, sample_vec).astype('float32') * -1e37
        pred_sampled = pred_sampled + mask

        # subtract log(q)
        expected_count_sampled = expected_count_sampled.astype('float32')
        expected_count_sampled = expected_count_sampled.reshape(
            shape=(1, self._num_sampled))
        expected_count_true = expected_count_true.astype('float32').reshape((-1,))
        pred_true = pred_true - F.log(expected_count_true)
        pred_true = pred_true.reshape((-1, 1))
        pred_sampled = F.broadcast_sub(pred_sampled, F.log(expected_count_sampled))

        # pred and new_labels
        # (batch_size, 1+num_sampled)
        pred = F.concat(pred_true, pred_sampled, dim=1)
        new_label = F.zeros_like(label)
        return pred, new_label


class SimpleSparse(Block):
    def __init__(self, num_classes, num_sampled, in_unit):
        super(SimpleSparse, self).__init__()
        with self.name_scope():
            self.weight = self.params.get('weight', shape=(num_classes, in_unit),
                                          init=None, dtype='float32',
                                          grad_stype='row_sparse', stype='row_sparse')
            self.bias = self.params.get('bias', shape=(num_classes,),
                                        init='zeros', dtype='float32')
        self._num_classes = num_classes
        self._num_sampled = num_sampled
        self._in_unit = in_unit
        self._kwargs = {'input_dim': self._num_classes,
                        'output_dim': self._in_unit,
                        'sparse_grad': True}
        self._dense = _Helper(num_classes, num_sampled, in_unit)

    def forward(self, x, sampled_values, label):  # pylint: disable=arguments-differ
        """Forward computation."""
        sampled_candidates, _, _ = sampled_values
        # (batch_size,)
        label = label.reshape(shape=(-1,))
        # (num_sampled+batch_size,)
        ids = nd.concat(sampled_candidates.astype('int32'),
                        label.astype('int32'), dim=0)
        # lookup weights and biases
        weight = self.weight.row_sparse_data(ids)
        bias = self.bias.data(ids.context)
        # (num_sampled+batch_size, dim)
        w_all = nd.Embedding(data=ids, weight=weight, **self._kwargs)
        # (num_sampled+batch_size,)
        b_all = nd.take(bias, indices=ids)
        out, new_targets = self._dense(x, sampled_values, label, w_all, b_all)
        return out, new_targets
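
# The row_sparse pieces above (Parameter.row_sparse_data() plus
# nd.Embedding(..., sparse_grad=True) on a 'row_sparse'-stype weight) are
# presumably the "sparse feature" the description refers to; the driver
# below exercises them through one forward/backward pass.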

def test():
    ctx = mx.cpu()
    batch_size = 2
    num_sampled = 3
    vocab_size = 10
    num_hidden = 5
    model = SimpleSparse(vocab_size, num_sampled, num_hidden)
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    model.hybridize()
    model.initialize(mx.init.Xavier(), ctx=ctx)
    trainer = mx.gluon.Trainer(model.collect_params(), 'sgd')
    x = mx.nd.ones((batch_size, num_hidden))
    y = mx.nd.ones((batch_size,))
    sampled_cls = mx.nd.ones((num_sampled,), dtype='float32')
    sampled_cls_cnt = mx.nd.ones((num_sampled,), dtype='float32')
    true_cls_cnt = mx.nd.ones((batch_size,), dtype='float32')
    samples = (sampled_cls, sampled_cls_cnt, true_cls_cnt)
    with mx.autograd.record():
        pred, new_y = model(x, samples, y)
        l = loss(pred, new_y)
    l.backward()
    mx.nd.waitall()


if __name__ == '__main__':
    test()
```

### Steps to reproduce

Run the script above, or:

```
$ git clone https://github.com/dmlc/gluon-nlp
$ cd gluon-nlp
$ git checkout v0.x
$ python3 -m pip install -e .[extra,dev]
$ python3 -m pytest tests/unittest/test_sampled_logits.py::test_is_softmax_loss
```

## What have you tried to solve it?

1.
2.

## Environment

***We recommend using our script for collecting the diagnostic information with the following command***
`curl --retry 10 -s https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/diagnose.py | python3`

<details>
<summary>Environment Information</summary>

```
# Paste the diagnose.py command output here
```

</details>
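For reference, below is a smaller sketch that exercises only the suspected sparse path: a `row_sparse` parameter fetched with `Parameter.row_sparse_data` and fed through `nd.Embedding` with `sparse_grad=True`. The `TinySparse` block and its driver are hypothetical, added here only for illustration; they assume the fault lies in this path rather than in the sampled-softmax arithmetic of `_Helper`.

```python3
import mxnet as mx
from mxnet import autograd, gluon, nd


class TinySparse(gluon.Block):
    """Hypothetical reduced case: one row_sparse parameter read through a
    sparse-gradient embedding lookup, mirroring SimpleSparse above."""

    def __init__(self, vocab_size, dim):
        super(TinySparse, self).__init__()
        with self.name_scope():
            self.weight = self.params.get('weight', shape=(vocab_size, dim),
                                          dtype='float32',
                                          grad_stype='row_sparse',
                                          stype='row_sparse')
        self._vocab_size = vocab_size
        self._dim = dim

    def forward(self, ids):
        # Pull only the rows named by `ids`, as SimpleSparse.forward does.
        weight = self.weight.row_sparse_data(ids)
        return nd.Embedding(data=ids, weight=weight,
                            input_dim=self._vocab_size,
                            output_dim=self._dim,
                            sparse_grad=True)


net = TinySparse(vocab_size=10, dim=5)
net.initialize(mx.init.Xavier(), ctx=mx.cpu())
# A Trainer must exist before row_sparse_data is called on a
# row_sparse parameter.
trainer = gluon.Trainer(net.collect_params(), 'sgd')
ids = nd.array([1, 2, 3], dtype='int32')
with autograd.record():
    l = net(ids).sum()
l.backward()
trainer.step(1)
nd.waitall()
```

If this sketch also segfaults under MXNet 1.8.0.post0 but not 1.7.0.post1, that would narrow the regression to the row_sparse parameter path; if it passes, the interaction with hybridization or the slicing in `_Helper` is more likely at fault.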
