I have been trying to run the following code, based on the GluonCV image classification tutorial [dive_deep_cifar10](https://gluon-cv.mxnet.io/build/examples_classification/dive_deep_cifar10.html).
 

While it produces the expected output, the process never terminates, even after it reaches the end of the script (saving the image **test.png**).

Similar code in PyTorch exits properly when it reaches the end of the script. Could anyone help me resolve this issue? Thank you.
```
from __future__ import division

import time

import numpy as np
import mxnet as mx
from mxnet import gluon
from mxnet import autograd as ag
from mxnet.gluon.data.vision import transforms

from gluoncv import data
from gluoncv.model_zoo import get_model
from gluoncv.utils import TrainingHistory
from gluoncv.data import transforms as gcv_transforms



def test(ctx, val_data):
    # Evaluate classification accuracy on the validation set.
    # Note: uses the global `net` defined under __main__ below.
    metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()



if __name__ == "__main__":

    # Pascal VOC detection datasets (loaded here but not actually used below)
    train_dataset = data.VOCDetection(splits=[(2007, 'trainval'), (2012, 'trainval')])
    val_dataset = data.VOCDetection(splits=[(2007, 'test')])
    
    
    num_gpus = 1
    ctx = [mx.gpu(i) for i in range(num_gpus)]
    
    # Get the model CIFAR_ResNet20_v1, with 10 output classes, without pre-trained weights
    net = get_model('cifar_resnet20_v1', classes=10)
    net.initialize(mx.init.Xavier(), ctx=ctx)
    
    
    transform_train = transforms.Compose([
        # Pad the image by 4 pixels on each side to 40x40, then randomly crop a 32x32 region
        gcv_transforms.RandomCrop(32, pad=4),
        # Randomly flip the image horizontally
        transforms.RandomFlipLeftRight(),
        # Transpose the image from height*width*num_channels to num_channels*height*width
        # and map values from [0, 255] to [0, 1]
        transforms.ToTensor(),
        # Normalize the image with mean and standard deviation calculated across all images
        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
    ])
    
    
    
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
    ])
    
    
    # Batch size per GPU
    per_device_batch_size = 128
    # Number of data loader workers
    num_workers = 8
    # Calculate effective total batch size
    batch_size = per_device_batch_size * num_gpus
    
    # Set train=True for training data
    # Set shuffle=True to shuffle the training data
    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard',
        num_workers=num_workers)
    
    # Set train=False for validation data
    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)
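    # NOTE: with num_workers=8, each of the two DataLoaders above appears to
    # keep its own pool of 8 worker processes alive; these presumably account
    # for the extra `python` entries in the `ps` output at the end of the post.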
    
    
    
    
    # Learning rate decay factor
    lr_decay = 0.1
    # Epochs where learning rate decays
    lr_decay_epoch = [80, 160, np.inf]
    
    # Nesterov accelerated gradient descent
    optimizer = 'nag'
    # Set parameters
    optimizer_params = {'learning_rate': 0.1, 'wd': 0.0001, 'momentum': 0.9}
    
    # Define our trainer for net
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    
    
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    
    
    train_metric = mx.metric.Accuracy()
    train_history = TrainingHistory(['training-error', 'validation-error'])
    
    
    
    epochs = 3
    lr_decay_count = 0
    
    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        train_loss = 0
    
        # Learning rate decay
        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate*lr_decay)
            lr_decay_count += 1
    
        # Loop through each batch of training data
        for i, batch in enumerate(train_data):
            # Extract data and label
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
    
            # AutoGrad
            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
    
            # Backpropagation
            for l in loss:
                l.backward()
    
            # Optimize: update parameters; step(batch_size) normalizes the
            # accumulated gradients by 1/batch_size before applying them
            trainer.step(batch_size)
    
            # Update metrics
            train_loss += sum([l.sum().asscalar() for l in loss])
            train_metric.update(label, output)
    
        name, acc = train_metric.get()
        # Evaluate on Validation data
        name, val_acc = test(ctx, val_data)
    
        # Update history and print metrics
        train_history.update([1-acc, 1-val_acc])
        print('[Epoch %d] train=%f val=%f loss=%f time: %f' %
            (epoch, acc, val_acc, train_loss, time.time()-tic))
    
    # We can plot the metric scores with:
    
    train_history.plot(save_path="test.png")
```
The output is as expected:

```
[Epoch 0] train=0.472676 val=0.606800 loss=72228.871788 time: 19.881742
[Epoch 1] train=0.663442 val=0.681100 loss=47372.536560 time: 19.804182
[Epoch 2] train=0.731751 val=0.734900 loss=37964.900452 time: 19.507770

```

Running `ps -a` while the process is still hanging shows:

```
    PID TTY          TIME CMD
    964 tty1     00:00:01 Xorg
   1112 tty1     00:00:00 gnome-session-b
   1764 tty2     00:01:00 Xorg
   1777 tty2     00:00:00 gnome-session-b
  10275 pts/1    00:01:23 python
  10315 pts/1    00:00:09 python
  10324 pts/1    00:00:09 python
  10333 pts/1    00:00:09 python
  10342 pts/1    00:00:09 python
  10351 pts/1    00:00:09 python
  10360 pts/1    00:00:09 python
  10369 pts/1    00:00:09 python
  10378 pts/1    00:00:09 python
  10391 pts/1    00:00:00 python
  10400 pts/1    00:00:00 python
  10409 pts/1    00:00:00 python
  10418 pts/1    00:00:00 python
  10427 pts/1    00:00:00 python
  10436 pts/1    00:00:00 python
  10445 pts/1    00:00:00 python
  10454 pts/1    00:00:00 python
  12313 pts/1    00:00:00 ps

```
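For reference, the sixteen lingering `python` entries would match the two `DataLoader`s with `num_workers=8` each (the eight with accumulated CPU time presumably being the training-loader workers). Below is a minimal sketch of workarounds I am considering, assuming the worker pools are what keeps the process alive; it reuses `transform_train` and `batch_size` from the script above, and none of these are a confirmed fix:

```
import os
from mxnet import gluon

# (1) Hypothetical: with num_workers=0 the DataLoader iterates in the main
#     process, so no worker pool is created in the first place.
train_data = gluon.data.DataLoader(
    gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
    batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=0)

# (2) Hypothetical: drop the loader references at the end of the script so
#     their worker pools can be garbage-collected before interpreter shutdown.
del train_data, val_data

# (3) Hypothetical and heavy-handed: exit immediately, skipping interpreter
#     shutdown entirely (and any join on leftover worker processes).
os._exit(0)
```

I would much rather understand the actual cause than resort to (3), so any pointers are appreciated.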




