Hi, I've been trying to train a model using the GPU on a server, but I'm 
getting the error:

    Check failed: e == CUBLAS_STATUS_SUCCESS (13 vs. 0) : cuBLAS: CUBLAS_STATUS_EXECUTION_FAILED

I found a few similar topics on the forum, and they all seem to point to problems with the installed CUDA version. However, on the same machine I can train object detection models on the GPU within the same environment without any errors, so I suspect the problem is in my code.
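
(For reference, something like the minimal check below should exercise the same cuBLAS matrix-multiply path; this is just a sketch, using the same gpu(3) index as my training code further down.)

```
import mxnet as mx

ctx = mx.gpu(3)                                       # same device index as in train()
a = mx.nd.random.uniform(shape=(256, 256), ctx=ctx)
b = mx.nd.random.uniform(shape=(256, 256), ctx=ctx)
c = mx.nd.dot(a, b)                                   # dense matmul runs through cuBLAS
mx.nd.waitall()                                       # block until the GPU work has actually finished
print(c.sum().asscalar())
```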


This is what I'm doing so far:

```
import mxnet as mx
import numpy as np
import pandas as pd
from mxnet import autograd, gluon, init
from mxnet import ndarray as nd
from mxnet.gluon import loss as gloss
from mxnet.gluon import nn


def main():

    ## read, preprocess and split the data
    ## (pre_process and split_data are my own helpers, omitted here)
    df_data = pd.read_csv('some_file.csv')
    df_data = pre_process(df_data)
    X_train, y_train, X_test, y_test = split_data(df_data)

    ## lr, batch_size and nr_epochs are set earlier in my script (omitted here)
    train(X_train, X_test, y_train, y_test, lr, batch_size, nr_epochs)


def train(X_train, X_test, y_train, y_test, lr, batch_size, nr_epochs):
    y_train = mx.nd.array(y_train.to_numpy().reshape(-1,1), dtype=np.float32)
    y_test = mx.nd.array(y_test.to_numpy().reshape(-1,1), dtype=np.float32)
    X_train = mx.nd.array(X_train.to_numpy(), dtype=np.float32)
    X_test = mx.nd.array(X_test.to_numpy(), dtype=np.float32)
    ## define ctx and load data on it
    ctx = mx.gpu(3)
    X_train = X_train.as_in_context(ctx)
    X_test = X_test.as_in_context(ctx)
    y_train = y_train.as_in_context(ctx)
    y_test = y_test.as_in_context(ctx)

    ##--------------------
    ##   building model
    ##--------------------
    batch = batch_size
    epochs = nr_epochs
    dataset = gluon.data.dataset.ArrayDataset(X_train, y_train)
    data_loader = gluon.data.DataLoader(dataset, batch_size=batch, shuffle=True)

    model = nn.Sequential()
    model.add(nn.Dense(64, activation='relu'))
    model.add(nn.Dense(1))
    model.initialize(init.Normal(sigma=0.01), ctx)
    model.collect_params().reset_ctx(ctx)
    loss = gloss.L2Loss()
    trainer = gluon.Trainer(model.collect_params(), 'sgd', {'learning_rate': lr})

    ##--------------------
    ##   training
    ##--------------------
    for epoch in range(1, epochs + 1):
        for X_batch, Y_batch in data_loader:
            with autograd.record():
                l = loss(model(X_batch), Y_batch)   ## forward pass and loss
            l.backward()                            ## backward pass
            trainer.step(batch)                     ## parameter update (gradient normalized by batch size)

```
The error occurs even if I try something as simple as:

    print(l)

within the for loop, so I assume there is something wrong in the code above?
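
From what I understand, MXNet runs GPU operations asynchronously, so the exception only surfaces at a synchronization point (print(l), .asnumpy(), etc.) and the call that actually failed may be an earlier one. To narrow it down, I was thinking of adding explicit mx.nd.waitall() calls around the loop, roughly like this (just a sketch of the loop above):

```
for X_batch, Y_batch in data_loader:
    with autograd.record():
        out = model(X_batch)
        mx.nd.waitall()          # error raised here -> forward pass
        l = loss(out, Y_batch)
        mx.nd.waitall()          # error raised here -> loss computation
    l.backward()
    mx.nd.waitall()              # error raised here -> backward pass
    trainer.step(batch)
```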




