This is an automated email from the ASF dual-hosted git repository.

qkou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
     new 5c8981a  [R] Initializer fix and adjustments to RNN API (#8121)
5c8981a is described below

commit 5c8981a740d7e05afb664f2228b85eb22a6d1666
Author: jeremiedb <jeremi...@users.noreply.github.com>
AuthorDate: Fri Nov 10 21:24:25 2017 -0500

    [R] Initializer fix and adjustments to RNN API (#8121)
---
 R-package/R/gru.R         | 355 ------------------------------------------
 R-package/R/initializer.R |  32 ++--
 R-package/R/lstm.R        | 388 ----------------------------------------------
 R-package/R/rnn.graph.R   | 160 ++++++++++++-------
 R-package/R/viz.graph.R   |   9 +-
 5 files changed, 125 insertions(+), 819 deletions(-)

diff --git a/R-package/R/gru.R b/R-package/R/gru.R
deleted file mode 100644
index d2ffd9a..0000000
--- a/R-package/R/gru.R
+++ /dev/null
@@ -1,355 +0,0 @@
-# gru cell symbol
-gru <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) {
-    if (dropout > 0)
-        indata <- mx.symbol.Dropout(data=indata, p=dropout)
-    i2h <- mx.symbol.FullyConnected(data=indata,
-                                    weight=param$gates.i2h.weight,
-                                    bias=param$gates.i2h.bias,
-                                    num.hidden=num.hidden * 2,
-                                    name=paste0("t", seqidx, ".l", layeridx, ".gates.i2h"))
-    h2h <- mx.symbol.FullyConnected(data=prev.state$h,
-                                    weight=param$gates.h2h.weight,
-                                    bias=param$gates.h2h.bias,
-                                    num.hidden=num.hidden * 2,
-                                    name=paste0("t", seqidx, ".l", layeridx, ".gates.h2h"))
-    gates <- i2h + h2h
-    slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=2,
-                                          name=paste0("t", seqidx, ".l", layeridx, ".slice"))
-    update.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid")
-    reset.gate <- mx.symbol.Activation(slice.gates[[2]], act.type="sigmoid")
-
-    htrans.i2h <- mx.symbol.FullyConnected(data=indata,
-                                           weight=param$trans.i2h.weight,
-                                           bias=param$trans.i2h.bias,
-                                           num.hidden=num.hidden,
-                                           name=paste0("t", seqidx, ".l", layeridx, ".trans.i2h"))
-    h.after.reset <- prev.state$h * reset.gate
-    htrans.h2h <- mx.symbol.FullyConnected(data=h.after.reset,
-                                           weight=param$trans.h2h.weight,
-                                           bias=param$trans.h2h.bias,
-                                           num.hidden=num.hidden,
-                                           name=paste0("t", seqidx, ".l", layeridx, ".trans.h2h"))
-    h.trans <- htrans.i2h + htrans.h2h
-    h.trans.active <- mx.symbol.Activation(h.trans, act.type="tanh")
-    next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h)
-    return (list(h=next.h))
-}
-
-# unrolled gru network
-gru.unroll <- function(num.gru.layer, seq.len, input.size,
-                       num.hidden, num.embed, num.label, dropout=0) {
-    embed.weight <- mx.symbol.Variable("embed.weight")
-    cls.weight <- mx.symbol.Variable("cls.weight")
-    cls.bias <- mx.symbol.Variable("cls.bias")
-    param.cells <- lapply(1:num.gru.layer, function(i) {
-        cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")),
-                     gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")),
-                     gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")),
-                     gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")),
-                     trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")),
-                     trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")),
-                     trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")),
-                     trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
-        return (cell)
-    })
-    last.states <- lapply(1:num.gru.layer, function(i) {
-        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
-        return (state)
-    })
-
-    # embeding layer
-    label <- mx.symbol.Variable("label")
-    data <- mx.symbol.Variable("data")
-    embed <- mx.symbol.Embedding(data=data, input.dim=input.size,
-                                 weight=embed.weight, output.dim=num.embed, name='embed')
-    wordvec <- mx.symbol.SliceChannel(data=embed, num.outputs=seq.len, squeeze.axis=1)
-
-    last.hidden <- list()
-    for (seqidx in 1:seq.len) {
-        hidden <- wordvec[[seqidx]]
-        # stack GRU
-        for (i in 1:num.gru.layer) {
-            dp <- ifelse(i==1, 0, dropout)
-            next.state <- gru(num.hidden, indata=hidden,
-                              prev.state=last.states[[i]],
-                              param=param.cells[[i]],
-                              seqidx=seqidx, layeridx=i,
-                              dropout=dp)
-            hidden <- next.state$h
-            last.states[[i]] <- next.state
-        }
-        # decoder
-        if (dropout > 0)
-            hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
-        last.hidden <- c(last.hidden, hidden)
-    }
-    last.hidden$dim <- 0
-    last.hidden$num.args <- seq.len
-    concat <- mxnet:::mx.varg.symbol.Concat(last.hidden)
-    fc <- mx.symbol.FullyConnected(data=concat,
-                                   weight=cls.weight,
-                                   bias=cls.bias,
-                                   num.hidden=num.label)
-
-    label <- mx.symbol.transpose(data=label)
-    label <- mx.symbol.Reshape(data=label, target.shape=c(0))
-
-    loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm")
-    return (loss.all)
-}
-
-# gru inference model symbol
-gru.inference.symbol <- function(num.gru.layer, seq.len, input.size,
-                                 num.hidden, num.embed, num.label, dropout=0) {
-    seqidx <- 1
-    embed.weight <- mx.symbol.Variable("embed.weight")
-    cls.weight <- mx.symbol.Variable("cls.weight")
-    cls.bias <- mx.symbol.Variable("cls.bias")
-
-    param.cells <- lapply(1:num.gru.layer, function(i) {
-        cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")),
-                     gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")),
-                     gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")),
-                     gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")),
-                     trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")),
-                     trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")),
-                     trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")),
-                     trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias")))
-        return (cell)
-    })
-    last.states <- lapply(1:num.gru.layer, function(i) {
-        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
-        return (state)
-    })
-
-    # embeding layer
-    data <- mx.symbol.Variable("data")
-    hidden <- mx.symbol.Embedding(data=data, input_dim=input.size,
-                                  weight=embed.weight, output_dim=num.embed, name="embed")
-
-    # stack GRU
-    for (i in 1:num.gru.layer) {
-        dp <- ifelse(i==1, 0, dropout)
-        next.state <- gru(num.hidden, indata=hidden,
-                          prev.state=last.states[[i]],
-                          param=param.cells[[i]],
-                          seqidx=seqidx, layeridx=i,
-                          dropout=dp)
-        hidden <- next.state$h
-        last.states[[i]] <- next.state
-    }
-    # decoder
-    if (dropout > 0)
-        hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
-
-    fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label,
-                                   weight=cls.weight, bias=cls.bias, name='pred')
-    sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm')
-    unpack.h <- lapply(1:num.gru.layer, function(i) {
-        state <- last.states[[i]]
-        state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h"))
-        return (state.h)
-    })
-
-    list.all <- c(sm, unpack.h)
-    return (mx.symbol.Group(list.all))
-}
-
-#' Training GRU Unrolled Model
-#'
-#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array)
-#'     The Training set.
-#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional
-#'     The validation set used for validation evaluation during the progress.
-#' @param num.gru.layer integer
-#'     The number of the layer of gru.
-#' @param seq.len integer
-#'     The length of the input sequence.
-#' @param num.hidden integer
-#'     The number of hidden nodes.
-#' @param num.embed integer
-#'     The output dim of embedding.
-#' @param num.label integer
-#'     The number of labels.
-#' @param batch.size integer
-#'     The batch size used for R array training.
-#' @param input.size integer
-#'     The input dim of one-hot encoding of embedding
-#' @param ctx mx.context, optional
-#'     The device used to perform training.
-#' @param num.round integer, default=10
-#'     The number of iterations over training data to train the model.
-#' @param update.period integer, default=1
-#'     The number of iterations to update parameters during training period.
-#' @param initializer initializer object. default=mx.init.uniform(0.01)
-#'     The initialization scheme for parameters.
-#' @param dropout float, default=0
-#'     A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @param optimizer string, default="sgd"
-#'     The optimization method.
-#' @param ... other parameters passing to \code{mx.gru}/.
-#' @return model A trained gru unrolled model.
-#'
-#' @export
-mx.gru <- function( train.data, eval.data=NULL,
-                    num.gru.layer, seq.len,
-                    num.hidden, num.embed, num.label,
-                    batch.size, input.size,
-                    ctx=mx.ctx.default(),
-                    num.round=10, update.period=1,
-                    initializer=mx.init.uniform(0.01),
-                    dropout=0, optimizer='sgd',
-                    ...) {
-    # check data and change data into iterator
-    train.data <- check.data(train.data, batch.size, TRUE)
-    eval.data <- check.data(eval.data, batch.size, FALSE)
-
-    # get unrolled gru symbol
-    rnn.sym <- gru.unroll( num.gru.layer=num.gru.layer,
-                           num.hidden=num.hidden,
-                           seq.len=seq.len,
-                           input.size=input.size,
-                           num.embed=num.embed,
-                           num.label=num.label,
-                           dropout=dropout)
-
-    init.states.name <- lapply(1:num.gru.layer, function(i) {
-        state.h <- paste0("l", i, ".init.h")
-        return (state.h)
-    })
-
-    # set up gru model
-    model <- setup.rnn.model(rnn.sym=rnn.sym,
-                             ctx=ctx,
-                             num.rnn.layer=num.gru.layer,
-                             seq.len=seq.len,
-                             num.hidden=num.hidden,
-                             num.embed=num.embed,
-                             num.label=num.label,
-                             batch.size=batch.size,
-                             input.size=input.size,
-                             init.states.name=init.states.name,
-                             initializer=initializer,
-                             dropout=dropout)
-
-    # train gru model
-    model <- train.rnn( model, train.data, eval.data,
-                        num.round=num.round,
-                        update.period=update.period,
-                        ctx=ctx,
-                        init.states.name=init.states.name,
-                        ...)
-    # change model into MXFeedForwardModel
-    model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays)
-    return(structure(model, class="MXFeedForwardModel"))
-}
-
-#' Create a GRU Inference Model
-#'
-#' @param num.gru.layer integer
-#'     The number of the layer of gru.
-#' @param input.size integer
-#'     The input dim of one-hot encoding of embedding
-#' @param num.hidden integer
-#'     The number of hidden nodes.
-#' @param num.embed integer
-#'     The output dim of embedding.
-#' @param num.label integer
-#'     The number of labels.
-#' @param batch.size integer, default=1
-#'     The batch size used for R array training.
-#' @param arg.params list
-#'     The batch size used for R array training.
-#' @param ctx mx.context, optional
-#'     Model parameter, list of name to NDArray of net's weights.
-#' @param dropout float, default=0
-#'     A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer)
-#'     A gru inference model.
-#'
-#' @export
-mx.gru.inference <- function(num.gru.layer,
-                             input.size,
-                             num.hidden,
-                             num.embed,
-                             num.label,
-                             batch.size=1,
-                             arg.params,
-                             ctx=mx.cpu(),
-                             dropout=0.) {
-    sym <- gru.inference.symbol(num.gru.layer=num.gru.layer,
-                                input.size=input.size,
-                                num.hidden=num.hidden,
-                                num.embed=num.embed,
-                                num.label=num.label,
-                                dropout=dropout)
-
-    init.states.name <- lapply(1:num.gru.layer, function(i) {
-        state.h <- paste0("l", i, ".init.h")
-        return (state.h)
-    })
-
-    seq.len <- 1
-    # set up gru model
-    model <- setup.rnn.model(rnn.sym=sym,
-                             ctx=ctx,
-                             num.rnn.layer=num.gru.layer,
-                             seq.len=seq.len,
-                             num.hidden=num.hidden,
-                             num.embed=num.embed,
-                             num.label=num.label,
-                             batch.size=batch.size,
-                             input.size=input.size,
-                             init.states.name=init.states.name,
-                             initializer=mx.init.uniform(0.01),
-                             dropout=dropout)
-    arg.names <- names(model$rnn.exec$ref.arg.arrays)
-    for (k in names(arg.params)) {
-        if ((k %in% arg.names) && is.param.name(k) ) {
-            rnn.input <- list()
-            rnn.input[[k]] <- arg.params[[k]]
-            mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE)
-        }
-    }
-    init.states <- list()
-    for (i in 1:num.gru.layer) {
-        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-    }
-    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-
-    return (model)
-}
-
-#' Using forward function to predict in gru inference model
-#'
-#' @param model gru model
-#'     A gru inference model
-#' @param input.data, array.matrix
-#'     The input data for forward function
-#' @param new.seq boolean, default=FALSE
-#'     Whether the input is the start of a new sequence
-#'
-#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model.
-#'
-#' @export
-mx.gru.forward <- function(model, input.data, new.seq=FALSE) {
-    if (new.seq == TRUE) {
-        init.states <- list()
-        for (i in 1:model$num.rnn.layer) {
-            init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-        }
-        mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-    }
-    dim(input.data) <- c(model$batch.size)
-    data <- list(data=mx.nd.array(input.data))
-    mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE)
-    mx.exec.forward(model$rnn.exec, is.train=FALSE)
-    init.states <- list()
-    for (i in 1:model$num.rnn.layer) {
-        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]]
-    }
-    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-    prob <- model$rnn.exec$ref.outputs[["sm_output"]]
-    return (list(prob=prob, model=model))
-}
-
diff --git a/R-package/R/initializer.R b/R-package/R/initializer.R
index 7a1ffb2..9f5e75b 100644
--- a/R-package/R/initializer.R
+++ b/R-package/R/initializer.R
@@ -4,11 +4,11 @@
 #' @param shape the shape of the array to be generated.
 #'
 mx.init.internal.default <- function(name, shape, ctx, allow.unknown=FALSE) {
-  if (endsWith(name, "bias")) return (mx.nd.zeros(shape, ctx))
-  if (endsWith(name, "gamma")) return (mx.nd.ones(shape, ctx))
-  if (endsWith(name, "beta")) return (mx.nd.zeros(shape, ctx))
-  if (endsWith(name, "moving_mean")) return (mx.nd.zeros(shape, ctx))
-  if (endsWith(name, "moving_var")) return (mx.nd.ones(shape, ctx))
+  if (endsWith(name, "bias")) return (mx.nd.zeros(shape))
+  if (endsWith(name, "gamma")) return (mx.nd.ones(shape))
+  if (endsWith(name, "beta")) return (mx.nd.zeros(shape))
+  if (endsWith(name, "moving_mean")) return (mx.nd.zeros(shape))
+  if (endsWith(name, "moving_var")) return (mx.nd.ones(shape))
   if (allow.unknown) return(NULL)
   stop(paste("Unkown initialization pattern for ", name))
 }
@@ -21,9 +21,9 @@ mx.init.internal.default <- function(name, shape, ctx, allow.unknown=FALSE) {
 mx.init.uniform <- function(scale) {
   function(name, shape, ctx, allow.unknown=FALSE) {
     if (!endsWith(name, "weight")) {
-      return (mx.init.internal.default(name, shape, ctx, allow.unknown))
+      return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown))
     }
-    return (mx.runif(shape, -scale, scale, ctx))
+    return (mx.nd.random.uniform(low = -scale, high = scale, shape = shape))
   }
 }
@@ -35,9 +35,9 @@ mx.init.uniform <- function(scale) {
 mx.init.normal <- function(sd) {
   function(name, shape, ctx, allow.unknown=FALSE) {
     if (!endsWith(name, "weight")) {
-      return (mx.init.internal.default(name, shape, ctx, allow.unknown))
+      return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown))
     }
-    return (mx.rnorm(shape, 0, sd, ctx))
+    return (mx.nd.random.normal(loc = 0, scale = sd, shape = shape))
   }
 }
@@ -56,9 +56,9 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
                            magnitude = 3){
   function(name, shape, ctx, allow.unknown = FALSE){
     if (!endsWith(name, "weight")) {
-      return (mx.init.internal.default(name, shape, ctx, allow.unknown))
+      return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown))
     }
-    
+
     fan_out = shape[length(shape)]
     fan_in = prod(shape[-length(shape)])
     factor_val = 1
@@ -71,13 +71,13 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
     } else {
      stop("Not supported factor type. See usage of function mx.init.Xavier")
    }
-    
+
    scale = sqrt(magnitude / factor_val)
-    
+
    if (rnd_type == "uniform"){
-      return(mx.runif(shape, -scale, scale, ctx))
+      return(mx.nd.random.uniform(low = -scale, high = scale, shape = shape))
    } else if (rnd_type == "gaussian"){
-      return(mx.rnorm(shape, 0, scale, ctx))
+      return(mx.nd.random.normal(loc = 0, scale = scale, shape = shape))
    } else {
      stop("Not supported random type. See usage of function mx.init.Xavier")
    }
@@ -92,7 +92,7 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg",
 #' @param ctx mx.context The context of the weights
 #' @param skip.unknown Whether skip the unknown weight types
 #' @export
-mx.init.create <- function(initializer, shape.array, ctx, skip.unknown=TRUE) {
+mx.init.create <- function(initializer, shape.array, ctx=NULL, skip.unknown=TRUE) {
   if (length(shape.array) == 0) return(list())
   names = names(shape.array)
   ret <- lapply(1 : length(names), function(i) {
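As a quick reference for the initializer change above: weight arrays are now drawn with mx.nd.random.uniform / mx.nd.random.normal instead of mx.runif / mx.rnorm, so no device context is needed at initialization time, and the ctx argument of mx.init.create defaults to NULL. A minimal sketch of the resulting usage; the parameter names and shapes below are illustrative, not taken from the commit:

    library(mxnet)

    # Xavier initializer with the defaults shown in the diff above
    init <- mx.init.Xavier(rnd_type = "uniform", factor_type = "avg", magnitude = 3)

    # Hypothetical shapes: a fully connected layer's weight and bias
    shapes <- list(fc1.weight = c(64, 128), fc1.bias = c(128))

    # ctx can now be omitted; weights come from mx.nd.random.uniform,
    # biases fall back to mx.init.internal.default (zeros)
    params <- mx.init.create(init, shapes, skip.unknown = TRUE)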
diff --git a/R-package/R/lstm.R b/R-package/R/lstm.R
deleted file mode 100644
index 6223889..0000000
--- a/R-package/R/lstm.R
+++ /dev/null
@@ -1,388 +0,0 @@
-# lstm cell symbol
-lstm <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) {
-    if (dropout > 0)
-        indata <- mx.symbol.Dropout(data=indata, p=dropout)
-    i2h <- mx.symbol.FullyConnected(data=indata,
-                                    weight=param$i2h.weight,
-                                    bias=param$i2h.bias,
-                                    num.hidden=num.hidden * 4,
-                                    name=paste0("t", seqidx, ".l", layeridx, ".i2h"))
-    h2h <- mx.symbol.FullyConnected(data=prev.state$h,
-                                    weight=param$h2h.weight,
-                                    bias=param$h2h.bias,
-                                    num.hidden=num.hidden * 4,
-                                    name=paste0("t", seqidx, ".l", layeridx, ".h2h"))
-    gates <- i2h + h2h
-    slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=4,
-                                          name=paste0("t", seqidx, ".l", layeridx, ".slice"))
-
-    in.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid")
-    in.transform <- mx.symbol.Activation(slice.gates[[2]], act.type="tanh")
-    forget.gate <- mx.symbol.Activation(slice.gates[[3]], act.type="sigmoid")
-    out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type="sigmoid")
-    next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform)
-    next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh")
-
-    return (list(c=next.c, h=next.h))
-}
-
-# unrolled lstm network
-lstm.unroll <- function(num.lstm.layer, seq.len, input.size,
-                        num.hidden, num.embed, num.label, dropout=0.)
-{
-
-    embed.weight <- mx.symbol.Variable("embed.weight")
-    cls.weight <- mx.symbol.Variable("cls.weight")
-    cls.bias <- mx.symbol.Variable("cls.bias")
-
-    param.cells <- lapply(1:num.lstm.layer, function(i) {
-        cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
-                     i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
-                     h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
-                     h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
-        return (cell)
-    })
-    last.states <- lapply(1:num.lstm.layer, function(i) {
-        state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")),
-                      h=mx.symbol.Variable(paste0("l", i, ".init.h")))
-        return (state)
-    })
-
-    # embeding layer
-    label <- mx.symbol.Variable("label")
-    data <- mx.symbol.Variable("data")
-    embed <- mx.symbol.Embedding(data=data, input_dim=input.size,
-                                 weight=embed.weight, output_dim=num.embed, name="embed")
-    wordvec <- mx.symbol.SliceChannel(data=embed, num_outputs=seq.len, squeeze_axis=1)
-
-    last.hidden <- list()
-    for (seqidx in 1:seq.len) {
-        hidden <- wordvec[[seqidx]]
-        # stack lstm
-        for (i in 1:num.lstm.layer) {
-            dp <- ifelse(i==1, 0, dropout)
-            next.state <- lstm(num.hidden, indata=hidden,
-                               prev.state=last.states[[i]],
-                               param=param.cells[[i]],
-                               seqidx=seqidx, layeridx=i,
-                               dropout=dp)
-            hidden <- next.state$h
-            last.states[[i]] <- next.state
-        }
-        # decoder
-        if (dropout > 0)
-            hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
-        last.hidden <- c(last.hidden, hidden)
-    }
-    last.hidden$dim <- 0
-    last.hidden$num.args <- seq.len
-    concat <- mxnet:::mx.varg.symbol.Concat(last.hidden)
-    fc <- mx.symbol.FullyConnected(data=concat,
-                                   weight=cls.weight,
-                                   bias=cls.bias,
-                                   num.hidden=num.label)
-
-    label <- mx.symbol.transpose(data=label)
-    label <- mx.symbol.Reshape(data=label, target.shape=c(0))
-
-    loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm")
-    return (loss.all)
-}
-
-# lstm inference model symbol
-lstm.inference.symbol <- function(num.lstm.layer, input.size,
-                                  num.hidden, num.embed, num.label, dropout=0.)
-{
-    seqidx <- 0
-    embed.weight <- mx.symbol.Variable("embed.weight")
-    cls.weight <- mx.symbol.Variable("cls.weight")
-    cls.bias <- mx.symbol.Variable("cls.bias")
-
-    param.cells <- lapply(1:num.lstm.layer, function(i) {
-        cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
-                     i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
-                     h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
-                     h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
-        return (cell)
-    })
-    last.states <- lapply(1:num.lstm.layer, function(i) {
-        state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")),
-                      h=mx.symbol.Variable(paste0("l", i, ".init.h")))
-        return (state)
-    })
-
-    # embeding layer
-    data <- mx.symbol.Variable("data")
-    hidden <- mx.symbol.Embedding(data=data, input_dim=input.size,
-                                  weight=embed.weight, output_dim=num.embed, name="embed")
-
-    # stack lstm
-    for (i in 1:num.lstm.layer) {
-        dp <- ifelse(i==1, 0, dropout)
-        next.state <- lstm(num.hidden, indata=hidden,
-                           prev.state=last.states[[i]],
-                           param=param.cells[[i]],
-                           seqidx=seqidx, layeridx=i,
-                           dropout=dp)
-        hidden <- next.state$h
-        last.states[[i]] <- next.state
-    }
-    # decoder
-    if (dropout > 0)
-        hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
-
-    fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label,
-                                   weight=cls.weight, bias=cls.bias, name='pred')
-    sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm')
-    unpack.c <- lapply(1:num.lstm.layer, function(i) {
-        state <- last.states[[i]]
-        state.c <- mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c"))
-        return (state.c)
-    })
-    unpack.h <- lapply(1:num.lstm.layer, function(i) {
-        state <- last.states[[i]]
-        state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h"))
-        return (state.h)
-    })
-
-    list.all <- c(sm, unpack.c, unpack.h)
-    return (mx.symbol.Group(list.all))
-}
-
-
-
-#' Training LSTM Unrolled Model
-#'
-#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array)
-#'     The Training set.
-#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional
-#'     The validation set used for validation evaluation during the progress.
-#' @param num.lstm.layer integer
-#'     The number of the layer of lstm.
-#' @param seq.len integer
-#'     The length of the input sequence.
-#' @param num.hidden integer
-#'     The number of hidden nodes.
-#' @param num.embed integer
-#'     The output dim of embedding.
-#' @param num.label integer
-#'     The number of labels.
-#' @param batch.size integer
-#'     The batch size used for R array training.
-#' @param input.size integer
-#'     The input dim of one-hot encoding of embedding
-#' @param ctx mx.context, optional
-#'     The device used to perform training.
-#' @param num.round integer, default=10
-#'     The number of iterations over training data to train the model.
-#' @param update.period integer, default=1
-#'     The number of iterations to update parameters during training period.
-#' @param initializer initializer object. default=mx.init.uniform(0.01)
-#'     The initialization scheme for parameters.
-#' @param dropout float, default=0
-#'     A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @param optimizer string, default="sgd"
-#'     The optimization method.
-#' @param epoch.end.callback function, optional
-#'     The callback when iteration ends.
-#' @param batch.end.callback function, optional
-#'     The callback when one mini-batch iteration ends.
-#' @param ... other parameters passing to \code{mx.lstm}/.
-#' @return model A trained lstm unrolled model.
-#'
-#' @export
-mx.lstm <- function(train.data, eval.data=NULL,
-                    num.lstm.layer, seq.len,
-                    num.hidden, num.embed, num.label,
-                    batch.size, input.size,
-                    ctx=mx.ctx.default(),
-                    num.round=10, update.period=1,
-                    initializer=mx.init.uniform(0.01),
-                    dropout=0, optimizer='sgd',
-                    epoch.end.callback=NULL, batch.end.callback=NULL,
-                    model,
-                    arg.params,
-                    ...) {
-    # check data and change data into iterator
-    train.data <- check.data(train.data, batch.size, TRUE)
-    eval.data <- check.data(eval.data, batch.size, FALSE)
-
-
-
-    # get unrolled lstm symbol
-    if(missing(model)){
-        rnn.sym <- lstm.unroll(num.lstm.layer=num.lstm.layer,
-                               num.hidden=num.hidden,
-                               seq.len=seq.len,
-                               input.size=input.size,
-                               num.embed=num.embed,
-                               num.label=num.label,
-                               dropout=dropout)
-    } else {
-        rnn.sym=model$symbol
-    }
-
-    init.states.c <- lapply(1:num.lstm.layer, function(i) {
-        state.c <- paste0("l", i, ".init.c")
-        return (state.c)
-    })
-    init.states.h <- lapply(1:num.lstm.layer, function(i) {
-        state.h <- paste0("l", i, ".init.h")
-        return (state.h)
-    })
-    init.states.name <- c(init.states.c, init.states.h)
-
-    # set up lstm model
-    model <- setup.rnn.model(rnn.sym=rnn.sym,
-                             ctx=ctx,
-                             num.rnn.layer=num.lstm.layer,
-                             seq.len=seq.len,
-                             num.hidden=num.hidden,
-                             num.embed=num.embed,
-                             num.label=num.label,
-                             batch.size=batch.size,
-                             input.size=input.size,
-                             init.states.name=init.states.name,
-                             initializer=initializer,
-                             dropout=dropout)
-    # restore states
-    if (!missing(arg.params)){
-        arg.names <- names(model$rnn.exec$ref.arg.arrays)
-        for (k in names(arg.params)) {
-            if ((k %in% arg.names) && is.param.name(k) ) {
-                rnn.input <- list()
-                rnn.input[[k]] <- arg.params[[k]]
-                mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE)
-            }
-        }
-    }
-
-    # train lstm model
-    model <- train.rnn( model, train.data, eval.data,
-                        num.round=num.round,
-                        update.period=update.period,
-                        ctx=ctx,
-                        init.states.name=init.states.name,
-                        epoch.end.callback=epoch.end.callback,
-                        batch.end.callback=batch.end.callback,
-                        ...)
-    # change model into MXFeedForwardModel
-    model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays)
-    return(structure(model, class="MXFeedForwardModel"))
-}
-
-
-#' Create a LSTM Inference Model
-#'
-#' @param num.lstm.layer integer
-#'     The number of the layer of lstm.
-#' @param input.size integer
-#'     The input dim of one-hot encoding of embedding
-#' @param num.hidden integer
-#'     The number of hidden nodes.
-#' @param num.embed integer
-#'     The output dim of embedding.
-#' @param num.label integer
-#'     The number of labels.
-#' @param batch.size integer, default=1
-#'     The batch size used for R array training.
-#' @param arg.params list
-#'     The batch size used for R array training.
-#' @param ctx mx.context, optional
-#'     Model parameter, list of name to NDArray of net's weights.
-#' @param dropout float, default=0
-#'     A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer)
-#'     A lstm inference model.
-#'
-#' @export
-mx.lstm.inference <- function(num.lstm.layer,
-                              input.size,
-                              num.hidden,
-                              num.embed,
-                              num.label,
-                              batch.size=1,
-                              arg.params,
-                              ctx=mx.cpu(),
-                              dropout=0.)
-{
-    sym <- lstm.inference.symbol(num.lstm.layer=num.lstm.layer,
-                                 input.size=input.size,
-                                 num.hidden=num.hidden,
-                                 num.embed=num.embed,
-                                 num.label=num.label,
-                                 dropout=dropout)
-
-    init.states.c <- lapply(1:num.lstm.layer, function(i) {
-        state.c <- paste0("l", i, ".init.c")
-        return (state.c)
-    })
-    init.states.h <- lapply(1:num.lstm.layer, function(i) {
-        state.h <- paste0("l", i, ".init.h")
-        return (state.h)
-    })
-    init.states.name <- c(init.states.c, init.states.h)
-
-    seq.len <- 1
-    # set up lstm model
-    model <- setup.rnn.model(rnn.sym=sym,
-                             ctx=ctx,
-                             num.rnn.layer=num.lstm.layer,
-                             seq.len=seq.len,
-                             num.hidden=num.hidden,
-                             num.embed=num.embed,
-                             num.label=num.label,
-                             batch.size=batch.size,
-                             input.size=input.size,
-                             init.states.name=init.states.name,
-                             initializer=mx.init.uniform(0.01),
-                             dropout=dropout)
-    arg.names <- names(model$rnn.exec$ref.arg.arrays)
-    for (k in names(arg.params)) {
-        if ((k %in% arg.names) && is.param.name(k) ) {
-            rnn.input <- list()
-            rnn.input[[k]] <- arg.params[[k]]
-            mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE)
-        }
-    }
-    init.states <- list()
-    for (i in 1:num.lstm.layer) {
-        init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
-        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-    }
-    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-
-    return (model)
-}
-
-#' Using forward function to predict in lstm inference model
-#'
-#' @param model lstm model
-#'     A Lstm inference model
-#' @param input.data, array.matrix
-#'     The input data for forward function
-#' @param new.seq boolean, default=FALSE
-#'     Whether the input is the start of a new sequence
-#'
-#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model.
-#'
-#' @export
-mx.lstm.forward <- function(model, input.data, new.seq=FALSE) {
-    if (new.seq == TRUE) {
-        init.states <- list()
-        for (i in 1:model$num.rnn.layer) {
-            init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0
-            init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
-        }
-        mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-    }
-    dim(input.data) <- c(model$batch.size)
-    data <- list(data=mx.nd.array(input.data))
-    mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE)
-    mx.exec.forward(model$rnn.exec, is.train=FALSE)
-    init.states <- list()
-    for (i in 1:model$num.rnn.layer) {
-        init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.c_output")]]
-        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]]
-    }
-    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
-    prob <- model$rnn.exec$ref.outputs[["sm_output"]]
-    return (list(prob=prob, model=model))
-}
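The mx.lstm/mx.gru wrappers removed above are superseded by symbols built with rnn.graph, whose adjusted signature follows in the next diff. A hedged sketch of a call against the new API; all argument values are illustrative, and num.decode replaces the former num.label:

    library(mxnet)

    # Sequence classification symbol using the fused mx.symbol.RNN operator
    sym <- rnn.graph(num.rnn.layer = 1,
                     input.size = 10000,      # vocabulary size, used only with embedding
                     num.embed = 64,          # NULL skips the embedding layer
                     num.hidden = 50,
                     num.decode = 2,          # output units of the decoding layer
                     dropout = 0.2,
                     ignore_label = -1,
                     loss_output = "softmax", # switch keys in this commit: softmax, linear, logictic, MAE
                     config = "seq-to-one",
                     cell.type = "lstm",
                     masking = TRUE)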
diff --git a/R-package/R/rnn.graph.R b/R-package/R/rnn.graph.R
index 11e5ef5..2c099f0 100644
--- a/R-package/R/rnn.graph.R
+++ b/R-package/R/rnn.graph.R
@@ -5,32 +5,33 @@
 #' @param cell.type Type of RNN cell: either gru or lstm
 #' @param num.rnn.layer int, number of stacked layers
 #' @param num.hidden int, size of the state in each RNN layer
-#' @param num.embed int, dimension of the embedding vectors
-#' @param num.label int, number of categories in labels
-#' @param input.size int, number of levels in the data
+#' @param num.embed int, default = NULL - no embedding. Dimension of the embedding vectors
+#' @param num.decode int, number of output variables in the decoding layer
+#' @param input.size int, number of levels in the data - only used for embedding
 #' @param dropout
 #'
 #' @export
 rnn.graph <- function(num.rnn.layer,
-                      input.size,
-                      num.embed,
+                      input.size = NULL,
+                      num.embed = NULL,
                       num.hidden,
-                      num.label,
+                      num.decode,
                       dropout = 0,
                       ignore_label = -1,
+                      loss_output = NULL,
                       config,
                       cell.type,
                       masking = F,
                       output_last_state = F) {
   
   # define input arguments
-  label <- mx.symbol.Variable("label")
   data <- mx.symbol.Variable("data")
+  label <- mx.symbol.Variable("label")
   seq.mask <- mx.symbol.Variable("seq.mask")
-  embed.weight <- mx.symbol.Variable("embed.weight")
-  rnn.params.weight <- mx.symbol.Variable("rnn.params.weight")
+  if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight")
+  
+  rnn.params.weight <- mx.symbol.Variable("rnn.params.weight")
   rnn.state <- mx.symbol.Variable("rnn.state")
   
   if (cell.type == "lstm") {
@@ -40,15 +41,17 @@ rnn.graph <- function(num.rnn.layer,
   cls.weight <- mx.symbol.Variable("cls.weight")
   cls.bias <- mx.symbol.Variable("cls.bias")
   
-  embed <- mx.symbol.Embedding(data=data, input_dim=input.size,
-                               weight=embed.weight, output_dim=num.embed, name="embed")
+  if (!is.null(num.embed)){
+    data <- mx.symbol.Embedding(data=data, input_dim=input.size,
+                                weight=embed.weight, output_dim=num.embed, name="embed")
+  }
   
   # RNN cells
   if (cell.type == "lstm") {
-    rnn <- mx.symbol.RNN(data=embed, state=rnn.state, state_cell = rnn.state.cell, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
+    rnn <- mx.symbol.RNN(data=data, state=rnn.state, state_cell = rnn.state.cell, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
   } else {
-    rnn <- mx.symbol.RNN(data=embed, state=rnn.state, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
+    rnn <- mx.symbol.RNN(data=data, state=rnn.state, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_"))
   }
   
   # Decode
@@ -57,30 +60,44 @@ rnn.graph <- function(num.rnn.layer,
     if (masking) mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
       mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = F, name = "mask")
     
-    fc <- mx.symbol.FullyConnected(data=mask,
-                                   weight=cls.weight,
-                                   bias=cls.bias,
-                                   num.hidden=num.label,
-                                   name = "decode")
+    decode <- mx.symbol.FullyConnected(data=mask,
+                                       weight=cls.weight,
+                                       bias=cls.bias,
+                                       num.hidden=num.decode,
+                                       name = "decode")
     
-    loss <- mx.symbol.SoftmaxOutput(data=fc, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss")
+    if (!is.null(loss_output)) {
+      loss <- switch(loss_output,
+                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
+                     logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
+                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      )
+    } else loss <- decode
     
   } else if (config=="one-to-one"){
    
    if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
      mask <- mx.symbol.identity(data = rnn[[1]], name = "mask")
-
-    reshape = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
+
+    mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
     
     decode <- mx.symbol.FullyConnected(data=reshape,
                                        weight=cls.weight,
                                        bias=cls.bias,
-                                       num.hidden=num.label,
+                                       num.hidden=num.decode,
                                        name = "decode")
     
     label <- mx.symbol.reshape(data=label, shape=c(-1), name = "label_reshape")
-    loss <- mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss")
+    if (!is.null(loss_output)) {
+      loss <- switch(loss_output,
+                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
+                     logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
+                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      )
+    } else loss <- decode
   }
   return(loss)
 }
@@ -176,21 +193,24 @@ gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dr
 #' unroll representation of RNN running on non CUDA device - under development
 #'
 #' @export
-rnn.unroll <- function(num.rnn.layer,
-                       seq.len,
-                       input.size,
-                       num.embed,
-                       num.hidden,
-                       num.label,
-                       dropout,
-                       ignore_label,
-                       init.state=NULL,
-                       config,
-                       cell.type="lstm",
-                       masking = F,
-                       output_last_state=F) {
-  
-  embed.weight <- mx.symbol.Variable("embed.weight")
+rnn.graph.unroll <- function(num.rnn.layer,
+                             seq.len,
+                             input.size = NULL,
+                             num.embed = NULL,
+                             num.hidden,
+                             num.decode,
+                             dropout = 0,
+                             ignore_label = -1,
+                             loss_output = NULL,
+                             init.state = NULL,
+                             config,
+                             cell.type = "lstm",
+                             masking = F,
+                             output_last_state = F) {
+  
+  
+  if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight")
+  
   cls.weight <- mx.symbol.Variable("cls.weight")
   cls.bias <- mx.symbol.Variable("cls.bias")
@@ -215,19 +235,22 @@ rnn.graph.unroll <- function(num.rnn.layer,
   })
   
   # embeding layer
-  label <- mx.symbol.Variable("label")
   data <- mx.symbol.Variable("data")
+  label <- mx.symbol.Variable("label")
+  seq.mask <- mx.symbol.Variable("seq.mask")
   
-  embed <- mx.symbol.Embedding(data = data, input_dim = input.size,
-                               weight=embed.weight, output_dim = num.embed, name = "embed")
+  if (!is.null(num.embed)) {
+    data <- mx.symbol.Embedding(data = data, input_dim = input.size,
+                                weight=embed.weight, output_dim = num.embed, name = "embed")
+  }
   
-  embed <- mx.symbol.split(data = embed, axis = 0, num.outputs = seq.len, squeeze_axis = T)
+  data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq.len, squeeze_axis = T)
   
   last.hidden <- list()
   last.states <- list()
   
   for (seqidx in 1:seq.len) {
-    hidden <- embed[[seqidx]]
+    hidden <- data[[seqidx]]
     
     for (i in 1:num.rnn.layer) {
@@ -250,34 +273,57 @@ rnn.graph.unroll <- function(num.rnn.layer,
       last.states[[i]] <- next.state
     }
    
-    # Decoding
-    if (config=="one-to-one"){
-      last.hidden <- c(last.hidden, hidden)
-    }
+    # Aggregate outputs from each timestep
+    last.hidden <- c(last.hidden, hidden)
   }
   
+  # concat hidden units - concat seq.len blocks of dimension num.hidden x batch.size
+  concat <- mx.symbol.concat(data = last.hidden, num.args = seq.len, dim = 0, name = "concat")
+  concat <- mx.symbol.reshape(data = concat, shape = c(num.hidden, -1, seq.len), name = "rnn_reshape")
+  
   if (config=="seq-to-one"){
-    fc <- mx.symbol.FullyConnected(data = hidden,
-                                   weight = cls.weight,
-                                   bias = cls.bias,
-                                   num.hidden = num.label)
-    loss <- mx.symbol.SoftmaxOutput(data = fc, name="sm", label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label)
+    
+    if (masking) mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
+      mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = F, name = "mask")
+    
+    decode <- mx.symbol.FullyConnected(data = mask,
+                                       weight = cls.weight,
+                                       bias = cls.bias,
+                                       num.hidden = num.decode,
+                                       name = "decode")
+    
+    if (!is.null(loss_output)) {
+      loss <- switch(loss_output,
+                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
+                     logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
+                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      )
+    } else loss <- decode
    
   } else if (config=="one-to-one"){
    
-    # concat hidden units - concat seq.len blocks of dimension num.hidden x batch.size
-    concat <- mx.symbol.concat(data = last.hidden, num.args = seq.len, dim = 0, name = "concat")
+    if (masking) mask <- mx.symbol.SequenceMask(data = concat, use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
+      mask <- mx.symbol.identity(data = concat, name = "mask")
+    
+    mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1))
    
-    decode <- mx.symbol.FullyConnected(data = concat,
+    decode <- mx.symbol.FullyConnected(data = mask,
                                        weight = cls.weight,
                                        bias = cls.bias,
-                                       num.hidden = num.label,
+                                       num.hidden = num.decode,
                                        name = "decode")
    
     label <- mx.symbol.reshape(data = label, shape = -1, name = "label_reshape")
-    loss <- mx.symbol.SoftmaxOutput(data = decode, name="sm", label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label)
+    if (!is.null(loss_output)) {
+      loss <- switch(loss_output,
+                     softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"),
+                     linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"),
+                     logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"),
+                     MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss")
+      )
+    } else loss <- decode
   }
   return(loss)
 }
diff --git a/R-package/R/viz.graph.R b/R-package/R/viz.graph.R
index aef90ad..6d13de0 100644
--- a/R-package/R/viz.graph.R
+++ b/R-package/R/viz.graph.R
@@ -123,11 +123,14 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi
                          stringsAsFactors=F)
   
   edges_df$from<- id_dic[as.character(edges_df$from)]
-  nodes_df_new<- create_node_df(n = nrow(nodes_df), label=nodes_df$label, shape=nodes_df$shape, type="base", penwidth=2, color=nodes_df$color, style="filled", fillcolor=adjustcolor(nodes_df$color, alpha.f = 1))
-  edge_df_new<- create_edge_df(from = edges_df$from, to=edges_df$to, color="black")
+  nodes_df_new<- create_node_df(n = nrow(nodes_df), label=nodes_df$label, shape=nodes_df$shape, type="base", penwidth=2, color=nodes_df$color, style="filled",
+                                fillcolor=adjustcolor(nodes_df$color, alpha.f = 1), fontcolor = "black")
+  edge_df_new<- create_edge_df(from = edges_df$from, to=edges_df$to, color="black", fontcolor = "black")
   
   if (!is.null(shape)){
-    edges_labels_raw<- symbol$get.internals()$infer.shape(list(data=shape))$out.shapes
+    if (is.list(shape)) {
+      edges_labels_raw<- symbol$get.internals()$infer.shape(shape)$out.shapes
+    } else edges_labels_raw<- symbol$get.internals()$infer.shape(list(data=shape))$out.shapes
     if (!is.null(edges_labels_raw)){
       edge_label_str <- function(x) paste0(x, collapse="X")
       edges_labels_raw<- sapply(edges_labels_raw, edge_label_str)

-- 
To stop receiving notification emails like this one, please contact
['"comm...@mxnet.apache.org" <comm...@mxnet.apache.org>'].