Github user felixcheung commented on a diff in the pull request: https://github.com/apache/spark/pull/21710#discussion_r223574367 --- Diff: R/pkg/R/mllib_fpm.R --- @@ -154,3 +160,74 @@ setMethod("write.ml", signature(object = "FPGrowthModel", path = "character"), function(object, path, overwrite = FALSE) { write_internal(object, path, overwrite) }) + +#' PrefixSpan +#' +#' A parallel PrefixSpan algorithm to mine frequent sequential patterns. +#' \code{spark.prefixSpan} returns an instance of PrefixSpan. +#' \code{spark.findFrequentSequentialPatterns} returns a complete set of frequent sequential +#' patterns. +#' For more details, see +#' \href{https://spark.apache.org/docs/latest/mllib-frequent-pattern-mining.html#prefixspan}{ +#' PrefixSpan}. +#' +#' @param minSupport Minimal support level. +#' @param maxPatternLength Maximal pattern length. +#' @param maxLocalProjDBSize Maximum number of items (including delimiters used in the internal +#' storage format) allowed in a projected database before local +#' processing. +#' @param sequenceCol name of the sequence column in dataset. +#' @param ... additional argument(s) passed to the method. 
+#' @return \code{spark.prefixSpan} returns an instance of PrefixSpan +#' @rdname spark.prefixSpan +#' @name spark.prefixSpan +#' @aliases spark.prefixSpan,ANY-method +#' @examples +#' \dontrun{ +#' df <- createDataFrame(list(list(list(list(1L, 2L), list(3L))), +#' list(list(list(1L), list(3L, 2L), list(1L, 2L))), +#' list(list(list(1L, 2L), list(5L))), +#' list(list(list(6L)))), schema = c("sequence")) +#' prefix_Span <- spark.prefixSpan(minSupport = 0.5, maxPatternLength = 5L, +#' maxLocalProjDBSize = 32000000L) +#' frequency <- spark.findFrequentSequentialPatterns(prefix_Span, df) +#' showDF(frequency) +#' } +#' @note spark.prefixSpan since 3.0.0 +setMethod("spark.prefixSpan", signature(), + function(minSupport=0.1, maxPatternLength=10L, + maxLocalProjDBSize=32000000L, sequenceCol="sequence") { + if (!is.numeric(minSupport) || minSupport < 0) { + stop("minSupport should be a number with value >= 0.") + } + if (!is.integer(maxPatternLength) || maxPatternLength <= 0) { + stop("maxPatternLength should be a number with value > 0.") + } + if (!is.numeric(maxLocalProjDBSize) || maxLocalProjDBSize <= 0) { + stop("maxLocalProjDBSize should be a number with value > 0.") + } + + jobj <- callJStatic("org.apache.spark.ml.r.PrefixSpanWrapper", "getPrefixSpan", + as.numeric(minSupport), as.integer(maxPatternLength), + as.numeric(maxLocalProjDBSize), as.character(sequenceCol)) + new("PrefixSpan", jobj = jobj) + }) + +# Find frequent sequential patterns. + +#' @param object a PrefixSpan object. +#' @param data A SparkDataFrame. +#' @return A complete set of frequent sequential patterns in the input sequences of itemsets. +#' The returned \code{SparkDataFrame} contains columns of sequence and corresponding +#' frequency. 
The schema of it will be: +#' \code{sequence: ArrayType(ArrayType(T))} (T is the item type) +#' \code{freq: Long} +#' @rdname spark.prefixSpan +#' @aliases findFrequentSequentialPatterns,PrefixSpan,SparkDataFrame-method +#' @note spark.findFrequentSequentialPatterns(PrefixSpan, SparkDataFrame) since 3.0.0 + +setMethod("spark.findFrequentSequentialPatterns", + signature(object = "PrefixSpan", data = "SparkDataFrame"), --- End diff -- Never mind, see the other comment instead.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org