Repository: spark
Updated Branches:
  refs/heads/master 4c6c6711d -> fc64e83f9


[SPARK-24207][R] add R API for PrefixSpan

## What changes were proposed in this pull request?

add R API for PrefixSpan

## How was this patch tested?
add test in test_mllib_fpm.R

Author: Huaxin Gao <huax...@us.ibm.com>

Closes #21710 from huaxingao/spark-24207.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fc64e83f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fc64e83f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fc64e83f

Branch: refs/heads/master
Commit: fc64e83f9538d6b7e13359a4933a454ba7ed89ec
Parents: 4c6c671
Author: Huaxin Gao <huax...@us.ibm.com>
Authored: Sun Oct 21 12:32:43 2018 -0700
Committer: Felix Cheung <felixche...@apache.org>
Committed: Sun Oct 21 12:32:43 2018 -0700

----------------------------------------------------------------------
 R/pkg/NAMESPACE                                 |  3 +-
 R/pkg/R/generics.R                              |  4 ++
 R/pkg/R/mllib_fpm.R                             | 64 +++++++++++++++++++
 R/pkg/tests/fulltests/test_mllib_fpm.R          | 16 +++++
 R/pkg/vignettes/sparkr-vignettes.Rmd            | 13 ++++
 docs/ml-frequent-pattern-mining.md              | 53 ++++++++++++++++
 docs/sparkr.md                                  |  1 +
 .../examples/ml/JavaPrefixSpanExample.java      | 67 ++++++++++++++++++++
 .../src/main/python/ml/prefixspan_example.py    | 48 ++++++++++++++
 examples/src/main/r/ml/prefixSpan.R             | 42 ++++++++++++
 .../spark/examples/ml/PrefixSpanExample.scala   | 62 ++++++++++++++++++
 .../apache/spark/ml/r/PrefixSpanWrapper.scala   | 34 ++++++++++
 12 files changed, 406 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index c512284..36d7a9b 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -70,7 +70,8 @@ exportMethods("glm",
               "spark.svmLinear",
               "spark.fpGrowth",
               "spark.freqItemsets",
-              "spark.associationRules")
+              "spark.associationRules",
+              "spark.findFrequentSequentialPatterns")
 
 # Job group lifecycle management methods
 export("setJobGroup",

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index d501f73..045e075 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -1457,6 +1457,10 @@ setGeneric("spark.freqItemsets", function(object) { 
standardGeneric("spark.freqI
 #' @rdname spark.fpGrowth
 setGeneric("spark.associationRules", function(object) { 
standardGeneric("spark.associationRules") })
 
+#' @rdname spark.prefixSpan
+setGeneric("spark.findFrequentSequentialPatterns",
+            function(data, ...) { 
standardGeneric("spark.findFrequentSequentialPatterns") })
+
 #' @param object a fitted ML model object.
 #' @param path the directory where the model is saved.
 #' @param ... additional argument(s) passed to the method.

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/R/pkg/R/mllib_fpm.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/mllib_fpm.R b/R/pkg/R/mllib_fpm.R
index 4ad34fe..ac37580 100644
--- a/R/pkg/R/mllib_fpm.R
+++ b/R/pkg/R/mllib_fpm.R
@@ -23,6 +23,12 @@
 #' @note FPGrowthModel since 2.2.0
 setClass("FPGrowthModel", slots = list(jobj = "jobj"))
 
+#' S4 class that represents a PrefixSpan
+#'
+#' @param jobj a Java object reference to the backing Scala PrefixSpan
+#' @note PrefixSpan since 3.0.0
+setClass("PrefixSpan", slots = list(jobj = "jobj"))
+
 #' FP-growth
 #'
 #' A parallel FP-growth algorithm to mine frequent itemsets.
@@ -155,3 +161,61 @@ setMethod("write.ml", signature(object = "FPGrowthModel", 
path = "character"),
           function(object, path, overwrite = FALSE) {
             write_internal(object, path, overwrite)
           })
+
+#' PrefixSpan
+#'
+#' A parallel PrefixSpan algorithm to mine frequent sequential patterns.
+#' \code{spark.findFrequentSequentialPatterns} returns a complete set of 
frequent sequential
+#' patterns.
+#' For more details, see
+#' 
\href{https://spark.apache.org/docs/latest/mllib-frequent-pattern-mining.html#prefixspan}{
+#' PrefixSpan}.
+#'
+#  Find frequent sequential patterns.
+#' @param data A SparkDataFrame.
+#' @param minSupport Minimal support level.
+#' @param maxPatternLength Maximal pattern length.
+#' @param maxLocalProjDBSize Maximum number of items (including delimiters 
used in the internal
+#'                           storage format) allowed in a projected database 
before local
+#'                           processing.
+#' @param sequenceCol name of the sequence column in dataset.
+#' @param ... additional argument(s) passed to the method.
+#' @return A complete set of frequent sequential patterns in the input 
sequences of itemsets.
+#'         The returned \code{SparkDataFrame} contains columns of sequence and 
corresponding
+#'         frequency. The schema of it will be:
+#'         \code{sequence: ArrayType(ArrayType(T))} (T is the item type)
+#'         \code{freq: Long}
+#' @rdname spark.prefixSpan
+#' @aliases spark.findFrequentSequentialPatterns,SparkDataFrame-method
+#' @examples
+#' \dontrun{
+#' df <- createDataFrame(list(list(list(list(1L, 2L), list(3L))),
+#'                       list(list(list(1L), list(3L, 2L), list(1L, 2L))),
+#'                       list(list(list(1L, 2L), list(5L))),
+#'                       list(list(list(6L)))), schema = c("sequence"))
+#' frequency <- spark.findFrequentSequentialPatterns(df, minSupport = 0.5, 
maxPatternLength = 5L,
+#'                                                   maxLocalProjDBSize = 
32000000L)
+#' showDF(frequency)
+#' }
+#' @note spark.findFrequentSequentialPatterns(SparkDataFrame) since 3.0.0
+setMethod("spark.findFrequentSequentialPatterns",
+          signature(data = "SparkDataFrame"),
+          function(data, minSupport = 0.1, maxPatternLength = 10L,
+            maxLocalProjDBSize = 32000000L, sequenceCol = "sequence") {
+              if (!is.numeric(minSupport) || minSupport < 0) {
+                stop("minSupport should be a number with value >= 0.")
+              }
+              if (!is.integer(maxPatternLength) || maxPatternLength <= 0) {
+                stop("maxPatternLength should be a number with value > 0.")
+              }
+              if (!is.numeric(maxLocalProjDBSize) || maxLocalProjDBSize <= 0) {
+                stop("maxLocalProjDBSize should be a number with value > 0.")
+              }
+
+              jobj <- callJStatic("org.apache.spark.ml.r.PrefixSpanWrapper", 
"getPrefixSpan",
+                                  as.numeric(minSupport), 
as.integer(maxPatternLength),
+                                  as.numeric(maxLocalProjDBSize), 
as.character(sequenceCol))
+              object <- new("PrefixSpan", jobj = jobj)
+              dataFrame(callJMethod(object@jobj, 
"findFrequentSequentialPatterns", data@sdf))
+            }
+          )

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/R/pkg/tests/fulltests/test_mllib_fpm.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_fpm.R 
b/R/pkg/tests/fulltests/test_mllib_fpm.R
index d80f66a..daf9ff9 100644
--- a/R/pkg/tests/fulltests/test_mllib_fpm.R
+++ b/R/pkg/tests/fulltests/test_mllib_fpm.R
@@ -83,4 +83,20 @@ test_that("spark.fpGrowth", {
 
 })
 
+test_that("spark.prefixSpan", {
+    df <- createDataFrame(list(list(list(list(1L, 2L), list(3L))),
+                          list(list(list(1L), list(3L, 2L), list(1L, 2L))),
+                          list(list(list(1L, 2L), list(5L))),
+                          list(list(list(6L)))), schema = c("sequence"))
+    result1 <- spark.findFrequentSequentialPatterns(df, minSupport = 0.5, 
maxPatternLength = 5L,
+                                                    maxLocalProjDBSize = 
32000000L)
+
+    expected_result <- createDataFrame(list(list(list(list(1L)), 3L),
+                                            list(list(list(3L)), 2L),
+                                            list(list(list(2L)), 3L),
+                                            list(list(list(1L, 2L)), 3L),
+                                            list(list(list(1L), list(3L)), 
2L)),
+                                            schema = c("sequence", "freq"))
+  })
+
 sparkR.session.stop()

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/R/pkg/vignettes/sparkr-vignettes.Rmd
----------------------------------------------------------------------
diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd 
b/R/pkg/vignettes/sparkr-vignettes.Rmd
index ad93494..7d924ef 100644
--- a/R/pkg/vignettes/sparkr-vignettes.Rmd
+++ b/R/pkg/vignettes/sparkr-vignettes.Rmd
@@ -542,6 +542,7 @@ SparkR supports the following machine learning models and 
algorithms.
 #### Frequent Pattern Mining
 
 * FP-growth
+* PrefixSpan
 
 #### Statistics
 
@@ -998,6 +999,18 @@ We can make predictions based on the `antecedent`.
 head(predict(fpm, df))
 ```
 
+#### PrefixSpan
+
+`spark.findFrequentSequentialPatterns` method can be used to find the complete 
set of frequent sequential patterns in the input sequences of itemsets.
+
+```{r}
+df <- createDataFrame(list(list(list(list(1L, 2L), list(3L))),
+                      list(list(list(1L), list(3L, 2L), list(1L, 2L))),
+                      list(list(list(1L, 2L), list(5L))),
+                      list(list(list(6L)))), schema = c("sequence"))
+head(spark.findFrequentSequentialPatterns(df, minSupport = 0.5, 
maxPatternLength = 5L))
+```
+
 #### Kolmogorov-Smirnov Test
 
 `spark.kstest` runs a two-sided, one-sample [Kolmogorov-Smirnov (KS) 
test](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test).

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/docs/ml-frequent-pattern-mining.md
----------------------------------------------------------------------
diff --git a/docs/ml-frequent-pattern-mining.md 
b/docs/ml-frequent-pattern-mining.md
index 81634de..c2043d4 100644
--- a/docs/ml-frequent-pattern-mining.md
+++ b/docs/ml-frequent-pattern-mining.md
@@ -85,3 +85,56 @@ Refer to the [R API docs](api/R/spark.fpGrowth.html) for 
more details.
 </div>
 
 </div>
+
+## PrefixSpan
+
+PrefixSpan is a sequential pattern mining algorithm described in
+[Pei et al., Mining Sequential Patterns by Pattern-Growth: The
+PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). We refer
+the reader to the referenced paper for formalizing the sequential
+pattern mining problem.
+
+`spark.ml`'s PrefixSpan implementation takes the following parameters:
+
+* `minSupport`: the minimum support required to be considered a frequent
+  sequential pattern.
+* `maxPatternLength`: the maximum length of a frequent sequential
+  pattern. Any frequent pattern exceeding this length will not be
+  included in the results.
+* `maxLocalProjDBSize`: the maximum number of items allowed in a
+  prefix-projected database before local iterative processing of the
+  projected database begins. This parameter should be tuned with respect
+  to the size of your executors.
+* `sequenceCol`: the name of the sequence column in dataset (default 
"sequence"), rows with
+  nulls in this column are ignored.
+
+**Examples**
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+Refer to the [Scala API 
docs](api/scala/index.html#org.apache.spark.ml.fpm.PrefixSpan) for more details.
+
+{% include_example scala/org/apache/spark/examples/ml/PrefixSpanExample.scala 
%}
+</div>
+
+<div data-lang="java" markdown="1">
+Refer to the [Java API docs](api/java/org/apache/spark/ml/fpm/PrefixSpan.html) 
for more details.
+
+{% include_example 
java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+Refer to the [Python API 
docs](api/python/pyspark.ml.html#pyspark.ml.fpm.PrefixSpan) for more details.
+
+{% include_example python/ml/prefixspan_example.py %}
+</div>
+
+<div data-lang="r" markdown="1">
+
+Refer to the [R API docs](api/R/spark.prefixSpan.html) for more details.
+
+{% include_example r/ml/prefixSpan.R %}
+</div>
+
+</div>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/docs/sparkr.md
----------------------------------------------------------------------
diff --git a/docs/sparkr.md b/docs/sparkr.md
index e6ec9ee..ba4cca8 100644
--- a/docs/sparkr.md
+++ b/docs/sparkr.md
@@ -510,6 +510,7 @@ SparkR supports the following machine learning algorithms 
currently:
 #### Frequent Pattern Mining
 
 * [`spark.fpGrowth`](api/R/spark.fpGrowth.html) : 
[`FP-growth`](ml-frequent-pattern-mining.html#fp-growth)
+* [`spark.prefixSpan`](api/R/spark.prefixSpan.html) : 
[`PrefixSpan`](ml-frequent-pattern-mining.html#prefixspan)
 
 #### Statistics
 

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java
----------------------------------------------------------------------
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java
new file mode 100644
index 0000000..891f306
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+// $example on$
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.ml.fpm.PrefixSpan;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.*;
+// $example off$
+
+/**
+ * An example demonstrating PrefixSpan.
+ * Run with
+ * <pre>
+ * bin/run-example ml.JavaPrefixSpanExample
+ * </pre>
+ */
+public class JavaPrefixSpanExample {
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession
+      .builder()
+      .appName("JavaPrefixSpanExample")
+      .getOrCreate();
+
+    // $example on$
+    List<Row> data = Arrays.asList(
+      RowFactory.create(Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3))),
+      RowFactory.create(Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), 
Arrays.asList(1,2))),
+      RowFactory.create(Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5))),
+      RowFactory.create(Arrays.asList(Arrays.asList(6)))
+    );
+    StructType schema = new StructType(new StructField[]{ new StructField(
+      "sequence", new ArrayType(new ArrayType(DataTypes.IntegerType, true), 
true),
+      false, Metadata.empty())
+    });
+    Dataset<Row> sequenceDF = spark.createDataFrame(data, schema);
+
+    PrefixSpan prefixSpan = new 
PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5);
+
+    // Finding frequent sequential patterns
+    prefixSpan.findFrequentSequentialPatterns(sequenceDF).show();
+    // $example off$
+
+    spark.stop();
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/examples/src/main/python/ml/prefixspan_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/prefixspan_example.py 
b/examples/src/main/python/ml/prefixspan_example.py
new file mode 100644
index 0000000..88d1d41
--- /dev/null
+++ b/examples/src/main/python/ml/prefixspan_example.py
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+An example demonstrating PrefixSpan.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/prefixspan_example.py
+"""
+# $example on$
+from pyspark.ml.fpm import PrefixSpan
+# $example off$
+from pyspark.sql import Row, SparkSession
+
+if __name__ == "__main__":
+    spark = SparkSession\
+        .builder\
+        .appName("PrefixSpanExample")\
+        .getOrCreate()
+    sc = spark.sparkContext
+
+    # $example on$
+    df = sc.parallelize([Row(sequence=[[1, 2], [3]]),
+                         Row(sequence=[[1], [3, 2], [1, 2]]),
+                         Row(sequence=[[1, 2], [5]]),
+                         Row(sequence=[[6]])]).toDF()
+
+    prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5,
+                            maxLocalProjDBSize=32000000)
+
+    # Find frequent sequential patterns.
+    prefixSpan.findFrequentSequentialPatterns(df).show()
+    # $example off$
+
+    spark.stop()

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/examples/src/main/r/ml/prefixSpan.R
----------------------------------------------------------------------
diff --git a/examples/src/main/r/ml/prefixSpan.R 
b/examples/src/main/r/ml/prefixSpan.R
new file mode 100644
index 0000000..9b70573
--- /dev/null
+++ b/examples/src/main/r/ml/prefixSpan.R
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# To run this example use
+# ./bin/spark-submit examples/src/main/r/ml/prefixSpan.R
+
+# Load SparkR library into your R session
+library(SparkR)
+
+# Initialize SparkSession
+sparkR.session(appName = "SparkR-ML-prefixSpan-example")
+
+# $example on$
+# Load training data
+
+df <- createDataFrame(list(list(list(list(1L, 2L), list(3L))),
+                      list(list(list(1L), list(3L, 2L), list(1L, 2L))),
+                      list(list(list(1L, 2L), list(5L))),
+                      list(list(list(6L)))), schema = c("sequence"))
+
+# Finding frequent sequential patterns
+frequency <- spark.findFrequentSequentialPatterns(df, minSupport = 0.5, 
maxPatternLength = 5L,
+                                                  maxLocalProjDBSize = 
32000000L)
+showDF(frequency)
+
+# $example off$
+
+sparkR.session.stop()
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/examples/src/main/scala/org/apache/spark/examples/ml/PrefixSpanExample.scala
----------------------------------------------------------------------
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/PrefixSpanExample.scala 
b/examples/src/main/scala/org/apache/spark/examples/ml/PrefixSpanExample.scala
new file mode 100644
index 0000000..0a2d310
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/PrefixSpanExample.scala
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+// scalastyle:off println
+
+// $example on$
+import org.apache.spark.ml.fpm.PrefixSpan
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/**
+ * An example demonstrating PrefixSpan.
+ * Run with
+ * {{{
+ * bin/run-example ml.PrefixSpanExample
+ * }}}
+ */
+object PrefixSpanExample {
+
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder
+      .appName(s"${this.getClass.getSimpleName}")
+      .getOrCreate()
+    import spark.implicits._
+
+    // $example on$
+    val smallTestData = Seq(
+      Seq(Seq(1, 2), Seq(3)),
+      Seq(Seq(1), Seq(3, 2), Seq(1, 2)),
+      Seq(Seq(1, 2), Seq(5)),
+      Seq(Seq(6)))
+
+    val df = smallTestData.toDF("sequence")
+    val result = new PrefixSpan()
+      .setMinSupport(0.5)
+      .setMaxPatternLength(5)
+      .setMaxLocalProjDBSize(32000000)
+      .findFrequentSequentialPatterns(df)
+      .show()
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/fc64e83f/mllib/src/main/scala/org/apache/spark/ml/r/PrefixSpanWrapper.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/PrefixSpanWrapper.scala 
b/mllib/src/main/scala/org/apache/spark/ml/r/PrefixSpanWrapper.scala
new file mode 100644
index 0000000..268d596
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/PrefixSpanWrapper.scala
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.r
+
+import org.apache.spark.ml.fpm.PrefixSpan
+
+private[r] object PrefixSpanWrapper {
+  def getPrefixSpan(
+      minSupport: Double,
+      maxPatternLength: Int,
+      maxLocalProjDBSize: Double,
+      sequenceCol: String): PrefixSpan = {
+    new PrefixSpan()
+      .setMinSupport(minSupport)
+      .setMaxPatternLength(maxPatternLength)
+      .setMaxLocalProjDBSize(maxLocalProjDBSize.toLong)
+      .setSequenceCol(sequenceCol)
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to