spark git commit: [SPARK-15946][MLLIB] Conversion between old/new vector columns in a DataFrame (Python)
Repository: spark Updated Branches: refs/heads/branch-2.0 f0de45cb1 -> 0a8fd2eb8 [SPARK-15946][MLLIB] Conversion between old/new vector columns in a DataFrame (Python) ## What changes were proposed in this pull request? This PR implements python wrappers for #13662 to convert old/new vector columns in a DataFrame. ## How was this patch tested? doctest in Python cc: yanboliang Author: Xiangrui Meng Closes #13731 from mengxr/SPARK-15946. (cherry picked from commit edb23f9e47eecfe60992dde0e037ec1985c77e1d) Signed-off-by: Yanbo Liang Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a8fd2eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a8fd2eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a8fd2eb Branch: refs/heads/branch-2.0 Commit: 0a8fd2eb8966afaff3030adef5fc6fd73171607c Parents: f0de45c Author: Xiangrui Meng Authored: Fri Jun 17 21:22:29 2016 -0700 Committer: Yanbo Liang Committed: Fri Jun 17 21:22:41 2016 -0700 -- .../spark/mllib/api/python/PythonMLLibAPI.scala | 14 python/pyspark/mllib/util.py| 82 2 files changed, 96 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a8fd2eb/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 7df6160..f2c70ba 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -1201,6 +1201,20 @@ private[python] class PythonMLLibAPI extends Serializable { val spark = SparkSession.builder().sparkContext(sc).getOrCreate() spark.createDataFrame(blockMatrix.blocks) } + + /** + * Python-friendly version of [[MLUtils.convertVectorColumnsToML()]]. 
+ */ + def convertVectorColumnsToML(dataset: DataFrame, cols: JArrayList[String]): DataFrame = { +MLUtils.convertVectorColumnsToML(dataset, cols.asScala: _*) + } + + /** + * Python-friendly version of [[MLUtils.convertVectorColumnsFromML()]] + */ + def convertVectorColumnsFromML(dataset: DataFrame, cols: JArrayList[String]): DataFrame = { +MLUtils.convertVectorColumnsFromML(dataset, cols.asScala: _*) + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/0a8fd2eb/python/pyspark/mllib/util.py -- diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index a316ee1..a7e6bcc 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -26,6 +26,7 @@ if sys.version > '3': from pyspark import SparkContext, since from pyspark.mllib.common import callMLlibFunc, inherit_doc from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector +from pyspark.sql import DataFrame class MLUtils(object): @@ -200,6 +201,86 @@ class MLUtils(object): """ return callMLlibFunc("loadVectors", sc, path) +@staticmethod +@since("2.0.0") +def convertVectorColumnsToML(dataset, *cols): +""" +Converts vector columns in an input DataFrame from the +:py:class:`pyspark.mllib.linalg.Vector` type to the new +:py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml` +package. + +:param dataset: + input dataset +:param cols: + a list of vector columns to be converted. + New vector columns will be ignored. If unspecified, all old + vector columns will be converted except nested ones. +:return: + the input dataset with old vector columns converted to the + new vector type + +>>> import pyspark +>>> from pyspark.mllib.linalg import Vectors +>>> from pyspark.mllib.util import MLUtils +>>> df = spark.createDataFrame( +... [(0, Vectors.sparse(2, [1], [1.0]), Vectors.dense(2.0, 3.0))], +... 
["id", "x", "y"]) +>>> r1 = MLUtils.convertVectorColumnsToML(df).first() +>>> isinstance(r1.x, pyspark.ml.linalg.SparseVector) +True +>>> isinstance(r1.y, pyspark.ml.linalg.DenseVector) +True +>>> r2 = MLUtils.convertVectorColumnsToML(df, "x").first() +>>> isinstance(r2.x, pyspark.ml.linalg.SparseVector) +True +>>> isinstance(r2.y, pyspark.mllib.linalg.DenseVector) +True +""" +if not isinstance(dataset, DataFrame): +raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) +return callMLlibFu
spark git commit: [SPARK-15946][MLLIB] Conversion between old/new vector columns in a DataFrame (Python)
Repository: spark Updated Branches: refs/heads/master af2a4b082 -> edb23f9e4 [SPARK-15946][MLLIB] Conversion between old/new vector columns in a DataFrame (Python) ## What changes were proposed in this pull request? This PR implements python wrappers for #13662 to convert old/new vector columns in a DataFrame. ## How was this patch tested? doctest in Python cc: yanboliang Author: Xiangrui Meng Closes #13731 from mengxr/SPARK-15946. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/edb23f9e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/edb23f9e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/edb23f9e Branch: refs/heads/master Commit: edb23f9e47eecfe60992dde0e037ec1985c77e1d Parents: af2a4b0 Author: Xiangrui Meng Authored: Fri Jun 17 21:22:29 2016 -0700 Committer: Yanbo Liang Committed: Fri Jun 17 21:22:29 2016 -0700 -- .../spark/mllib/api/python/PythonMLLibAPI.scala | 14 python/pyspark/mllib/util.py| 82 2 files changed, 96 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/edb23f9e/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 7df6160..f2c70ba 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -1201,6 +1201,20 @@ private[python] class PythonMLLibAPI extends Serializable { val spark = SparkSession.builder().sparkContext(sc).getOrCreate() spark.createDataFrame(blockMatrix.blocks) } + + /** + * Python-friendly version of [[MLUtils.convertVectorColumnsToML()]]. 
+ */ + def convertVectorColumnsToML(dataset: DataFrame, cols: JArrayList[String]): DataFrame = { +MLUtils.convertVectorColumnsToML(dataset, cols.asScala: _*) + } + + /** + * Python-friendly version of [[MLUtils.convertVectorColumnsFromML()]] + */ + def convertVectorColumnsFromML(dataset: DataFrame, cols: JArrayList[String]): DataFrame = { +MLUtils.convertVectorColumnsFromML(dataset, cols.asScala: _*) + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/edb23f9e/python/pyspark/mllib/util.py -- diff --git a/python/pyspark/mllib/util.py b/python/pyspark/mllib/util.py index a316ee1..a7e6bcc 100644 --- a/python/pyspark/mllib/util.py +++ b/python/pyspark/mllib/util.py @@ -26,6 +26,7 @@ if sys.version > '3': from pyspark import SparkContext, since from pyspark.mllib.common import callMLlibFunc, inherit_doc from pyspark.mllib.linalg import Vectors, SparseVector, _convert_to_vector +from pyspark.sql import DataFrame class MLUtils(object): @@ -200,6 +201,86 @@ class MLUtils(object): """ return callMLlibFunc("loadVectors", sc, path) +@staticmethod +@since("2.0.0") +def convertVectorColumnsToML(dataset, *cols): +""" +Converts vector columns in an input DataFrame from the +:py:class:`pyspark.mllib.linalg.Vector` type to the new +:py:class:`pyspark.ml.linalg.Vector` type under the `spark.ml` +package. + +:param dataset: + input dataset +:param cols: + a list of vector columns to be converted. + New vector columns will be ignored. If unspecified, all old + vector columns will be converted except nested ones. +:return: + the input dataset with old vector columns converted to the + new vector type + +>>> import pyspark +>>> from pyspark.mllib.linalg import Vectors +>>> from pyspark.mllib.util import MLUtils +>>> df = spark.createDataFrame( +... [(0, Vectors.sparse(2, [1], [1.0]), Vectors.dense(2.0, 3.0))], +... 
["id", "x", "y"]) +>>> r1 = MLUtils.convertVectorColumnsToML(df).first() +>>> isinstance(r1.x, pyspark.ml.linalg.SparseVector) +True +>>> isinstance(r1.y, pyspark.ml.linalg.DenseVector) +True +>>> r2 = MLUtils.convertVectorColumnsToML(df, "x").first() +>>> isinstance(r2.x, pyspark.ml.linalg.SparseVector) +True +>>> isinstance(r2.y, pyspark.mllib.linalg.DenseVector) +True +""" +if not isinstance(dataset, DataFrame): +raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset))) +return callMLlibFunc("convertVectorColumnsToML", dataset, list(cols)) + +@staticmethod +@since("2.0.0") +def con