Github user jkbradley commented on a diff in the pull request:
https://github.com/apache/spark/pull/20112#discussion_r159096655
--- Diff: python/pyspark/ml/feature.py ---
@@ -3466,6 +3466,72 @@ def selectedFeatures(self):
return self._call_java("selectedFeatures")
+@inherit_doc
+class VectorSizeHint(JavaTransformer, HasInputCol, HasHandleInvalid,
JavaMLReadable,
+ JavaMLWritable):
+"""
+A feature transformer that adds size information to the metadata of a
vector column.
+VectorAssembler needs size information for its input columns and
cannot be used on streaming
+dataframes without this metadata.
+
+>>> from pyspark.ml.linalg import Vectors
+>>> from pyspark.ml import Pipeline, PipelineModel
+>>> data = [(Vectors.dense([1., 2., 3.]), 4.)]
+>>> df = spark.createDataFrame(data, ["vector", "float"])
+>>>
+>>> sizeHint = VectorSizeHint(inputCol="vector", size=3,
handleInvalid="skip")
+>>> vecAssembler = VectorAssembler(inputCols=["vector", "float"],
outputCol="assembled")
+>>> pipeline = Pipeline(stages=[sizeHint, vecAssembler])
+>>>
+>>> pipelineModel = pipeline.fit(df)
+>>> pipelineModel.transform(df).head().assembled
+DenseVector([1.0, 2.0, 3.0, 4.0])
+>>> vectorSizeHintPath = temp_path + "/vector-size-hint-pipeline"
+>>> pipelineModel.save(vectorSizeHintPath)
+>>> loadedPipeline = PipelineModel.load(vectorSizeHintPath)
+>>> loaded = loadedPipeline.transform(df).head().assembled
+>>> expected = pipelineModel.transform(df).head().assembled
+>>> loaded == expected
+True
+
+.. versionadded:: 2.3.0
+.. note:: Experimental
+"""
+
+size = Param(Params._dummy(), "size", "Size of vectors in column.",
+ typeConverter=TypeConverters.toInt)
+
+@since("2.3.0")
+def getSize(self):
+""" Gets size param, the size of vectors in `inputCol`."""
+self.getOrDefault(self.size)
+
+@since("2.3.0")
+def setSize(self, value):
+""" Sets size param, the size of vectors in `inputCol`."""
+self._set(size=value)
+
+@keyword_only
+def __init__(self, inputCol=None, size=None, handleInvalid="error"):
--- End diff --
Let's stick with the order that all other Python classes follow: dummy
Params first, then __init__, then the Param setters & getters.
---
-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org