Repository: spark Updated Branches: refs/heads/master 3d8837e59 -> 028ee4016
[SPARK-22801][ML][PYSPARK] Allow FeatureHasher to treat numeric columns as categorical Previously, `FeatureHasher` always treated numeric type columns as numbers and never as categorical features. It is quite common to have categorical features represented as numbers or codes in data sources. In order to hash these features as categorical, users had to first explicitly convert them to strings, which is cumbersome. Add a new param `categoricalCols` which specifies the numeric columns that should be treated as categorical features. ## How was this patch tested? New unit tests. Author: Nick Pentreath <ni...@za.ibm.com> Closes #19991 from MLnick/hasher-num-cat. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/028ee401 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/028ee401 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/028ee401 Branch: refs/heads/master Commit: 028ee40165315337e2a349b19731764d64e4f51d Parents: 3d8837e Author: Nick Pentreath <ni...@za.ibm.com> Authored: Sun Dec 31 14:51:38 2017 +0200 Committer: Nick Pentreath <ni...@za.ibm.com> Committed: Sun Dec 31 14:51:38 2017 +0200 ---------------------------------------------------------------------- .../apache/spark/ml/feature/FeatureHasher.scala | 38 ++++++++++++++++---- .../spark/ml/feature/FeatureHasherSuite.scala | 25 +++++++++++++ python/pyspark/ml/feature.py | 34 ++++++++++++++---- 3 files changed, 83 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/028ee401/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala index 4615dae..a918dd4 100644 --- 
a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} +import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, StringArrayParam} import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.feature.{HashingTF => OldHashingTF} @@ -40,9 +40,9 @@ import org.apache.spark.util.collection.OpenHashMap * The [[FeatureHasher]] transformer operates on multiple columns. Each column may contain either * numeric or categorical features. Behavior and handling of column data types is as follows: * -Numeric columns: For numeric features, the hash value of the column name is used to map the - * feature value to its index in the feature vector. Numeric features are never - * treated as categorical, even when they are integers. You must explicitly - * convert numeric columns containing categorical features to strings first. + * feature value to its index in the feature vector. By default, numeric features + * are not treated as categorical (even when they are integers). To treat them + * as categorical, specify the relevant columns in `categoricalCols`. * -String columns: For categorical features, the hash value of the string "column_name=value" * is used to map to the vector index, with an indicator value of `1.0`. 
* Thus, categorical features are "one-hot" encoded @@ -87,6 +87,17 @@ class FeatureHasher(@Since("2.3.0") override val uid: String) extends Transforme def this() = this(Identifiable.randomUID("featureHasher")) /** + * Numeric columns to treat as categorical features. By default only string and boolean + * columns are treated as categorical, so this param can be used to explicitly specify the + * numerical columns to treat as categorical. Note, the relevant columns must also be set in + * `inputCols`. + * @group param + */ + @Since("2.3.0") + val categoricalCols = new StringArrayParam(this, "categoricalCols", + "numeric columns to treat as categorical") + + /** * Number of features. Should be greater than 0. * (default = 2^18^) * @group param @@ -117,15 +128,28 @@ class FeatureHasher(@Since("2.3.0") override val uid: String) extends Transforme @Since("2.3.0") def setOutputCol(value: String): this.type = set(outputCol, value) + /** @group getParam */ + @Since("2.3.0") + def getCategoricalCols: Array[String] = $(categoricalCols) + + /** @group setParam */ + @Since("2.3.0") + def setCategoricalCols(value: Array[String]): this.type = set(categoricalCols, value) + @Since("2.3.0") override def transform(dataset: Dataset[_]): DataFrame = { val hashFunc: Any => Int = OldHashingTF.murmur3Hash val n = $(numFeatures) val localInputCols = $(inputCols) + val catCols = if (isSet(categoricalCols)) { + $(categoricalCols).toSet + } else { + Set[String]() + } val outputSchema = transformSchema(dataset.schema) val realFields = outputSchema.fields.filter { f => - f.dataType.isInstanceOf[NumericType] + f.dataType.isInstanceOf[NumericType] && !catCols.contains(f.name) }.map(_.name).toSet def getDouble(x: Any): Double = { @@ -149,8 +173,8 @@ class FeatureHasher(@Since("2.3.0") override val uid: String) extends Transforme val hash = hashFunc(colName) (hash, value) } else { - // string and boolean values are treated as categorical, with an indicator value of 1.0 - // and vector index based 
on hash of "column_name=value" + // string, boolean and numeric values that are in catCols are treated as categorical, + // with an indicator value of 1.0 and vector index based on hash of "column_name=value" val value = row.get(fieldIndex).toString val fieldName = s"$colName=$value" val hash = hashFunc(fieldName) http://git-wip-us.apache.org/repos/asf/spark/blob/028ee401/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala index 407371a..3fc3cbb 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala @@ -78,6 +78,31 @@ class FeatureHasherSuite extends SparkFunSuite assert(features.zip(expected).forall { case (e, a) => e ~== a absTol 1e-14 }) } + test("setting explicit numerical columns to treat as categorical") { + val df = Seq( + (2.0, 1, "foo"), + (3.0, 2, "bar") + ).toDF("real", "int", "string") + + val n = 100 + val hasher = new FeatureHasher() + .setInputCols("real", "int", "string") + .setCategoricalCols(Array("real")) + .setOutputCol("features") + .setNumFeatures(n) + val output = hasher.transform(df) + + val features = output.select("features").as[Vector].collect() + // Assume perfect hash on field names + def idx: Any => Int = murmur3FeatureIdx(n) + // check expected indices + val expected = Seq( + Vectors.sparse(n, Seq((idx("real=2.0"), 1.0), (idx("int"), 1.0), (idx("string=foo"), 1.0))), + Vectors.sparse(n, Seq((idx("real=3.0"), 1.0), (idx("int"), 2.0), (idx("string=bar"), 1.0))) + ) + assert(features.zip(expected).forall { case (e, a) => e ~== a absTol 1e-14 }) + } + test("hashing works for all numeric types") { val df = Seq(5.0, 10.0, 15.0).toDF("real") 
http://git-wip-us.apache.org/repos/asf/spark/blob/028ee401/python/pyspark/ml/feature.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 5094324..13bf95c 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -714,9 +714,9 @@ class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, * Numeric columns: For numeric features, the hash value of the column name is used to map the - feature value to its index in the feature vector. Numeric features are never - treated as categorical, even when they are integers. You must explicitly - convert numeric columns containing categorical features to strings first. + feature value to its index in the feature vector. By default, numeric features + are not treated as categorical (even when they are integers). To treat them + as categorical, specify the relevant columns in `categoricalCols`. * String columns: For categorical features, the hash value of the string "column_name=value" @@ -741,6 +741,8 @@ class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, >>> hasher = FeatureHasher(inputCols=cols, outputCol="features") >>> hasher.transform(df).head().features SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0}) + >>> hasher.setCategoricalCols(["real"]).transform(df).head().features + SparseVector(262144, {51871: 1.0, 63643: 1.0, 171257: 1.0, 253195: 1.0}) >>> hasherPath = temp_path + "/hasher" >>> hasher.save(hasherPath) >>> loadedHasher = FeatureHasher.load(hasherPath) @@ -752,10 +754,14 @@ class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, .. 
versionadded:: 2.3.0 """ + categoricalCols = Param(Params._dummy(), "categoricalCols", + "numeric columns to treat as categorical", + typeConverter=TypeConverters.toListString) + @keyword_only - def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None): + def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None): """ - __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None) + __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None) """ super(FeatureHasher, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid) @@ -765,14 +771,28 @@ class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, @keyword_only @since("2.3.0") - def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None): + def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None): """ - setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None) + setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None) Sets params for this FeatureHasher. """ kwargs = self._input_kwargs return self._set(**kwargs) + @since("2.3.0") + def setCategoricalCols(self, value): + """ + Sets the value of :py:attr:`categoricalCols`. + """ + return self._set(categoricalCols=value) + + @since("2.3.0") + def getCategoricalCols(self): + """ + Gets the value of categoricalCols or its default value. + """ + return self.getOrDefault(self.categoricalCols) + @inherit_doc class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org