spark git commit: [SPARK-22801][ML][PYSPARK] Allow FeatureHasher to treat numeric columns as categorical

mlnick Sun, 31 Dec 2017 04:51:52 -0800

Repository: spark
Updated Branches:
  refs/heads/master 3d8837e59 -> 028ee4016



[SPARK-22801][ML][PYSPARK] Allow FeatureHasher to treat numeric columns as 
categorical

Previously, `FeatureHasher` always treated numeric type columns as numbers and 
never as categorical features. It is quite common to have categorical features 
represented as numbers or codes in data sources.

In order to hash these features as categorical, users must first explicitly 
convert them to strings, which is cumbersome.

Add a new param `categoricalCols` which specifies the numeric columns that 
should be treated as categorical features.

## How was this patch tested?

New unit tests.

Author: Nick Pentreath <ni...@za.ibm.com>

Closes #19991 from MLnick/hasher-num-cat.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/028ee401
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/028ee401
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/028ee401

Branch: refs/heads/master
Commit: 028ee40165315337e2a349b19731764d64e4f51d
Parents: 3d8837e
Author: Nick Pentreath <ni...@za.ibm.com>
Authored: Sun Dec 31 14:51:38 2017 +0200
Committer: Nick Pentreath <ni...@za.ibm.com>
Committed: Sun Dec 31 14:51:38 2017 +0200

----------------------------------------------------------------------
 .../apache/spark/ml/feature/FeatureHasher.scala | 38 ++++++++++++++++----
 .../spark/ml/feature/FeatureHasherSuite.scala   | 25 +++++++++++++
 python/pyspark/ml/feature.py                    | 34 ++++++++++++++----
 3 files changed, 83 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/028ee401/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
index 4615dae..a918dd4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala
@@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.Transformer
 import org.apache.spark.ml.attribute.AttributeGroup
 import org.apache.spark.ml.linalg.Vectors
-import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
+import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, 
StringArrayParam}
 import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCol}
 import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, 
Identifiable, SchemaUtils}
 import org.apache.spark.mllib.feature.{HashingTF => OldHashingTF}
@@ -40,9 +40,9 @@ import org.apache.spark.util.collection.OpenHashMap
  * The [[FeatureHasher]] transformer operates on multiple columns. Each column 
may contain either
  * numeric or categorical features. Behavior and handling of column data types 
is as follows:
  *  -Numeric columns: For numeric features, the hash value of the column name 
is used to map the
- *                    feature value to its index in the feature vector. 
Numeric features are never
- *                    treated as categorical, even when they are integers. You 
must explicitly
- *                    convert numeric columns containing categorical features 
to strings first.
+ *                    feature value to its index in the feature vector. By 
default, numeric features
+ *                    are not treated as categorical (even when they are 
integers). To treat them
+ *                    as categorical, specify the relevant columns in 
`categoricalCols`.
  *  -String columns: For categorical features, the hash value of the string 
"column_name=value"
  *                   is used to map to the vector index, with an indicator 
value of `1.0`.
  *                   Thus, categorical features are "one-hot" encoded
@@ -87,6 +87,17 @@ class FeatureHasher(@Since("2.3.0") override val uid: 
String) extends Transforme
   def this() = this(Identifiable.randomUID("featureHasher"))
 
   /**
+   * Numeric columns to treat as categorical features. By default only string 
and boolean
+   * columns are treated as categorical, so this param can be used to 
explicitly specify the
+   * numerical columns to treat as categorical. Note, the relevant columns 
must also be set in
+   * `inputCols`.
+   * @group param
+   */
+  @Since("2.3.0")
+  val categoricalCols = new StringArrayParam(this, "categoricalCols",
+    "numeric columns to treat as categorical")
+
+  /**
    * Number of features. Should be greater than 0.
    * (default = 2^18^)
    * @group param
@@ -117,15 +128,28 @@ class FeatureHasher(@Since("2.3.0") override val uid: 
String) extends Transforme
   @Since("2.3.0")
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
+  /** @group getParam */
+  @Since("2.3.0")
+  def getCategoricalCols: Array[String] = $(categoricalCols)
+
+  /** @group setParam */
+  @Since("2.3.0")
+  def setCategoricalCols(value: Array[String]): this.type = 
set(categoricalCols, value)
+
   @Since("2.3.0")
   override def transform(dataset: Dataset[_]): DataFrame = {
     val hashFunc: Any => Int = OldHashingTF.murmur3Hash
     val n = $(numFeatures)
     val localInputCols = $(inputCols)
+    val catCols = if (isSet(categoricalCols)) {
+      $(categoricalCols).toSet
+    } else {
+      Set[String]()
+    }
 
     val outputSchema = transformSchema(dataset.schema)
     val realFields = outputSchema.fields.filter { f =>
-      f.dataType.isInstanceOf[NumericType]
+      f.dataType.isInstanceOf[NumericType] && !catCols.contains(f.name)
     }.map(_.name).toSet
 
     def getDouble(x: Any): Double = {
@@ -149,8 +173,8 @@ class FeatureHasher(@Since("2.3.0") override val uid: 
String) extends Transforme
             val hash = hashFunc(colName)
             (hash, value)
           } else {
-            // string and boolean values are treated as categorical, with an 
indicator value of 1.0
-            // and vector index based on hash of "column_name=value"
+            // string, boolean and numeric values that are in catCols are 
treated as categorical,
+            // with an indicator value of 1.0 and vector index based on hash 
of "column_name=value"
             val value = row.get(fieldIndex).toString
             val fieldName = s"$colName=$value"
             val hash = hashFunc(fieldName)

http://git-wip-us.apache.org/repos/asf/spark/blob/028ee401/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala
index 407371a..3fc3cbb 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala
@@ -78,6 +78,31 @@ class FeatureHasherSuite extends SparkFunSuite
     assert(features.zip(expected).forall { case (e, a) => e ~== a absTol 1e-14 
})
   }
 
+  test("setting explicit numerical columns to treat as categorical") {
+    val df = Seq(
+      (2.0, 1, "foo"),
+      (3.0, 2, "bar")
+    ).toDF("real", "int", "string")
+
+    val n = 100
+    val hasher = new FeatureHasher()
+      .setInputCols("real", "int", "string")
+      .setCategoricalCols(Array("real"))
+      .setOutputCol("features")
+      .setNumFeatures(n)
+    val output = hasher.transform(df)
+
+    val features = output.select("features").as[Vector].collect()
+    // Assume perfect hash on field names
+    def idx: Any => Int = murmur3FeatureIdx(n)
+    // check expected indices
+    val expected = Seq(
+      Vectors.sparse(n, Seq((idx("real=2.0"), 1.0), (idx("int"), 1.0), 
(idx("string=foo"), 1.0))),
+      Vectors.sparse(n, Seq((idx("real=3.0"), 1.0), (idx("int"), 2.0), 
(idx("string=bar"), 1.0)))
+    )
+    assert(features.zip(expected).forall { case (e, a) => e ~== a absTol 1e-14 
})
+  }
+
   test("hashing works for all numeric types") {
     val df = Seq(5.0, 10.0, 15.0).toDF("real")
 

http://git-wip-us.apache.org/repos/asf/spark/blob/028ee401/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 5094324..13bf95c 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -714,9 +714,9 @@ class FeatureHasher(JavaTransformer, HasInputCols, 
HasOutputCol, HasNumFeatures,
 
     * Numeric columns:
         For numeric features, the hash value of the column name is used to map 
the
-        feature value to its index in the feature vector. Numeric features are 
never
-        treated as categorical, even when they are integers. You must 
explicitly
-        convert numeric columns containing categorical features to strings 
first.
+        feature value to its index in the feature vector. By default, numeric 
features
+        are not treated as categorical (even when they are integers). To treat 
them
+        as categorical, specify the relevant columns in `categoricalCols`.
 
     * String columns:
         For categorical features, the hash value of the string 
"column_name=value"
@@ -741,6 +741,8 @@ class FeatureHasher(JavaTransformer, HasInputCols, 
HasOutputCol, HasNumFeatures,
     >>> hasher = FeatureHasher(inputCols=cols, outputCol="features")
     >>> hasher.transform(df).head().features
     SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0})
+    >>> hasher.setCategoricalCols(["real"]).transform(df).head().features
+    SparseVector(262144, {51871: 1.0, 63643: 1.0, 171257: 1.0, 253195: 1.0})
     >>> hasherPath = temp_path + "/hasher"
     >>> hasher.save(hasherPath)
     >>> loadedHasher = FeatureHasher.load(hasherPath)
@@ -752,10 +754,14 @@ class FeatureHasher(JavaTransformer, HasInputCols, 
HasOutputCol, HasNumFeatures,
     .. versionadded:: 2.3.0
     """
 
+    categoricalCols = Param(Params._dummy(), "categoricalCols",
+                            "numeric columns to treat as categorical",
+                            typeConverter=TypeConverters.toListString)
+
     @keyword_only
-    def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+    def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None, 
categoricalCols=None):
         """
-        __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None, 
categoricalCols=None)
         """
         super(FeatureHasher, self).__init__()
         self._java_obj = 
self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid)
@@ -765,14 +771,28 @@ class FeatureHasher(JavaTransformer, HasInputCols, 
HasOutputCol, HasNumFeatures,
 
     @keyword_only
     @since("2.3.0")
-    def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None):
+    def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None, 
categoricalCols=None):
         """
-        setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None)
+        setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None, 
categoricalCols=None)
         Sets params for this FeatureHasher.
         """
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
+    @since("2.3.0")
+    def setCategoricalCols(self, value):
+        """
+        Sets the value of :py:attr:`categoricalCols`.
+        """
+        return self._set(categoricalCols=value)
+
+    @since("2.3.0")
+    def getCategoricalCols(self):
+        """
+        Gets the value of categoricalCols or its default value.
+        """
+        return self.getOrDefault(self.categoricalCols)
+
 
 @inherit_doc
 class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, 
JavaMLReadable,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-22801][ML][PYSPARK] Allow FeatureHasher to treat numeric columns as categorical

Reply via email to