Repository: spark Updated Branches: refs/heads/master 2ad4837cf -> 829f1d95b
[SPARK-7579] [ML] [DOC] User guide update for OneHotEncoder Author: Sandy Ryza <sa...@cloudera.com> Closes #6126 from sryza/sandy-spark-7579 and squashes the following commits: 5af803d [Sandy Ryza] SPARK-7579 [MLLIB] User guide update for OneHotEncoder Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/829f1d95 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/829f1d95 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/829f1d95 Branch: refs/heads/master Commit: 829f1d95bac9153e7b646fbc0d55566ecf896200 Parents: 2ad4837 Author: Sandy Ryza <sa...@cloudera.com> Authored: Wed May 20 13:10:30 2015 -0700 Committer: Joseph K. Bradley <jos...@databricks.com> Committed: Wed May 20 13:10:30 2015 -0700 ---------------------------------------------------------------------- docs/ml-features.md | 95 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/829f1d95/docs/ml-features.md ---------------------------------------------------------------------- diff --git a/docs/ml-features.md b/docs/ml-features.md index 63ea3e5..235029d 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -440,5 +440,100 @@ for expanded in polyDF.select("polyFeatures").take(3): </div> </div> +## OneHotEncoder + +[One-hot encoding](http://en.wikipedia.org/wiki/One-hot) maps a column of label indices to a column of binary vectors, with at most a single one-value. This encoding allows algorithms which expect continuous features, such as Logistic Regression, to use categorical features + +<div class="codetabs"> +<div data-lang="scala" markdown="1"> +{% highlight scala %} +import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer} + +val df = sqlContext.createDataFrame(Seq( + (0, "a"), + (1, "b"), + (2, "c"), + (3, "a"), + (4, "a"), + (5, "c") +)).toDF("id", "category") + +val indexer = new StringIndexer() + .setInputCol("category") + .setOutputCol("categoryIndex") + .fit(df) +val indexed = indexer.transform(df) + +val encoder = new OneHotEncoder().setInputCol("categoryIndex"). + setOutputCol("categoryVec") +val encoded = encoder.transform(indexed) +encoded.select("id", "categoryVec").foreach(println) +{% endhighlight %} +</div> + +<div data-lang="java" markdown="1"> +{% highlight java %} +import com.google.common.collect.Lists; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.OneHotEncoder; +import org.apache.spark.ml.feature.StringIndexer; +import org.apache.spark.ml.feature.StringIndexerModel; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList( + RowFactory.create(0, "a"), + RowFactory.create(1, "b"), + RowFactory.create(2, "c"), + RowFactory.create(3, "a"), + RowFactory.create(4, "a"), + RowFactory.create(5, "c") +)); +StructType schema = new StructType(new StructField[]{ + new StructField("id", DataTypes.DoubleType, false, Metadata.empty()), + new StructField("category", DataTypes.StringType, false, Metadata.empty()) +}); +DataFrame df = sqlContext.createDataFrame(jrdd, schema); +StringIndexerModel indexer = new StringIndexer() + .setInputCol("category") + .setOutputCol("categoryIndex") + .fit(df); +DataFrame indexed = indexer.transform(df); + +OneHotEncoder encoder = new OneHotEncoder() + .setInputCol("categoryIndex") + .setOutputCol("categoryVec"); +DataFrame encoded = encoder.transform(indexed); +{% endhighlight %} +</div> + +<div data-lang="python" markdown="1"> +{% highlight python %} +from pyspark.ml.feature import OneHotEncoder, StringIndexer + +df = sqlContext.createDataFrame([ + (0, "a"), + (1, "b"), + (2, "c"), + (3, "a"), + (4, "a"), + (5, "c") +], ["id", "category"]) + +stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex") +model = stringIndexer.fit(df) +indexed = model.transform(df) +encoder = OneHotEncoder(includeFirst=False, inputCol="categoryIndex", outputCol="categoryVec") +encoded = encoder.transform(indexed) +{% endhighlight %} +</div> +</div> + # Feature Selectors --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org